// Faiss GPU — BroadcastSum.cu
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 
9 #include <algorithm>
10 #include "../../FaissAssert.h"
11 
12 #include "../utils/DeviceUtils.h"
13 #include "../utils/MathOperators.cuh"
14 #include "../utils/Tensor.cuh"
15 #include "../utils/StaticUtils.h"
16 
17 namespace faiss { namespace gpu {
18 
// Adds the length-d vector `input` to every row of `output` (n x d):
// output[r][c] += input[c].
//
// Grid layout: blockIdx.x selects a chunk of kRowsPerBlock rows,
// blockIdx.y a chunk of blockDim.x * kColLoad columns. Interior blocks
// run fully-unrolled kRowUnroll x kColLoad tiles; only the last block in
// each grid dimension falls back to bounds-checked loops for a
// possibly-partial tile.
template <typename T, int kRowsPerBlock, int kRowUnroll, int kColLoad>
__global__ void sumAlongColumns(Tensor<T, 1, true> input,
                                Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for
  // updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // FIXME: if we have exact multiples, don't need this
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      // The last row chunk is actually full; take the unrolled path.
      endRow = false;
    }
  }

  if (endCol) {
    // Possibly-partial column tile: bounds-checked column loop.
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        // Partial row tile as well: plain bounds-checked row loop.
        for (int row = rowStart; row < output.getSize(0); ++row) {
          T out = output[row][col];
          out = Math<T>::add(out, val);
          output[row][col] = out;
        }
      } else {
        // Full row tile: stage kRowUnroll loads, adds, then stores in
        // separate unrolled passes so the loads can issue back-to-back.
        T rows[kRowUnroll];

        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = output[row + i][col];
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = Math<T>::add(rows[i], val);
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = rows[i];
          }
        }
      }
    }
  } else {
    // Full column tile: each thread handles kColLoad columns spaced
    // blockDim.x apart; no column bounds checks are needed here.
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      // Partial row tile: bounds-checked row loop, unrolled over columns.
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          T out = output[row][col + i * blockDim.x];
          out = Math<T>::add(out, val[i]);
          output[row][col + i * blockDim.x] = out;
        }
      }
    } else {
      // Fully unrolled kRowUnroll x kColLoad tile: load, add, store in
      // three separate passes over the register tile.
      T rows[kRowUnroll * kColLoad];

      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
                output[row + i][col + j * blockDim.x];
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
                Math<T>::add(rows[i * kColLoad + j], val[j]);
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] =
                rows[i * kColLoad + j];
          }
        }
      }
    }
  }
}
126 
// Assigns the length-d vector `input` to every row of `output` (n x d):
// output[r][c] = input[c].
//
// Same tiling scheme as sumAlongColumns: blockIdx.x selects a chunk of
// kRowsPerBlock rows, blockIdx.y a chunk of blockDim.x * kColLoad
// columns; only the last block per grid dimension takes the
// bounds-checked path. Since this is a pure store (no read-modify-write),
// no register staging of the output tile is needed.
template <typename T, int kRowsPerBlock, int kRowUnroll, int kColLoad>
__global__ void assignAlongColumns(Tensor<T, 1, true> input,
                                   Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for
  // updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // FIXME: if we have exact multiples, don't need this
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      // The last row chunk is actually full; take the unrolled path.
      endRow = false;
    }
  }

  if (endCol) {
    // Possibly-partial column tile: bounds-checked column loop.
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        // Partial row tile: plain bounds-checked row loop.
        for (int row = rowStart; row < output.getSize(0); ++row) {
          output[row][col] = val;
        }
      } else {
        // Full row tile: unrolled stores.
        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = val;
          }
        }
      }
    }
  } else {
    // Full column tile: each thread handles kColLoad columns spaced
    // blockDim.x apart; no column bounds checks are needed here.
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      // Partial row tile: bounds-checked rows, unrolled column stores.
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          output[row][col + i * blockDim.x] = val[i];
        }
      }
    } else {
      // Fully unrolled kRowUnroll x kColLoad tile of stores.
      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = val[j];
          }
        }
      }
    }
  }
}
197 
// Adds input[row] to every element of output[row] (broadcast along rows).
// If ZeroClamp is true, results are clamped below at zero.
//
// Launch layout: one block per row of output (blockIdx.x == row); threads
// stride across the row's columns. The broadcast value is staged through
// shared memory so only one thread per block reads global memory for it.
template <typename T, bool ZeroClamp>
__global__ void sumAlongRows(Tensor<T, 1, true> input,
                             Tensor<T, 2, true> output) {
  __shared__ T sval;

  int row = blockIdx.x;

  if (threadIdx.x == 0) {
    sval = input[row];
  }

  // Publish the broadcast value to all threads in the block.
  __syncthreads();

  T val = sval;

  // FIXME: speed up
  for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) {
    T out = output[row][i];
    out = Math<T>::add(out, val);

    // Fix: ZeroClamp was previously ignored and the clamp was applied
    // unconditionally; only clamp to zero when the caller requested it.
    if (ZeroClamp) {
      out = Math<T>::lt(out, Math<T>::zero()) ? Math<T>::zero() : out;
    }

    output[row][i] = out;
  }
}
222 
// Host launcher: adds the d-vector `input` to every row of `output`
// (n x d) on `stream`. When both tensors can be reinterpreted as the
// vector type TVec (alignment and size permitting), the vectorized kernel
// is launched for wider loads/stores; otherwise the scalar kernel runs.
template <typename T, typename TVec>
void runSumAlongColumns(Tensor<T, 1, true>& input,
                        Tensor<T, 2, true>& output,
                        cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int numThreads = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(numThreads);

  bool vectorizable = input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>();

  if (!vectorizable) {
    // Scalar path: grid sized from the element-typed tensor.
    auto grid = dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
                     utils::divUp(output.getSize(1), numThreads * kColLoad));

    sumAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
        <<<grid, block, 0, stream>>>(input, output);
  } else {
    // Vectorized path: reinterpret as TVec; the inner dimension shrinks,
    // so the grid is sized from the cast tensor.
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid = dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
                     utils::divUp(outputV.getSize(1), numThreads * kColLoad));

    sumAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
        <<<grid, block, 0, stream>>>(inputV, outputV);
  }

  CUDA_TEST_ERROR();
}
258 
// Public float entry point; uses float4 vectorization when alignment
// of the underlying data permits.
void runSumAlongColumns(Tensor<float, 1, true>& input,
                        Tensor<float, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<float, float4>(input, output, stream);
}
264 
265 #ifdef FAISS_USE_FLOAT16
// Public half entry point; uses half2 vectorization when alignment
// of the underlying data permits.
void runSumAlongColumns(Tensor<half, 1, true>& input,
                        Tensor<half, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<half, half2>(input, output, stream);
}
271 #endif
272 
// Host launcher: assigns the d-vector `input` to every row of `output`
// (n x d) on `stream`. When both tensors can be reinterpreted as the
// vector type TVec (alignment and size permitting), the vectorized kernel
// is launched for wider loads/stores; otherwise the scalar kernel runs.
template <typename T, typename TVec>
void runAssignAlongColumns(Tensor<T, 1, true>& input,
                           Tensor<T, 2, true>& output,
                           cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int numThreads = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(numThreads);

  bool vectorizable = input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>();

  if (!vectorizable) {
    // Scalar path: grid sized from the element-typed tensor.
    auto grid = dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
                     utils::divUp(output.getSize(1), numThreads * kColLoad));

    assignAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
        <<<grid, block, 0, stream>>>(input, output);
  } else {
    // Vectorized path: reinterpret as TVec; the inner dimension shrinks,
    // so the grid is sized from the cast tensor.
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid = dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
                     utils::divUp(outputV.getSize(1), numThreads * kColLoad));

    assignAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
        <<<grid, block, 0, stream>>>(inputV, outputV);
  }

  CUDA_TEST_ERROR();
}
308 
// Public float entry point; uses float4 vectorization when alignment
// of the underlying data permits.
void runAssignAlongColumns(Tensor<float, 1, true>& input,
                           Tensor<float, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<float, float4>(input, output, stream);
}
314 
315 #ifdef FAISS_USE_FLOAT16
// Public half entry point; uses half2 vectorization when alignment
// of the underlying data permits.
void runAssignAlongColumns(Tensor<half, 1, true>& input,
                           Tensor<half, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<half, half2>(input, output, stream);
}
321 #endif
322 
// Host launcher: adds input[r] to every element of row r of `output`
// on `stream`, optionally clamping results below at zero.
// The zeroClamp flag is lowered to a compile-time template argument so
// the kernel carries no per-element branch cost for the clamp decision.
template <typename T>
void runSumAlongRows(Tensor<T, 1, true>& input,
                     Tensor<T, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(0));

  // One block per row; cap threads at the row width and the device limit.
  int numThreads =
      std::min(output.getSize(1), getMaxThreadsCurrentDevice());
  auto grid = dim3(output.getSize(0));
  auto block = dim3(numThreads);

  if (!zeroClamp) {
    sumAlongRows<T, false><<<grid, block, 0, stream>>>(input, output);
  } else {
    sumAlongRows<T, true><<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_TEST_ERROR();
}
343 
// Public float entry point for the row-broadcast sum.
void runSumAlongRows(Tensor<float, 1, true>& input,
                     Tensor<float, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  runSumAlongRows<float>(input, output, zeroClamp, stream);
}
350 
351 #ifdef FAISS_USE_FLOAT16
// Public half entry point for the row-broadcast sum.
void runSumAlongRows(Tensor<half, 1, true>& input,
                     Tensor<half, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  runSumAlongRows<half>(input, output, zeroClamp, stream);
}
358 #endif
359 
360 } } // namespace