Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
BroadcastSum.cu
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 
10 #include <algorithm>
11 #include "../../FaissAssert.h"
12 
13 #include "../utils/DeviceUtils.h"
14 #include "../utils/MathOperators.cuh"
15 #include "../utils/Tensor.cuh"
16 #include "../utils/StaticUtils.h"
17 
18 namespace faiss { namespace gpu {
19 
// In-place broadcast add of a 1-d vector across the rows of a 2-d tensor:
// output[row][col] += input[col] for every row.
//
// Launch layout (see runSumAlongColumns):
//   blockIdx.x: which chunk of kRowsPerBlock rows this block updates
//   blockIdx.y: which chunk of blockDim.x * kColLoad columns this block
//   updates
// Interior blocks take the fully-unrolled fast path; only the trailing
// row/column blocks (endRow / endCol) fall back to bounds-checked loops.
template <typename T, int kRowsPerBlock, int kRowUnroll, int kColLoad>
__global__ void sumAlongColumns(Tensor<T, 1, true> input,
                                Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for
  // updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // FIXME: if we have exact multiples, don't need this
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    // The last row block only needs bounds checks when the row count is
    // not an exact multiple of kRowsPerBlock
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    // Rightmost column block: stride threads across the remaining columns
    // with a bounds check on input.getSize(0)
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        // Bounds-checked tail rows
        for (int row = rowStart; row < output.getSize(0); ++row) {
          T out = output[row][col];
          out = Math<T>::add(out, val);
          output[row][col] = out;
        }
      } else {
        // Full row tile: batch kRowUnroll loads, adds, and stores so the
        // compiler can keep the values in registers and overlap memory ops
        T rows[kRowUnroll];

        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = output[row + i][col];
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = Math<T>::add(rows[i], val);
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = rows[i];
          }
        }
      }
    }
  } else {
    // Interior column block: each thread owns kColLoad columns spaced
    // blockDim.x apart (coalesced within a warp), no bounds checks needed
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      // Bounds-checked tail rows, still vectorized across kColLoad columns
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          T out = output[row][col + i * blockDim.x];
          out = Math<T>::add(out, val[i]);
          output[row][col + i * blockDim.x] = out;
        }
      }
    } else {
      // Fully unrolled kRowUnroll x kColLoad register tile:
      // load phase, add phase, store phase kept separate for ILP
      T rows[kRowUnroll * kColLoad];

      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              output[row + i][col + j * blockDim.x];
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              Math<T>::add(rows[i * kColLoad + j], val[j]);
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] =
              rows[i * kColLoad + j];
          }
        }
      }
    }
  }
}
127 
// Broadcast-assigns a 1-d vector to every row of a 2-d tensor:
// output[row][col] = input[col] for every row.
//
// Launch layout (see runAssignAlongColumns):
//   blockIdx.x: which chunk of kRowsPerBlock rows this block writes
//   blockIdx.y: which chunk of blockDim.x * kColLoad columns this block
//   writes
template <typename T, int kRowsPerBlock, int kRowUnroll, int kColLoad>
__global__ void assignAlongColumns(Tensor<T, 1, true> input,
                                   Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // Each block owns a kRowsPerBlock x (blockDim.x * kColLoad) tile
  int firstRow = blockIdx.x * kRowsPerBlock;
  int lastRow = firstRow + kRowsPerBlock;
  int firstCol = blockIdx.y * blockDim.x * kColLoad;

  // FIXME: if we have exact multiples, don't need this
  // Only the trailing blocks may run past the tensor bounds; a trailing
  // row block still takes the fast path when rows divide evenly
  bool partialRows = (blockIdx.x == gridDim.x - 1) &&
      (output.getSize(0) % kRowsPerBlock != 0);
  bool partialCols = (blockIdx.y == gridDim.y - 1);

  if (partialCols) {
    // Rightmost column block: bounds-checked, thread-strided column walk
    for (int col = firstCol + threadIdx.x; col < input.getSize(0);
         col += blockDim.x) {
      T broadcast = input[col];

      if (partialRows) {
        for (int row = firstRow; row < output.getSize(0); ++row) {
          output[row][col] = broadcast;
        }
      } else {
        for (int row = firstRow; row < lastRow; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = broadcast;
          }
        }
      }
    }
  } else {
    int col = firstCol + threadIdx.x;

    // Stage the kColLoad input values in registers; each thread owns
    // columns spaced blockDim.x apart for coalesced access
    T broadcast[kColLoad];

#pragma unroll
    for (int j = 0; j < kColLoad; ++j) {
      broadcast[j] = input[col + j * blockDim.x];
    }

    if (partialRows) {
      // Bounds-checked tail rows
      for (int row = firstRow; row < output.getSize(0); ++row) {
#pragma unroll
        for (int j = 0; j < kColLoad; ++j) {
          output[row][col + j * blockDim.x] = broadcast[j];
        }
      }
    } else {
      // Fully unrolled kRowUnroll x kColLoad store tile
      for (int row = firstRow; row < lastRow; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = broadcast[j];
          }
        }
      }
    }
  }
}
198 
// Adds input[row] to every element of row `row` of output, optionally
// clamping the result at zero when ZeroClamp is true.
//
// Launch layout (see runSumAlongRows): one block per row (blockIdx.x
// selects the row); threads stride across the row's columns.
template <typename T, bool ZeroClamp>
__global__ void sumAlongRows(Tensor<T, 1, true> input,
                             Tensor<T, 2, true> output) {
  __shared__ T sval;

  int row = blockIdx.x;

  // One thread loads the per-row addend; broadcast via shared memory
  if (threadIdx.x == 0) {
    sval = input[row];
  }

  __syncthreads();

  T val = sval;

  // FIXME: speed up
  for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) {
    T out = output[row][i];
    out = Math<T>::add(out, val);

    // BUGFIX: the clamp previously ran unconditionally, which made the
    // ZeroClamp template parameter dead and incorrectly clamped negative
    // sums even when zeroClamp was false. Guard it on ZeroClamp (the
    // compiler elides the dead branch per instantiation).
    if (ZeroClamp) {
      out = Math<T>::lt(out, Math<T>::zero()) ? Math<T>::zero() : out;
    }

    output[row][i] = out;
  }
}
223 
// Host launcher for sumAlongColumns: output[r][c] += input[c].
// Prefers the vectorized kernel (element type TVec, e.g. float4) when both
// tensors are properly sized/aligned for reinterpretation; otherwise runs
// the scalar kernel. Asynchronous on `stream`.
template <typename T, typename TVec>
void runSumAlongColumns(Tensor<T, 1, true>& input,
                        Tensor<T, 2, true>& output,
                        cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  // Tile parameters; each block covers kRowsPerBlock rows and
  // threadsPerBlock * kColLoad columns
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;
  int threadsPerBlock = 256;

  auto block = dim3(threadsPerBlock);

  bool vectorizable = input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>();

  if (vectorizable) {
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid = dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
                     utils::divUp(outputV.getSize(1),
                                  threadsPerBlock * kColLoad));

    sumAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    auto grid = dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
                     utils::divUp(output.getSize(1),
                                  threadsPerBlock * kColLoad));

    sumAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_TEST_ERROR();
}
259 
// float entry point: vectorizes via float4 when alignment permits
void runSumAlongColumns(Tensor<float, 1, true>& input,
                        Tensor<float, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<float, float4>(input, output, stream);
}
265 
#ifdef FAISS_USE_FLOAT16
// half entry point: vectorizes via half2 when alignment permits
void runSumAlongColumns(Tensor<half, 1, true>& input,
                        Tensor<half, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<half, half2>(input, output, stream);
}
#endif
273 
// Host launcher for assignAlongColumns: output[r][c] = input[c].
// Prefers the vectorized kernel (element type TVec, e.g. float4) when both
// tensors are properly sized/aligned for reinterpretation; otherwise runs
// the scalar kernel. Asynchronous on `stream`.
template <typename T, typename TVec>
void runAssignAlongColumns(Tensor<T, 1, true>& input,
                           Tensor<T, 2, true>& output,
                           cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  // Tile parameters; each block covers kRowsPerBlock rows and
  // threadsPerBlock * kColLoad columns
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;
  int threadsPerBlock = 256;

  auto block = dim3(threadsPerBlock);

  bool vectorizable = input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>();

  if (vectorizable) {
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid = dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
                     utils::divUp(outputV.getSize(1),
                                  threadsPerBlock * kColLoad));

    assignAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    auto grid = dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
                     utils::divUp(output.getSize(1),
                                  threadsPerBlock * kColLoad));

    assignAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_TEST_ERROR();
}
309 
// float entry point: vectorizes via float4 when alignment permits
void runAssignAlongColumns(Tensor<float, 1, true>& input,
                           Tensor<float, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<float, float4>(input, output, stream);
}
315 
#ifdef FAISS_USE_FLOAT16
// half entry point: vectorizes via half2 when alignment permits
void runAssignAlongColumns(Tensor<half, 1, true>& input,
                           Tensor<half, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<half, half2>(input, output, stream);
}
#endif
323 
// Host launcher for sumAlongRows: output[r][c] += input[r], optionally
// clamping results at zero. Asynchronous on `stream`.
template <typename T>
void runSumAlongRows(Tensor<T, 1, true>& input,
                     Tensor<T, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(0));

  // One block per output row; threads span the row's columns, capped at
  // the device's maximum block size
  int threadsPerBlock =
      std::min(output.getSize(1), getMaxThreadsCurrentDevice());
  auto grid = dim3(output.getSize(0));
  auto block = dim3(threadsPerBlock);

  // Dispatch on the runtime flag to the appropriate template instantiation
  if (!zeroClamp) {
    sumAlongRows<T, false><<<grid, block, 0, stream>>>(input, output);
  } else {
    sumAlongRows<T, true><<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_TEST_ERROR();
}
344 
// float entry point for the row-wise broadcast add
void runSumAlongRows(Tensor<float, 1, true>& input,
                     Tensor<float, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  runSumAlongRows<float>(input, output, zeroClamp, stream);
}
351 
#ifdef FAISS_USE_FLOAT16
// half entry point for the row-wise broadcast add
void runSumAlongRows(Tensor<half, 1, true>& input,
                     Tensor<half, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  runSumAlongRows<half>(input, output, zeroClamp, stream);
}
#endif
360 
361 } } // namespace