Faiss
BroadcastSum.cu
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include <algorithm>
#include "../../FaissAssert.h"

#include "../utils/DeviceUtils.h"
#include "../utils/MathOperators.cuh"
#include "../utils/Tensor.cuh"
#include "../utils/StaticUtils.h"

namespace faiss { namespace gpu {

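// Adds a vector of length output.getSize(1) to every row of output:
// output[row][col] += input[col]. Each block owns a tile of
// kRowsPerBlock rows by (blockDim.x * kColLoad) columns; interior tiles
// take the fully unrolled fast path, while the last row/column blocks
// fall back to bounds-checked loops.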
template <typename T, int kRowsPerBlock, int kRowUnroll, int kColLoad>
__global__ void sumAlongColumns(Tensor<T, 1, true> input,
                                Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // FIXME: if we have exact multiples, don't need this
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        for (int row = rowStart; row < output.getSize(0); ++row) {
          T out = output[row][col].ldg();
          out = Math<T>::add(out, val);
          output[row][col] = out;
        }
      } else {
        T rows[kRowUnroll];

        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = output[row + i][col].ldg();
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = Math<T>::add(rows[i], val);
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = rows[i];
          }
        }
      }
    }
  } else {
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          T out = output[row][col + i * blockDim.x].ldg();
          out = Math<T>::add(out, val[i]);
          output[row][col + i * blockDim.x] = out;
        }
      }
    } else {
      T rows[kRowUnroll * kColLoad];

      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              output[row + i][col + j * blockDim.x].ldg();
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              Math<T>::add(rows[i * kColLoad + j], val[j]);
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] =
              rows[i * kColLoad + j];
          }
        }
      }
    }
  }
}

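// Same tiling as sumAlongColumns, but overwrites instead of accumulates:
// output[row][col] = input[col]. Since the previous output values are never
// read, there is no ldg() load phase.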
template <typename T, int kRowsPerBlock, int kRowUnroll, int kColLoad>
__global__ void assignAlongColumns(Tensor<T, 1, true> input,
                                   Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // FIXME: if we have exact multiples, don't need this
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        for (int row = rowStart; row < output.getSize(0); ++row) {
          output[row][col] = val;
        }
      } else {
        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = val;
          }
        }
      }
    }
  } else {
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          output[row][col + i * blockDim.x] = val[i];
        }
      }
    } else {
      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = val[j];
          }
        }
      }
    }
  }
}

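// Adds input[row] to every element of the corresponding output row:
// output[row][col] += input[row]. One block per row; the row's scalar is
// staged through shared memory so that only one thread reads input.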
template <typename T, typename TVec>
__global__ void sumAlongRows(Tensor<T, 1, true> input,
                             Tensor<TVec, 2, true> output) {
  __shared__ T sval;

  int row = blockIdx.x;

  if (threadIdx.x == 0) {
    sval = input[row];
  }

  __syncthreads();

  T val = sval;

  // FIXME: speed up
  for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) {
    TVec out = output[row][i];
    out = Math<TVec>::add(out, val);
    output[row][i] = out;
  }
}

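// Host-side dispatcher. When both tensors can be reinterpreted as the wider
// vector type TVec (size and alignment permitting), the kernel runs on
// float4/half2 elements, cutting the effective column count by 4x/2x;
// otherwise it falls back to the scalar instantiation.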
template <typename T, typename TVec>
void runSumAlongColumns(Tensor<T, 1, true>& input,
                        Tensor<T, 2, true>& output,
                        cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_VERIFY(cudaGetLastError());
}

void runSumAlongColumns(Tensor<float, 1, true>& input,
                        Tensor<float, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<float, float4>(input, output, stream);
}

#ifdef FAISS_USE_FLOAT16
void runSumAlongColumns(Tensor<half, 1, true>& input,
                        Tensor<half, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<half, half2>(input, output, stream);
}
#endif

template <typename T, typename TVec>
void runAssignAlongColumns(Tensor<T, 1, true>& input,
                           Tensor<T, 2, true>& output,
                           cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_VERIFY(cudaGetLastError());
}

void runAssignAlongColumns(Tensor<float, 1, true>& input,
                           Tensor<float, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<float, float4>(input, output, stream);
}

#ifdef FAISS_USE_FLOAT16
void runAssignAlongColumns(Tensor<half, 1, true>& input,
                           Tensor<half, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<half, half2>(input, output, stream);
}
#endif

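// Host-side dispatcher for the row broadcast. Only the output is cast to
// the vector type here; the input is read as one scalar per row, so it
// stays at type T.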
template <typename T, typename TVec>
void runSumAlongRows(Tensor<T, 1, true>& input,
                     Tensor<T, 2, true>& output,
                     cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(0));

  if (output.template canCastResize<TVec>()) {
    auto outputV = output.template castResize<TVec>();

    int threadsPerBlock =
      std::min(outputV.getSize(1), getMaxThreadsCurrentDevice());
    auto grid = dim3(outputV.getSize(0));
    auto block = dim3(threadsPerBlock);

    sumAlongRows<T, TVec><<<grid, block, 0, stream>>>(input, outputV);
  } else {
    int threadsPerBlock =
      std::min(output.getSize(1), getMaxThreadsCurrentDevice());
    auto grid = dim3(output.getSize(0));
    auto block = dim3(threadsPerBlock);

    sumAlongRows<T, T><<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_VERIFY(cudaGetLastError());
}

void runSumAlongRows(Tensor<float, 1, true>& input,
                     Tensor<float, 2, true>& output,
                     cudaStream_t stream) {
  runSumAlongRows<float, float4>(input, output, stream);
}

#ifdef FAISS_USE_FLOAT16
void runSumAlongRows(Tensor<half, 1, true>& input,
                     Tensor<half, 2, true>& output,
                     cudaStream_t stream) {
  runSumAlongRows<half, half2>(input, output, stream);
}
#endif
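
// Usage sketch (not part of the original file): how these entry points
// might be driven from host code, assuming the owning DeviceTensor type
// from ../utils/DeviceTensor.cuh; the names and constructor form below
// are illustrative only.
//
//   DeviceTensor<float, 1, true> bias({numCols});
//   DeviceTensor<float, 2, true> mat({numRows, numCols});
//   // ... fill bias and mat ...
//   runSumAlongColumns(bias, mat, stream);    // mat[r][c] += bias[c]
//   runAssignAlongColumns(bias, mat, stream); // mat[r][c] = bias[c]
//
//   DeviceTensor<float, 1, true> rowVals({numRows});
//   runSumAlongRows(rowVals, mat, stream);    // mat[r][c] += rowVals[r]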

} } // namespace