BroadcastSum.cu
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include <algorithm>
#include "../../FaissAssert.h"

#include "../utils/DeviceUtils.h"
#include "../utils/MathOperators.cuh"
#include "../utils/Tensor.cuh"
#include "../utils/StaticUtils.h"

namespace faiss { namespace gpu {

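// Adds the 1-d vector `input` into every row of the 2-d tensor `output`;
// `input` has one element per column (output[row][col] += input[col]).
// Each block owns a kRowsPerBlock x (blockDim.x * kColLoad) tile. Interior
// tiles take a fully unrolled fast path; the blocks on the last row/column
// of the grid fall back to bounds-checked loops for the ragged edge.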
template <typename T, int kRowsPerBlock, int kRowUnroll, int kColLoad>
__global__ void sumAlongColumns(Tensor<T, 1, true> input,
                                Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for
  // updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // FIXME: if we have exact multiples, don't need this
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        for (int row = rowStart; row < output.getSize(0); ++row) {
          T out = output[row][col].ldg();
          out = Math<T>::add(out, val);
          output[row][col] = out;
        }
      } else {
        T rows[kRowUnroll];

        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = output[row + i][col].ldg();
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = Math<T>::add(rows[i], val);
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = rows[i];
          }
        }
      }
    }
  } else {
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          T out = output[row][col + i * blockDim.x].ldg();
          out = Math<T>::add(out, val[i]);
          output[row][col + i * blockDim.x] = out;
        }
      }
    } else {
      T rows[kRowUnroll * kColLoad];

      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              output[row + i][col + j * blockDim.x].ldg();
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              Math<T>::add(rows[i * kColLoad + j], val[j]);
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] =
              rows[i * kColLoad + j];
          }
        }
      }
    }
  }
}

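// Broadcast-assign version of the kernel above: identical tiling, but it
// overwrites rather than accumulates (output[row][col] = input[col]).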
template <typename T, int kRowsPerBlock, int kRowUnroll, int kColLoad>
__global__ void assignAlongColumns(Tensor<T, 1, true> input,
                                   Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for
  // updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // FIXME: if we have exact multiples, don't need this
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        for (int row = rowStart; row < output.getSize(0); ++row) {
          output[row][col] = val;
        }
      } else {
        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = val;
          }
        }
      }
    }
  } else {
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          output[row][col + i * blockDim.x] = val[i];
        }
      }
    } else {
      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = val[j];
          }
        }
      }
    }
  }
}

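// Adds input[row] to every element of the corresponding output row
// (output[row][col] += input[row]). One block per row; thread 0 stages the
// row's addend in shared memory so every thread can read it without an
// extra global load.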
template <typename T, typename TVec>
__global__ void sumAlongRows(Tensor<T, 1, true> input,
                             Tensor<TVec, 2, true> output) {
  __shared__ T sval;

  int row = blockIdx.x;

  if (threadIdx.x == 0) {
    sval = input[row];
  }

  __syncthreads();

  T val = sval;

  // FIXME: speed up
  for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) {
    TVec out = output[row][i];
    out = Math<TVec>::add(out, val);
    output[row][i] = out;
  }
}

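// Host-side launcher for the column broadcast. If both tensors can be
// reinterpreted as the wider vector type TVec (e.g. float -> float4),
// launch on the vectorized views to reduce the number of memory
// transactions; otherwise fall back to the scalar kernel.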
template <typename T, typename TVec>
void runSumAlongColumns(Tensor<T, 1, true>& input,
                        Tensor<T, 2, true>& output,
                        cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_TEST_ERROR();
}

void runSumAlongColumns(Tensor<float, 1, true>& input,
                        Tensor<float, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<float, float4>(input, output, stream);
}

#ifdef FAISS_USE_FLOAT16
void runSumAlongColumns(Tensor<half, 1, true>& input,
                        Tensor<half, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<half, half2>(input, output, stream);
}
#endif

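// Launcher for the broadcast-assign kernel; the dispatch logic is the same
// as runSumAlongColumns above.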
template <typename T, typename TVec>
void runAssignAlongColumns(Tensor<T, 1, true>& input,
                           Tensor<T, 2, true>& output,
                           cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_TEST_ERROR();
}

void runAssignAlongColumns(Tensor<float, 1, true>& input,
                           Tensor<float, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<float, float4>(input, output, stream);
}

#ifdef FAISS_USE_FLOAT16
void runAssignAlongColumns(Tensor<half, 1, true>& input,
                           Tensor<half, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<half, half2>(input, output, stream);
}
#endif

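// Launcher for the row broadcast. Only the output is vectorized here: the
// addend input[row] is a single scalar shared by the whole row, so the
// input tensor stays in its element type T.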
template <typename T, typename TVec>
void runSumAlongRows(Tensor<T, 1, true>& input,
                     Tensor<T, 2, true>& output,
                     cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(0));

  if (output.template canCastResize<TVec>()) {
    auto outputV = output.template castResize<TVec>();

    int threadsPerBlock =
      std::min(outputV.getSize(1), getMaxThreadsCurrentDevice());
    auto grid = dim3(outputV.getSize(0));
    auto block = dim3(threadsPerBlock);

    sumAlongRows<T, TVec><<<grid, block, 0, stream>>>(input, outputV);
  } else {
    int threadsPerBlock =
      std::min(output.getSize(1), getMaxThreadsCurrentDevice());
    auto grid = dim3(output.getSize(0));
    auto block = dim3(threadsPerBlock);

    sumAlongRows<T, T><<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_TEST_ERROR();
}

void runSumAlongRows(Tensor<float, 1, true>& input,
                     Tensor<float, 2, true>& output,
                     cudaStream_t stream) {
  runSumAlongRows<float, float4>(input, output, stream);
}

#ifdef FAISS_USE_FLOAT16
void runSumAlongRows(Tensor<half, 1, true>& input,
                     Tensor<half, 2, true>& output,
                     cudaStream_t stream) {
  runSumAlongRows<half, half2>(input, output, stream);
}
#endif

} } // namespace
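A short usage sketch follows. Broadcast adds like these are the usual last
step of a brute-force squared-L2 computation: a GEMM produces the
-2<q_i, x_j> cross terms, then the per-vector and per-query squared norms
are broadcast in along columns and rows respectively. Everything below is
illustrative: finishL2Distances is a hypothetical caller (not part of
Faiss), and the include assumes the launchers above are declared in a
sibling BroadcastSum.cuh header.

// Hypothetical example (not part of this file): complete
//   dist[i][j] = ||q_i||^2 - 2 <q_i, x_j> + ||x_j||^2
// given a distance matrix already holding the -2 <q_i, x_j> terms.
#include "BroadcastSum.cuh"

void finishL2Distances(faiss::gpu::Tensor<float, 2, true>& distances,  // (numQueries, numVecs)
                       faiss::gpu::Tensor<float, 1, true>& queryNorms, // (numQueries): ||q_i||^2
                       faiss::gpu::Tensor<float, 1, true>& vecNorms,   // (numVecs): ||x_j||^2
                       cudaStream_t stream) {
  // distances[i][j] += vecNorms[j]: one addend per column
  faiss::gpu::runSumAlongColumns(vecNorms, distances, stream);

  // distances[i][j] += queryNorms[i]: one addend per row
  faiss::gpu::runSumAlongRows(queryNorms, distances, stream);
}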