#include "../../FaissAssert.h"
#include "../utils/DeviceUtils.h"
#include "../utils/MathOperators.cuh"
#include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh"

#include <algorithm>
19 namespace faiss {
namespace gpu {
// Computes output[row][col] += input[col] for every row of output,
// i.e. broadcast-adds a 1-d vector along the columns of a 2-d matrix.
//
// Expected launch layout:
//   blockIdx.x: which chunk of kRowsPerBlock rows this block updates
//   blockIdx.y: which chunk of (blockDim.x * kColLoad) columns it updates
// Only the final block along each grid dimension may cover a partial
// tile; all interior blocks take the fully-unrolled fast path.
template <typename T,
          int kRowsPerBlock,
          int kRowUnroll,
          int kColLoad>
__global__ void sumAlongColumns(Tensor<T, 1, true> input,
                                Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // Are we the block that might see a partial set of rows/columns?
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      // Rows divide evenly into blocks; the last row block is full too.
      endRow = false;
    }
  }

  if (endCol) {
    // Partial column tile: stride threads over the remaining columns
    // with a bounds check instead of the unrolled kColLoad loads.
    for (int col = colStart + threadIdx.x; col < input.getSize(0);
         col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        // Partial row tile as well: plain bounds-checked row loop.
        for (int row = rowStart; row < output.getSize(0); ++row) {
          T out = output[row][col];
          out = Math<T>::add(out, val);
          output[row][col] = out;
        }
      } else {
        // Full row tile: process kRowUnroll rows at a time, batching
        // loads, adds and stores to expose memory-level parallelism.
        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
          T rows[kRowUnroll];

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = output[row + i][col];
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = Math<T>::add(rows[i], val);
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = rows[i];
          }
        }
      }
    }
  } else {
    // Full column tile: each thread owns kColLoad columns spaced
    // blockDim.x apart, keeping the per-warp accesses coalesced.
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      // Partial row tile: bounds-checked row loop over full columns.
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          T out = output[row][col + i * blockDim.x];
          out = Math<T>::add(out, val[i]);
          output[row][col + i * blockDim.x] = out;
        }
      }
    } else {
      // Full tile in both dimensions: fully-unrolled kRowUnroll x
      // kColLoad register block of loads, adds and stores.
      T rows[kRowUnroll * kColLoad];

      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] = output[row + i][col + j * blockDim.x];
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              Math<T>::add(rows[i * kColLoad + j], val[j]);
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = rows[i * kColLoad + j];
          }
        }
      }
    }
  }
}
// Computes output[row][col] = input[col] for every row of output,
// i.e. broadcast-assigns a 1-d vector along the columns of a 2-d matrix.
//
// Same launch layout and tiling strategy as sumAlongColumns:
//   blockIdx.x: which chunk of kRowsPerBlock rows this block writes
//   blockIdx.y: which chunk of (blockDim.x * kColLoad) columns it writes
// Only the final block along each grid dimension may cover a partial tile.
template <typename T,
          int kRowsPerBlock,
          int kRowUnroll,
          int kColLoad>
__global__ void assignAlongColumns(Tensor<T, 1, true> input,
                                   Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // Are we the block that might see a partial set of rows/columns?
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      // Rows divide evenly into blocks; the last row block is full too.
      endRow = false;
    }
  }

  if (endCol) {
    // Partial column tile: stride threads over the remaining columns
    // with a bounds check instead of the unrolled kColLoad loads.
    for (int col = colStart + threadIdx.x; col < input.getSize(0);
         col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        // Partial row tile as well: plain bounds-checked row loop.
        for (int row = rowStart; row < output.getSize(0); ++row) {
          output[row][col] = val;
        }
      } else {
        // Full row tile: unrolled stores, kRowUnroll rows at a time.
        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = val;
          }
        }
      }
    }
  } else {
    // Full column tile: each thread owns kColLoad columns spaced
    // blockDim.x apart, keeping the per-warp accesses coalesced.
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      // Partial row tile: bounds-checked row loop over full columns.
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          output[row][col + i * blockDim.x] = val[i];
        }
      }
    } else {
      // Full tile in both dimensions: fully-unrolled stores.
      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = val[j];
          }
        }
      }
    }
  }
}
// Computes output[row][i] += input[row] for all columns i, i.e.
// broadcast-adds one scalar per row across that row of the matrix.
//
// Expected launch layout: one block per row (blockIdx.x == row), with
// threads striding over the row's columns. output may be viewed with a
// vectorized element type TVec for wider loads/stores.
template <typename T,
          typename TVec>
__global__ void sumAlongRows(Tensor<T, 1, true> input,
                             Tensor<TVec, 2, true> output) {
  // Broadcast the row's scalar through shared memory so only one
  // thread reads it from global memory.
  __shared__ T sval;

  int row = blockIdx.x;

  if (threadIdx.x == 0) {
    sval = input[row];
  }

  // All threads must wait for the shared value to be written.
  __syncthreads();

  T val = sval;

  for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) {
    TVec out = output[row][i];
    out = Math<TVec>::add(out, val);
    output[row][i] = out;
  }
}
// Host launcher for sumAlongColumns: output[r][c] += input[c].
// input is a 1-d vector whose length must equal output's column count.
// Uses the vectorized element type TVec when both tensors are suitably
// sized/aligned for it (checked via canCastResize), halving/quartering
// the number of memory transactions.
template <typename T,
          typename TVec>
void runSumAlongColumns(Tensor<T, 1, true>& input,
                        Tensor<T, 2, true>& output,
                        cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    // Vectorized path: reinterpret both tensors as TVec elements.
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    // Scalar fallback when vectorized reinterpretation is not possible.
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }
}
// float specialization: vectorize with float4 when possible.
void runSumAlongColumns(Tensor<float, 1, true>& input,
                        Tensor<float, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<float, float4>(input, output, stream);
}
#ifdef FAISS_USE_FLOAT16
// half specialization: vectorize with half2 when possible.
void runSumAlongColumns(Tensor<half, 1, true>& input,
                        Tensor<half, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<half, half2>(input, output, stream);
}
#endif
// Host launcher for assignAlongColumns: output[r][c] = input[c].
// input is a 1-d vector whose length must equal output's column count.
// Uses the vectorized element type TVec when both tensors are suitably
// sized/aligned for it (checked via canCastResize).
template <typename T,
          typename TVec>
void runAssignAlongColumns(Tensor<T, 1, true>& input,
                           Tensor<T, 2, true>& output,
                           cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    // Vectorized path: reinterpret both tensors as TVec elements.
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    // Scalar fallback when vectorized reinterpretation is not possible.
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }
}
// float specialization: vectorize with float4 when possible.
void runAssignAlongColumns(Tensor<float, 1, true>& input,
                           Tensor<float, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<float, float4>(input, output, stream);
}
#ifdef FAISS_USE_FLOAT16
// half specialization: vectorize with half2 when possible.
void runAssignAlongColumns(Tensor<half, 1, true>& input,
                           Tensor<half, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<half, half2>(input, output, stream);
}
#endif
// Host launcher for sumAlongRows: output[r][c] += input[r].
// input is a 1-d vector whose length must equal output's row count.
// Launches one block per row; block size is capped at the device's
// maximum thread count. Vectorizes the output as TVec when possible.
template <typename T,
          typename TVec>
void runSumAlongRows(Tensor<T, 1, true>& input,
                     Tensor<T, 2, true>& output,
                     cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(0));

  if (output.template canCastResize<TVec>()) {
    // Vectorized path: reinterpret the output rows as TVec elements.
    auto outputV = output.template castResize<TVec>();

    int threadsPerBlock =
      std::min(outputV.getSize(1), getMaxThreadsCurrentDevice());
    auto grid = dim3(outputV.getSize(0));
    auto block = dim3(threadsPerBlock);

    sumAlongRows<T, TVec><<<grid, block, 0, stream>>>(input, outputV);
  } else {
    // Scalar fallback when vectorized reinterpretation is not possible.
    int threadsPerBlock =
      std::min(output.getSize(1), getMaxThreadsCurrentDevice());
    auto grid = dim3(output.getSize(0));
    auto block = dim3(threadsPerBlock);

    sumAlongRows<T, T><<<grid, block, 0, stream>>>(input, output);
  }
}
// float specialization: vectorize with float4 when possible.
void runSumAlongRows(Tensor<float, 1, true>& input,
                     Tensor<float, 2, true>& output,
                     cudaStream_t stream) {
  runSumAlongRows<float, float4>(input, output, stream);
}
356 #ifdef FAISS_USE_FLOAT16
357 void runSumAlongRows(Tensor<half, 1, true>& input,
358 Tensor<half, 2, true>& output,
359 cudaStream_t stream) {
360 runSumAlongRows<half, half2>(input, output, stream);