#include "../../FaissAssert.h"
#include "../utils/DeviceUtils.h"
#include "../utils/MathOperators.cuh"
#include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh"

#include <algorithm>
20 namespace faiss {
namespace gpu {
// Adds the 1-d `input` vector to every row of the 2-d `output`
// (output[r][c] += input[c]).
//
// Grid layout: blockIdx.x indexes a chunk of kRowsPerBlock rows,
// blockIdx.y indexes a chunk of blockDim.x * kColLoad columns.
// Interior blocks take a fully-unrolled fast path; the last row/col
// blocks take a bounds-checked slow path.
template <typename T,
          int kRowsPerBlock,
          int kRowUnroll,
          int kColLoad>
__global__ void sumAlongColumns(Tensor<T, 1, true> input,
                                Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // Which chunk of rows/columns this block is responsible for
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // Are we the block that may run past the edge of the data?
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  // If the row count is an exact multiple, even the last row block can
  // take the unrolled path
  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    // Slow path: bounds-checked column loop
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        // Bounds-checked row loop as well
        for (int row = rowStart; row < output.getSize(0); ++row) {
          T out = output[row][col].ldg();
          out = Math<T>::add(out, val);
          output[row][col] = out;
        }
      } else {
        // Unroll rows; batch loads, adds and stores to improve ILP
        T rows[kRowUnroll];

        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = output[row + i][col].ldg();
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = Math<T>::add(rows[i], val);
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = rows[i];
          }
        }
      }
    }
  } else {
    // Fast path: this block's kColLoad column loads are all in bounds
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          T out = output[row][col + i * blockDim.x].ldg();
          out = Math<T>::add(out, val[i]);
          output[row][col + i * blockDim.x] = out;
        }
      }
    } else {
      // Fully unrolled row x column tile
      T rows[kRowUnroll * kColLoad];

      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              output[row + i][col + j * blockDim.x].ldg();
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              Math<T>::add(rows[i * kColLoad + j], val[j]);
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] =
              rows[i * kColLoad + j];
          }
        }
      }
    }
  }
}
// Assigns the 1-d `input` vector into every row of the 2-d `output`
// (output[r][c] = input[c]).
//
// Grid layout matches sumAlongColumns: blockIdx.x indexes a chunk of
// kRowsPerBlock rows, blockIdx.y indexes a chunk of
// blockDim.x * kColLoad columns; edge blocks take bounds-checked paths.
template <typename T,
          int kRowsPerBlock,
          int kRowUnroll,
          int kColLoad>
__global__ void assignAlongColumns(Tensor<T, 1, true> input,
                                   Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // Which chunk of rows/columns this block is responsible for
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // Are we the block that may run past the edge of the data?
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  // Exact row multiple: the last row block can still unroll
  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    // Slow path: bounds-checked column loop
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        for (int row = rowStart; row < output.getSize(0); ++row) {
          output[row][col] = val;
        }
      } else {
        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = val;
          }
        }
      }
    }
  } else {
    // Fast path: this block's kColLoad column loads are all in bounds
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          output[row][col + i * blockDim.x] = val[i];
        }
      }
    } else {
      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = val[j];
          }
        }
      }
    }
  }
}
// Adds input[row] to every element of output[row]
// (output[r][c] += input[r]).
//
// Grid layout: one block per row (blockIdx.x == row); threads stride
// across the columns of that row. Thread 0 loads the row's scalar into
// shared memory and broadcasts it to the block via __syncthreads().
template <typename T,
          typename TVec>
__global__ void sumAlongRows(Tensor<T, 1, true> input,
                             Tensor<TVec, 2, true> output) {
  __shared__ T sval;

  int row = blockIdx.x;

  if (threadIdx.x == 0) {
    // Single load per block; broadcast through shared memory
    sval = input[row];
  }

  // All threads must wait for the broadcast value before reading it
  __syncthreads();

  T val = sval;

  // FIXME: speed up
  for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) {
    TVec out = output[row][i];
    out = Math<TVec>::add(out, val);
    output[row][i] = out;
  }
}
// Host launcher for sumAlongColumns: output[r][c] += input[c].
// Requires input length == output column count. Uses the vectorized
// (TVec) kernel when both tensors can be reinterpreted as TVec;
// launches on `stream` and checks for launch errors.
template <typename T,
          typename TVec>
void runSumAlongColumns(Tensor<T, 1, true>& input,
                        Tensor<T, 2, true>& output,
                        cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    // Vectorized path: wider loads/stores via TVec
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    // Scalar fallback
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_VERIFY(cudaGetLastError());
}
// float specialization; vectorizes with float4 when alignment permits
void runSumAlongColumns(Tensor<float, 1, true>& input,
                        Tensor<float, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<float, float4>(input, output, stream);
}
#ifdef FAISS_USE_FLOAT16
// half specialization; vectorizes with half2 when alignment permits
void runSumAlongColumns(Tensor<half, 1, true>& input,
                        Tensor<half, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<half, half2>(input, output, stream);
}
#endif
// Host launcher for assignAlongColumns: output[r][c] = input[c].
// Requires input length == output column count. Uses the vectorized
// (TVec) kernel when both tensors can be reinterpreted as TVec;
// launches on `stream` and checks for launch errors.
template <typename T,
          typename TVec>
void runAssignAlongColumns(Tensor<T, 1, true>& input,
                           Tensor<T, 2, true>& output,
                           cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    // Vectorized path: wider loads/stores via TVec
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    // Scalar fallback
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_VERIFY(cudaGetLastError());
}
// float specialization; vectorizes with float4 when alignment permits
void runAssignAlongColumns(Tensor<float, 1, true>& input,
                           Tensor<float, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<float, float4>(input, output, stream);
}
#ifdef FAISS_USE_FLOAT16
// half specialization; vectorizes with half2 when alignment permits
void runAssignAlongColumns(Tensor<half, 1, true>& input,
                           Tensor<half, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<half, half2>(input, output, stream);
}
#endif
// Host launcher for sumAlongRows: output[r][c] += input[r].
// Requires input length == output row count. One block per row;
// the block width is capped at the device's max threads per block.
// Uses the vectorized (TVec) kernel when the output can be
// reinterpreted as TVec; launches on `stream` and checks for errors.
template <typename T,
          typename TVec>
void runSumAlongRows(Tensor<T, 1, true>& input,
                     Tensor<T, 2, true>& output,
                     cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(0));

  if (output.template canCastResize<TVec>()) {
    // Vectorized path: wider loads/stores via TVec
    auto outputV = output.template castResize<TVec>();

    int threadsPerBlock =
      std::min(outputV.getSize(1), getMaxThreadsCurrentDevice());
    auto grid = dim3(outputV.getSize(0));
    auto block = dim3(threadsPerBlock);

    sumAlongRows<T, TVec><<<grid, block, 0, stream>>>(input, outputV);
  } else {
    // Scalar fallback
    int threadsPerBlock =
      std::min(output.getSize(1), getMaxThreadsCurrentDevice());
    auto grid = dim3(output.getSize(0));
    auto block = dim3(threadsPerBlock);

    sumAlongRows<T, T><<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_VERIFY(cudaGetLastError());
}
// float specialization; vectorizes with float4 when alignment permits
void runSumAlongRows(Tensor<float, 1, true>& input,
                     Tensor<float, 2, true>& output,
                     cudaStream_t stream) {
  runSumAlongRows<float, float4>(input, output, stream);
}
#ifdef FAISS_USE_FLOAT16
// half specialization; vectorizes with half2 when alignment permits
void runSumAlongRows(Tensor<half, 1, true>& input,
                     Tensor<half, 2, true>& output,
                     cudaStream_t stream) {
  runSumAlongRows<half, half2>(input, output, stream);
}
#endif