11 #include "../../FaissAssert.h"
13 #include "../utils/DeviceUtils.h"
14 #include "../utils/MathOperators.cuh"
15 #include "../utils/Tensor.cuh"
16 #include "../utils/StaticUtils.h"
18 namespace faiss {
namespace gpu {
// output[i][j] += input[j] for every row i: broadcast-add a 1-d vector
// along the columns of a 2-d tensor.
//
// Grid layout: blockIdx.x selects a chunk of kRowsPerBlock rows,
// blockIdx.y selects a chunk of blockDim.x * kColLoad columns. Interior
// blocks take fully-unrolled paths; only the last block in each
// dimension performs bounds checking.
template <typename T,
          int kRowsPerBlock,
          int kRowUnroll,
          int kColLoad>
__global__ void sumAlongColumns(Tensor<T, 1, true> input,
                                Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // Rows and columns this block is responsible for updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // Whether this is the (possibly partial) last block along each
  // dimension, requiring bounds checks
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    // If rows divide evenly, the last row block is still full and can
    // use the unrolled path
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    // Possibly-partial column chunk: bounds-checked column loop
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0);
         col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        // Partial row chunk as well: plain bounds-checked row loop
        for (int row = rowStart; row < output.getSize(0); ++row) {
          T out = output[row][col];
          out = Math<T>::add(out, val);
          output[row][col] = out;
        }
      } else {
        // Full row chunk: unroll by kRowUnroll, batching the loads,
        // adds and stores to expose ILP
        T rows[kRowUnroll];

        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = output[row + i][col];
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = Math<T>::add(rows[i], val);
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = rows[i];
          }
        }
      }
    }
  } else {
    // Full column chunk: each thread owns kColLoad columns spaced
    // blockDim.x apart; no column bounds checks needed
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      // Partial row chunk: bounds-checked row loop, unrolled columns
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          T out = output[row][col + i * blockDim.x];
          out = Math<T>::add(out, val[i]);
          output[row][col + i * blockDim.x] = out;
        }
      }
    } else {
      // Full row and column chunks: fully unrolled load/add/store
      T rows[kRowUnroll * kColLoad];

      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] = output[row + i][col + j * blockDim.x];
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              Math<T>::add(rows[i * kColLoad + j], val[j]);
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = rows[i * kColLoad + j];
          }
        }
      }
    }
  }
}
// output[i][j] = input[j] for every row i: broadcast-assign a 1-d
// vector along the columns of a 2-d tensor.
//
// Grid layout matches sumAlongColumns: blockIdx.x selects a chunk of
// kRowsPerBlock rows, blockIdx.y a chunk of blockDim.x * kColLoad
// columns; only end blocks bounds check. No loads from output are
// needed since we overwrite it.
template <typename T,
          int kRowsPerBlock,
          int kRowUnroll,
          int kColLoad>
__global__ void assignAlongColumns(Tensor<T, 1, true> input,
                                   Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // Rows and columns this block is responsible for updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // Whether this is the (possibly partial) last block along each
  // dimension, requiring bounds checks
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    // If rows divide evenly, the last row block is still full
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    // Possibly-partial column chunk: bounds-checked column loop
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0);
         col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        // Bounds-checked row loop for the partial row chunk
        for (int row = rowStart; row < output.getSize(0); ++row) {
          output[row][col] = val;
        }
      } else {
        // Full row chunk: unrolled stores
        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = val;
          }
        }
      }
    }
  } else {
    // Full column chunk: each thread owns kColLoad columns spaced
    // blockDim.x apart
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      // Bounds-checked row loop, unrolled column stores
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          output[row][col + i * blockDim.x] = val[i];
        }
      }
    } else {
      // Full row and column chunks: fully unrolled stores
      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = val[j];
          }
        }
      }
    }
  }
}
// output[i][j] += input[i]: broadcast-add a 1-d vector along the rows
// of a 2-d tensor. If ZeroClamp, negative results are clamped to zero.
//
// Grid layout: one block per row (blockIdx.x == row); threads stride
// across the columns of that row.
template <typename T, bool ZeroClamp>
__global__ void sumAlongRows(Tensor<T, 1, true> input,
                             Tensor<T, 2, true> output) {
  __shared__ T sval;

  int row = blockIdx.x;

  // One thread loads the per-row value; broadcast via shared memory
  if (threadIdx.x == 0) {
    sval = input[row];
  }

  __syncthreads();

  T val = sval;

  for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) {
    T out = output[row][i];
    out = Math<T>::add(out, val);

    // ZeroClamp is a compile-time constant; the branch is free
    if (ZeroClamp) {
      out = Math<T>::lt(out, Math<T>::zero()) ? Math<T>::zero() : out;
    }

    output[row][i] = out;
  }
}
// Host launcher for sumAlongColumns: output[i][j] += input[j].
// Attempts a vectorized (TVec) kernel when both tensors can be
// reinterpreted at the wider type; otherwise launches the scalar form.
template <typename T, typename TVec>
void runSumAlongColumns(Tensor<T, 1, true>& input,
                        Tensor<T, 2, true>& output,
                        cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    // Both tensors are aligned/sized for vectorized loads and stores
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }
}
// float specialization: uses float4 vectorized loads/stores when the
// tensors permit.
void runSumAlongColumns(Tensor<float, 1, true>& input,
                        Tensor<float, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<float, float4>(input, output, stream);
}
#ifdef FAISS_USE_FLOAT16
// half specialization: uses half2 vectorized loads/stores when the
// tensors permit.
void runSumAlongColumns(Tensor<half, 1, true>& input,
                        Tensor<half, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<half, half2>(input, output, stream);
}
#endif
// Host launcher for assignAlongColumns: output[i][j] = input[j].
// Attempts a vectorized (TVec) kernel when both tensors can be
// reinterpreted at the wider type; otherwise launches the scalar form.
template <typename T, typename TVec>
void runAssignAlongColumns(Tensor<T, 1, true>& input,
                           Tensor<T, 2, true>& output,
                           cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    // Both tensors are aligned/sized for vectorized loads and stores
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }
}
// float specialization: uses float4 vectorized loads/stores when the
// tensors permit.
void runAssignAlongColumns(Tensor<float, 1, true>& input,
                           Tensor<float, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<float, float4>(input, output, stream);
}
#ifdef FAISS_USE_FLOAT16
// half specialization: uses half2 vectorized loads/stores when the
// tensors permit.
void runAssignAlongColumns(Tensor<half, 1, true>& input,
                           Tensor<half, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<half, half2>(input, output, stream);
}
#endif
// Host launcher for sumAlongRows: output[i][j] += input[i], optionally
// clamping negative results to zero (zeroClamp). One block per row;
// block width capped by the device's max threads per block.
template <typename T>
void runSumAlongRows(Tensor<T, 1, true>& input,
                     Tensor<T, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(0));

  int threadsPerBlock =
    std::min(output.getSize(1), getMaxThreadsCurrentDevice());
  auto grid = dim3(output.getSize(0));
  auto block = dim3(threadsPerBlock);

  // Select the clamp behavior at compile time via the template flag
  if (zeroClamp) {
    sumAlongRows<T, true><<<grid, block, 0, stream>>>(input, output);
  } else {
    sumAlongRows<T, false><<<grid, block, 0, stream>>>(input, output);
  }
}
// float specialization of runSumAlongRows.
void runSumAlongRows(Tensor<float, 1, true>& input,
                     Tensor<float, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  runSumAlongRows<float>(input, output, zeroClamp, stream);
}
#ifdef FAISS_USE_FLOAT16
// half specialization of runSumAlongRows.
void runSumAlongRows(Tensor<half, 1, true>& input,
                     Tensor<half, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  runSumAlongRows<half>(input, output, zeroClamp, stream);
}
#endif