#include <algorithm>

#include "../../FaissAssert.h"
#include "../utils/DeviceUtils.h"
#include "../utils/MathOperators.cuh"
#include "../utils/StaticUtils.h"
#include "../utils/Tensor.cuh"
namespace faiss {
namespace gpu {

// Computes output[i][j] += input[j]: broadcast-adds the 1-d input
// vector to every row of the 2-d output.
//
// Grid layout: blockIdx.x selects a chunk of kRowsPerBlock rows,
// blockIdx.y selects a chunk of blockDim.x * kColLoad columns. Only
// the last block in each grid dimension may see a partial chunk; all
// interior blocks take the fully-unrolled fast path.
template <typename T,
          int kRowsPerBlock,
          int kRowUnroll,
          int kColLoad>
__global__
void sumAlongColumns(Tensor<T, 1, true> input,
                     Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // Are we the block that may see a partial row/column chunk?
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      // The last row chunk is in fact full; take the unrolled path
      endRow = false;
    }
  }

  if (endCol) {
    // Partial column chunk: bounds-check every column access
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0);
         col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        // Partial row chunk: bounds-check every row access
        for (int row = rowStart; row < output.getSize(0); ++row) {
          T out = output[row][col];
          out = Math<T>::add(out, val);
          output[row][col] = out;
        }
      } else {
        // Full row chunk: unroll by kRowUnroll
        T rows[kRowUnroll];

        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = output[row + i][col];
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = Math<T>::add(rows[i], val);
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = rows[i];
          }
        }
      }
    }
  } else {
    // Full column chunk: each thread owns kColLoad strided columns
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      // Partial row chunk: bounds-check every row access
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          T out = output[row][col + i * blockDim.x];
          out = Math<T>::add(out, val[i]);
          output[row][col + i * blockDim.x] = out;
        }
      }
    } else {
      // Full row and column chunks: fully unrolled read/add/write
      T rows[kRowUnroll * kColLoad];

      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              output[row + i][col + j * blockDim.x];
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              Math<T>::add(rows[i * kColLoad + j], val[j]);
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] =
              rows[i * kColLoad + j];
          }
        }
      }
    }
  }
}
// Computes output[i][j] = input[j]: broadcast-assigns the 1-d input
// vector across every row of the 2-d output.
//
// Grid layout: blockIdx.x selects a chunk of kRowsPerBlock rows,
// blockIdx.y selects a chunk of blockDim.x * kColLoad columns. Only
// the last block in each grid dimension may see a partial chunk.
template <typename T,
          int kRowsPerBlock,
          int kRowUnroll,
          int kColLoad>
__global__
void assignAlongColumns(Tensor<T, 1, true> input,
                        Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // Are we the block that may see a partial row/column chunk?
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      // The last row chunk is in fact full; take the unrolled path
      endRow = false;
    }
  }

  if (endCol) {
    // Partial column chunk: bounds-check every column access
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0);
         col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        // Partial row chunk: bounds-check every row access
        for (int row = rowStart; row < output.getSize(0); ++row) {
          output[row][col] = val;
        }
      } else {
        // Full row chunk: unroll by kRowUnroll
        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = val;
          }
        }
      }
    }
  } else {
    // Full column chunk: each thread owns kColLoad strided columns
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      // Partial row chunk: bounds-check every row access
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          output[row][col + i * blockDim.x] = val[i];
        }
      }
    } else {
      // Full row and column chunks: fully unrolled stores
      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = val[j];
          }
        }
      }
    }
  }
}
// Computes output[i][j] += input[i]: adds the per-row value input[row]
// to every element of the corresponding output row. If ZeroClamp,
// results below zero are clamped up to zero.
//
// Grid layout: one block per output row; threads stride along columns.
template <typename T, bool ZeroClamp>
__global__
void sumAlongRows(Tensor<T, 1, true> input,
                  Tensor<T, 2, true> output) {
  // Broadcast the row value through shared memory so only one thread
  // touches global memory for it
  __shared__ T sval;

  int row = blockIdx.x;

  if (threadIdx.x == 0) {
    sval = input[row];
  }

  // All threads must see the value written by thread 0
  __syncthreads();

  T val = sval;

  for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) {
    T out = output[row][i];
    out = Math<T>::add(out, val);

    // Compile-time branch: only instantiated when clamping is requested
    if (ZeroClamp) {
      out = Math<T>::lt(out, Math<T>::zero()) ? Math<T>::zero() : out;
    }

    output[row][i] = out;
  }
}
// Launches sumAlongColumns: output[i][j] += input[j].
// Uses the vectorized TVec path when both tensors can be reinterpreted
// as TVec (size/alignment permitting); otherwise falls back to scalar T.
template <typename T, typename TVec>
void runSumAlongColumns(Tensor<T, 1, true>& input,
                        Tensor<T, 2, true>& output,
                        cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    // Vectorized path: reinterpret both tensors as TVec
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    // Scalar fallback
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }
}
// Float specialization: vectorizes with float4 when possible.
void runSumAlongColumns(Tensor<float, 1, true>& input,
                        Tensor<float, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<float, float4>(input, output, stream);
}
#ifdef FAISS_USE_FLOAT16
// Half specialization: vectorizes with half2 when possible.
void runSumAlongColumns(Tensor<half, 1, true>& input,
                        Tensor<half, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<half, half2>(input, output, stream);
}
#endif
// Launches assignAlongColumns: output[i][j] = input[j].
// Uses the vectorized TVec path when both tensors can be reinterpreted
// as TVec (size/alignment permitting); otherwise falls back to scalar T.
template <typename T, typename TVec>
void runAssignAlongColumns(Tensor<T, 1, true>& input,
                           Tensor<T, 2, true>& output,
                           cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    // Vectorized path: reinterpret both tensors as TVec
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    // Scalar fallback
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }
}
// Float specialization: vectorizes with float4 when possible.
void runAssignAlongColumns(Tensor<float, 1, true>& input,
                           Tensor<float, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<float, float4>(input, output, stream);
}
#ifdef FAISS_USE_FLOAT16
// Half specialization: vectorizes with half2 when possible.
void runAssignAlongColumns(Tensor<half, 1, true>& input,
                           Tensor<half, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<half, half2>(input, output, stream);
}
#endif
// Launches sumAlongRows: output[i][j] += input[i].
// If zeroClamp is true, sums below zero are clamped up to zero.
// Dispatches zeroClamp to the kernel's compile-time template parameter
// so the per-element branch is resolved at compile time.
template <typename T>
void runSumAlongRows(Tensor<T, 1, true>& input,
                     Tensor<T, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(0));

  // One block per row; no more threads than columns or the device allows
  int threadsPerBlock =
    std::min(output.getSize(1), getMaxThreadsCurrentDevice());
  auto grid = dim3(output.getSize(0));
  auto block = dim3(threadsPerBlock);

  if (zeroClamp) {
    sumAlongRows<T, true><<<grid, block, 0, stream>>>(input, output);
  } else {
    sumAlongRows<T, false><<<grid, block, 0, stream>>>(input, output);
  }
}
// Float specialization; forwards to the templated implementation.
void runSumAlongRows(Tensor<float, 1, true>& input,
                     Tensor<float, 2, true>& output,
                     bool zeroClamp,
                     cudaStream_t stream) {
  runSumAlongRows<float>(input, output, zeroClamp, stream);
}
351 #ifdef FAISS_USE_FLOAT16
352 void runSumAlongRows(Tensor<half, 1, true>& input,
353 Tensor<half, 2, true>& output,
355 cudaStream_t stream) {
356 runSumAlongRows<half>(input, output, zeroClamp, stream);