/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include <algorithm>
#include "../../FaissAssert.h"

#include "../utils/DeviceUtils.h"
#include "../utils/MathOperators.cuh"
#include "../utils/Tensor.cuh"
#include "../utils/StaticUtils.h"

namespace faiss { namespace gpu {

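// sumAlongColumns: broadcast-add a 1-d vector across the rows of a 2-d
// tensor, i.e. output[row][col] += input[col] for every row. Each block
// updates a kRowsPerBlock x (blockDim.x * kColLoad) tile of the output.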
template <typename T, int kRowsPerBlock, int kRowUnroll, int kColLoad>
__global__ void sumAlongColumns(Tensor<T, 1, true> input,
                                Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for
  // updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // FIXME: if we have exact multiples, don't need this
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        for (int row = rowStart; row < output.getSize(0); ++row) {
          T out = output[row][col].ldg();
          out = Math<T>::add(out, val);
          output[row][col] = out;
        }
      } else {
        T rows[kRowUnroll];

        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = output[row + i][col].ldg();
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            rows[i] = Math<T>::add(rows[i], val);
          }

#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = rows[i];
          }
        }
      }
    }
  } else {
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          T out = output[row][col + i * blockDim.x].ldg();
          out = Math<T>::add(out, val[i]);
          output[row][col + i * blockDim.x] = out;
        }
      }
    } else {
      T rows[kRowUnroll * kColLoad];

      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              output[row + i][col + j * blockDim.x].ldg();
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            rows[i * kColLoad + j] =
              Math<T>::add(rows[i * kColLoad + j], val[j]);
          }
        }

#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] =
              rows[i * kColLoad + j];
          }
        }
      }
    }
  }
}

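// assignAlongColumns: broadcast-assign a 1-d vector into every row of a
// 2-d tensor, i.e. output[row][col] = input[col] for every row. Same
// tiling scheme as sumAlongColumns, but stores without a read-modify-write.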
template <typename T, int kRowsPerBlock, int kRowUnroll, int kColLoad>
__global__ void assignAlongColumns(Tensor<T, 1, true> input,
                                   Tensor<T, 2, true> output) {
  static_assert(kRowsPerBlock % kRowUnroll == 0, "must fit rows");

  // blockIdx.x: which chunk of rows we are responsible for updating
  // blockIdx.y: which chunk of columns we are responsible for
  // updating
  int rowStart = blockIdx.x * kRowsPerBlock;
  int rowEnd = rowStart + kRowsPerBlock;
  int colStart = blockIdx.y * blockDim.x * kColLoad;

  // FIXME: if we have exact multiples, don't need this
  bool endRow = (blockIdx.x == gridDim.x - 1);
  bool endCol = (blockIdx.y == gridDim.y - 1);

  if (endRow) {
    if (output.getSize(0) % kRowsPerBlock == 0) {
      endRow = false;
    }
  }

  if (endCol) {
    for (int col = colStart + threadIdx.x;
         col < input.getSize(0); col += blockDim.x) {
      T val = input[col];

      if (endRow) {
        for (int row = rowStart; row < output.getSize(0); ++row) {
          output[row][col] = val;
        }
      } else {
        for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
          for (int i = 0; i < kRowUnroll; ++i) {
            output[row + i][col] = val;
          }
        }
      }
    }
  } else {
    int col = colStart + threadIdx.x;

    T val[kColLoad];

#pragma unroll
    for (int i = 0; i < kColLoad; ++i) {
      val[i] = input[col + i * blockDim.x];
    }

    if (endRow) {
      for (int row = rowStart; row < output.getSize(0); ++row) {
#pragma unroll
        for (int i = 0; i < kColLoad; ++i) {
          output[row][col + i * blockDim.x] = val[i];
        }
      }
    } else {
      for (int row = rowStart; row < rowEnd; row += kRowUnroll) {
#pragma unroll
        for (int i = 0; i < kRowUnroll; ++i) {
#pragma unroll
          for (int j = 0; j < kColLoad; ++j) {
            output[row + i][col + j * blockDim.x] = val[j];
          }
        }
      }
    }
  }
}

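// sumAlongRows: add input[row] to every element of row `row` of the
// output, i.e. output[row][i] += input[row]. One thread block handles one
// row; the scalar is staged in shared memory and broadcast to all threads.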
template <typename T, typename TVec>
__global__ void sumAlongRows(Tensor<T, 1, true> input,
                             Tensor<TVec, 2, true> output) {
  __shared__ T sval;

  int row = blockIdx.x;

  if (threadIdx.x == 0) {
    sval = input[row];
  }

  __syncthreads();

  T val = sval;

  // FIXME: speed up
  for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) {
    TVec out = output[row][i];
    out = Math<TVec>::add(out, val);
    output[row][i] = out;
  }
}

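// Host-side launcher for the column-broadcast sum: output[i][j] += input[j],
// requiring input.getSize(0) == output.getSize(1). If both tensors can be
// reinterpreted as the vector type TVec (e.g. float4), the vectorized kernel
// is used; otherwise the scalar kernel is launched.
//
// Illustrative call (assumes `bias` is a 1-d and `distances` a 2-d device
// tensor of matching sizes, and `stream` is a valid CUDA stream):
//   runSumAlongColumns(bias, distances, stream); // distances[i][j] += bias[j]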
template <typename T, typename TVec>
void runSumAlongColumns(Tensor<T, 1, true>& input,
                        Tensor<T, 2, true>& output,
                        cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    sumAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_TEST_ERROR();
}

void runSumAlongColumns(Tensor<float, 1, true>& input,
                        Tensor<float, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<float, float4>(input, output, stream);
}

#ifdef FAISS_USE_FLOAT16
void runSumAlongColumns(Tensor<half, 1, true>& input,
                        Tensor<half, 2, true>& output,
                        cudaStream_t stream) {
  runSumAlongColumns<half, half2>(input, output, stream);
}
#endif

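// Host-side launcher for the column-broadcast assignment:
// output[i][j] = input[j], requiring input.getSize(0) == output.getSize(1).
// Uses the vectorized (TVec) kernel when both tensors permit it.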
template <typename T, typename TVec>
void runAssignAlongColumns(Tensor<T, 1, true>& input,
                           Tensor<T, 2, true>& output,
                           cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(1));

  int threadsPerBlock = 256;
  constexpr int kRowUnroll = 4;
  constexpr int kRowsPerBlock = kRowUnroll * 4;
  constexpr int kColLoad = 4;

  auto block = dim3(threadsPerBlock);

  if (input.template canCastResize<TVec>() &&
      output.template canCastResize<TVec>()) {
    auto inputV = input.template castResize<TVec>();
    auto outputV = output.template castResize<TVec>();

    auto grid =
      dim3(utils::divUp(outputV.getSize(0), kRowsPerBlock),
           utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<TVec, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(inputV, outputV);
  } else {
    auto grid =
      dim3(utils::divUp(output.getSize(0), kRowsPerBlock),
           utils::divUp(output.getSize(1), threadsPerBlock * kColLoad));

    assignAlongColumns<T, kRowsPerBlock, kRowUnroll, kColLoad>
      <<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_TEST_ERROR();
}

void runAssignAlongColumns(Tensor<float, 1, true>& input,
                           Tensor<float, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<float, float4>(input, output, stream);
}

#ifdef FAISS_USE_FLOAT16
void runAssignAlongColumns(Tensor<half, 1, true>& input,
                           Tensor<half, 2, true>& output,
                           cudaStream_t stream) {
  runAssignAlongColumns<half, half2>(input, output, stream);
}
#endif

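// Host-side launcher for the row-broadcast sum: output[i][j] += input[i],
// requiring input.getSize(0) == output.getSize(0). Launches one block per
// row and vectorizes the row data as TVec when the output allows it.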
template <typename T, typename TVec>
void runSumAlongRows(Tensor<T, 1, true>& input,
                     Tensor<T, 2, true>& output,
                     cudaStream_t stream) {
  FAISS_ASSERT(input.getSize(0) == output.getSize(0));

  if (output.template canCastResize<TVec>()) {
    auto outputV = output.template castResize<TVec>();

    int threadsPerBlock =
      std::min(outputV.getSize(1), getMaxThreadsCurrentDevice());
    auto grid = dim3(outputV.getSize(0));
    auto block = dim3(threadsPerBlock);

    sumAlongRows<T, TVec><<<grid, block, 0, stream>>>(input, outputV);
  } else {
    int threadsPerBlock =
      std::min(output.getSize(1), getMaxThreadsCurrentDevice());
    auto grid = dim3(output.getSize(0));
    auto block = dim3(threadsPerBlock);

    sumAlongRows<T, T><<<grid, block, 0, stream>>>(input, output);
  }

  CUDA_TEST_ERROR();
}

void runSumAlongRows(Tensor<float, 1, true>& input,
                     Tensor<float, 2, true>& output,
                     cudaStream_t stream) {
  runSumAlongRows<float, float4>(input, output, stream);
}

#ifdef FAISS_USE_FLOAT16
void runSumAlongRows(Tensor<half, 1, true>& input,
                     Tensor<half, 2, true>& output,
                     cudaStream_t stream) {
  runSumAlongRows<half, half2>(input, output, stream);
}
#endif

} } // namespace