remove deleted files from template change

Jeff Johnson 2020-03-25 10:57:57 -07:00
parent b9914eb30a
commit 64dd988443
2 changed files with 0 additions and 1164 deletions

faiss/gpu/impl/PQCodeDistances.cu

@@ -1,567 +0,0 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/impl/PQCodeDistances.cuh>
#include <faiss/gpu/impl/BroadcastSum.cuh>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/impl/L2Norm.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/DeviceDefs.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Float16.cuh>
#include <faiss/gpu/utils/MatrixMult.cuh>
#include <faiss/gpu/utils/PtxUtils.cuh>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/gpu/utils/Transpose.cuh>
namespace faiss { namespace gpu {
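// Maps the float accumulator type to the lookup table's storage type:
// identity for float tables, __float2half for half tables.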
template <typename T>
struct Converter {
};
template <>
struct Converter<half> {
inline static __device__ half to(float v) { return __float2half(v); }
};
template <>
struct Converter<float> {
inline static __device__ float to(float v) { return v; }
};
// Kernel responsible for calculating distance from residual vector to
// each product quantizer code centroid
template <typename OutCodeT, int DimsPerSubQuantizer, bool L2Distance>
__global__ void
__launch_bounds__(288, 4)
pqCodeDistances(Tensor<float, 2, true> queries,
int queriesPerBlock,
Tensor<float, 2, true> coarseCentroids,
Tensor<float, 3, true> pqCentroids,
Tensor<int, 2, true> topQueryToCentroid,
// (query id)(coarse)(subquantizer)(code) -> dist
Tensor<OutCodeT, 4, true> outCodeDistances) {
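// Grid mapping: blockIdx.x covers a tile of queries, blockIdx.y selects the
// sub quantizer. Threads [0, codesPerSubQuantizer) each compute one code
// distance; the remaining threads only prefetch residual data.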
const auto numSubQuantizers = pqCentroids.getSize(0);
const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
assert(DimsPerSubQuantizer == dimsPerSubQuantizer);
const auto codesPerSubQuantizer = pqCentroids.getSize(2);
bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer;
int loadingThreadId = threadIdx.x - codesPerSubQuantizer;
extern __shared__ float smem[];
// Each thread calculates a single code
float subQuantizerData[DimsPerSubQuantizer];
auto code = threadIdx.x;
auto subQuantizer = blockIdx.y;
// Each thread will load the pq centroid data for the code that it
// is processing
#pragma unroll
for (int i = 0; i < DimsPerSubQuantizer; ++i) {
subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg();
}
// Where we store our query vector
float* smemQuery = smem;
// Where we store our residual vector; this is double buffered so we
// can be loading the next one while processing the current one
float* smemResidual1 = &smemQuery[DimsPerSubQuantizer];
float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer];
// Where we pre-load the coarse centroid IDs
int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer];
// Each thread is calculating the distance for a single code,
// performing the reductions locally
// Handle multiple queries per block
auto startQueryId = blockIdx.x * queriesPerBlock;
auto numQueries = queries.getSize(0) - startQueryId;
if (numQueries > queriesPerBlock) {
numQueries = queriesPerBlock;
}
for (int query = 0; query < numQueries; ++query) {
auto queryId = startQueryId + query;
auto querySubQuantizer =
queries[queryId][subQuantizer * DimsPerSubQuantizer].data();
// Load current query vector
for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) {
smemQuery[i] = querySubQuantizer[i];
}
// Load list of coarse centroids found
for (int i = threadIdx.x;
i < topQueryToCentroid.getSize(1); i += blockDim.x) {
coarseIds[i] = topQueryToCentroid[queryId][i];
}
// We need coarseIds below
// FIXME: investigate loading separately, so we don't need this
__syncthreads();
// Preload first buffer of residual data
if (isLoadingThread) {
for (int i = loadingThreadId;
i < DimsPerSubQuantizer;
i += blockDim.x - codesPerSubQuantizer) {
auto coarseId = coarseIds[0];
// In case NaNs were in the original query data
coarseId = coarseId == -1 ? 0 : coarseId;
auto coarseCentroidSubQuantizer =
coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data();
if (L2Distance) {
smemResidual1[i] = smemQuery[i] - coarseCentroidSubQuantizer[i];
} else {
smemResidual1[i] = coarseCentroidSubQuantizer[i];
}
}
}
// The block walks the list for a single query
for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) {
// Wait for smemResidual1 to be loaded
__syncthreads();
if (isLoadingThread) {
// Preload second buffer of residual data
for (int i = loadingThreadId;
i < DimsPerSubQuantizer;
i += blockDim.x - codesPerSubQuantizer) {
// FIXME: try always making this centroid id 0 so we can
// terminate
if (coarse != (topQueryToCentroid.getSize(1) - 1)) {
auto coarseId = coarseIds[coarse + 1];
// In case NaNs were in the original query data
coarseId = coarseId == -1 ? 0 : coarseId;
auto coarseCentroidSubQuantizer =
coarseCentroids[coarseId]
[subQuantizer * dimsPerSubQuantizer].data();
if (L2Distance) {
smemResidual2[i] = smemQuery[i] - coarseCentroidSubQuantizer[i];
} else {
smemResidual2[i] = coarseCentroidSubQuantizer[i];
}
}
}
} else {
// These are the processing threads
float dist = 0.0f;
constexpr int kUnroll = 4;
constexpr int kRemainder = DimsPerSubQuantizer % kUnroll;
constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder;
float vals[kUnroll];
// Calculate residual - pqCentroid for each dim that we're
// processing
// Unrolled loop
if (L2Distance) {
#pragma unroll
for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] = smemResidual1[i * kUnroll + j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] -= subQuantizerData[i * kUnroll + j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] *= vals[j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
dist += vals[j];
}
}
} else {
// Inner product: query slice against the reconstructed sub-quantizer
// for this coarse cell (query o (centroid + subQCentroid))
#pragma unroll
for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] = smemResidual1[i * kUnroll + j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] += subQuantizerData[i * kUnroll + j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] *= smemQuery[i * kUnroll + j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
dist += vals[j];
}
}
}
// Remainder loop
if (L2Distance) {
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] = smemResidual1[kRemainderBase + j];
}
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] -= subQuantizerData[kRemainderBase + j];
}
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] *= vals[j];
}
} else {
// Inner product: query slice against the reconstructed sub-quantizer
// for this coarse cell (query o (centroid + subQCentroid))
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] = smemResidual1[kRemainderBase + j];
}
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] += subQuantizerData[kRemainderBase + j];
}
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] *= smemQuery[kRemainderBase + j];
}
}
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
dist += vals[j];
}
// We have the distance for our code; write it out
outCodeDistances[queryId][coarse][subQuantizer][code] =
Converter<OutCodeT>::to(dist);
} // !isLoadingThread
// Swap residual buffers
float* tmp = smemResidual1;
smemResidual1 = smemResidual2;
smemResidual2 = tmp;
}
}
}
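// Computes the (query - coarse centroid) residuals for every (query, probe)
// pair, writing them transposed as (sub q)(query id)(centroid id)(sub dim)
// so they can be fed to the batched GEMM path below.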
__global__ void
residualVector(Tensor<float, 2, true> queries,
Tensor<float, 2, true> coarseCentroids,
Tensor<int, 2, true> topQueryToCentroid,
int numSubDim,
// output is transposed:
// (sub q)(query id)(centroid id)(sub dim)
Tensor<float, 4, true> residual) {
// block x is query id
// block y is centroid id
// thread x is dim
auto queryId = blockIdx.x;
auto centroidId = blockIdx.y;
int realCentroidId = topQueryToCentroid[queryId][centroidId];
for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) {
float q = queries[queryId][dim];
float c = coarseCentroids[realCentroidId][dim];
residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] =
q - c;
}
}
void
runResidualVector(Tensor<float, 3, true>& pqCentroids,
Tensor<float, 2, true>& queries,
Tensor<float, 2, true>& coarseCentroids,
Tensor<int, 2, true>& topQueryToCentroid,
Tensor<float, 4, true>& residual,
cudaStream_t stream) {
auto grid =
dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1));
auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice()));
residualVector<<<grid, block, 0, stream>>>(
queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1),
residual);
CUDA_TEST_ERROR();
}
void
runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
Tensor<float, 2, true>& queries,
Tensor<float, 2, true>& coarseCentroids,
Tensor<int, 2, true>& topQueryToCentroid,
NoTypeTensor<4, true>& outCodeDistances,
bool useFloat16Lookup,
DeviceMemory& mem,
cublasHandle_t handle,
cudaStream_t stream) {
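// The L2 code distance decomposes as
//   ||r - pqCentroid||^2 = ||r||^2 - 2 r . pqCentroid + ||pqCentroid||^2
// where r = q - c is the coarse residual. Each term is computed in bulk
// below: residual norms, a batched GEMM for the cross term (alpha = -2),
// and PQ centroid norms, all summed into the output.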
// Calculate (q - c) residual vector
// (sub q)(query id)(centroid id)(sub dim)
DeviceTensor<float, 4, true> residual(
mem,
{pqCentroids.getSize(0),
topQueryToCentroid.getSize(0),
topQueryToCentroid.getSize(1),
pqCentroids.getSize(1)},
stream);
runResidualVector(pqCentroids, queries,
coarseCentroids, topQueryToCentroid,
residual, stream);
// Calculate ||q - c||^2
DeviceTensor<float, 1, true> residualNorms(
mem,
{pqCentroids.getSize(0) *
topQueryToCentroid.getSize(0) *
topQueryToCentroid.getSize(1)},
stream);
auto residualView2 = residual.view<2>(
{pqCentroids.getSize(0) *
topQueryToCentroid.getSize(0) *
topQueryToCentroid.getSize(1),
pqCentroids.getSize(1)});
runL2Norm(residualView2, true, residualNorms, true, stream);
// Perform a batch MM:
// (sub q) x {(q * c)(sub dim) x (sub dim)(code)} =>
// (sub q) x {(q * c)(code)}
auto residualView3 = residual.view<3>(
{pqCentroids.getSize(0),
topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
pqCentroids.getSize(1)});
DeviceTensor<float, 3, true> residualDistance(
mem,
{pqCentroids.getSize(0),
topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
pqCentroids.getSize(2)},
stream);
runIteratedMatrixMult(residualDistance, false,
residualView3, false,
pqCentroids, false,
-2.0f, 0.0f,
handle,
stream);
// Sum ||q - c||^2 along rows
auto residualDistanceView2 = residualDistance.view<2>(
{pqCentroids.getSize(0) *
topQueryToCentroid.getSize(0) *
topQueryToCentroid.getSize(1),
pqCentroids.getSize(2)});
runSumAlongRows(residualNorms, residualDistanceView2, false, stream);
Tensor<float, 4, true> outCodeDistancesF;
DeviceTensor<float, 4, true> outCodeDistancesFloatMem;
if (useFloat16Lookup) {
outCodeDistancesFloatMem = DeviceTensor<float, 4, true>(
mem, {outCodeDistances.getSize(0),
outCodeDistances.getSize(1),
outCodeDistances.getSize(2),
outCodeDistances.getSize(3)},
stream);
outCodeDistancesF = outCodeDistancesFloatMem;
} else {
outCodeDistancesF = outCodeDistances.toTensor<float>();
}
// Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which
// is where we build our output distances)
auto outCodeDistancesView = outCodeDistancesF.view<3>(
{topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
outCodeDistances.getSize(2),
outCodeDistances.getSize(3)});
runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream);
// Calculate code norms per each sub-dim
// (sub q)(sub dim)(code) is pqCentroids
// transpose to (sub q)(code)(sub dim)
DeviceTensor<float, 3, true> pqCentroidsTranspose(
mem,
{pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)},
stream);
runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream);
auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>(
{pqCentroids.getSize(0) * pqCentroids.getSize(2),
pqCentroids.getSize(1)});
DeviceTensor<float, 1, true> pqCentroidsNorm(
mem,
{pqCentroids.getSize(0) * pqCentroids.getSize(2)},
stream);
runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream);
// View output as (q * c)(sub q * code), and add centroid norm to
// each row
auto outDistancesCodeViewCols = outCodeDistancesView.view<2>(
{topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
outCodeDistances.getSize(2) * outCodeDistances.getSize(3)});
runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
if (useFloat16Lookup) {
// Need to convert back
auto outCodeDistancesH = outCodeDistances.toTensor<half>();
convertTensor<float, half, 4>(stream,
outCodeDistancesF,
outCodeDistancesH);
}
}
void
runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
Tensor<float, 2, true>& queries,
Tensor<float, 2, true>& coarseCentroids,
Tensor<int, 2, true>& topQueryToCentroid,
NoTypeTensor<4, true>& outCodeDistances,
bool l2Distance,
bool useFloat16Lookup,
cudaStream_t stream) {
const auto numSubQuantizers = pqCentroids.getSize(0);
const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
const auto codesPerSubQuantizer = pqCentroids.getSize(2);
// FIXME: tune
// Reuse of pq centroid data is based on both # of queries * nprobe,
// and we should really be tiling in both dimensions
constexpr int kQueriesPerBlock = 8;
auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock),
numSubQuantizers);
// Reserve one block of threads for double buffering
// FIXME: probably impractical for large # of dims?
auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize);
auto block = dim3(codesPerSubQuantizer + loadingThreads);
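// Shared memory: the query slice plus two residual buffers (3 * dims worth
// of floats), plus the per-query list of coarse centroid ids.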
auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
+ topQueryToCentroid.getSize(1) * sizeof(int);
#define RUN_CODE(DIMS, L2) \
do { \
if (useFloat16Lookup) { \
auto outCodeDistancesT = outCodeDistances.toTensor<half>(); \
\
pqCodeDistances<half, DIMS, L2><<<grid, block, smem, stream>>>( \
queries, kQueriesPerBlock, \
coarseCentroids, pqCentroids, \
topQueryToCentroid, outCodeDistancesT); \
} else { \
auto outCodeDistancesT = outCodeDistances.toTensor<float>(); \
\
pqCodeDistances<float, DIMS, L2><<<grid, block, smem, stream>>>( \
queries, kQueriesPerBlock, \
coarseCentroids, pqCentroids, \
topQueryToCentroid, outCodeDistancesT); \
} \
} while (0)
#define CODE_L2(DIMS) \
do { \
if (l2Distance) { \
RUN_CODE(DIMS, true); \
} else { \
RUN_CODE(DIMS, false); \
} \
} while (0)
switch (dimsPerSubQuantizer) {
case 1:
CODE_L2(1);
break;
case 2:
CODE_L2(2);
break;
case 3:
CODE_L2(3);
break;
case 4:
CODE_L2(4);
break;
case 6:
CODE_L2(6);
break;
case 8:
CODE_L2(8);
break;
case 10:
CODE_L2(10);
break;
case 12:
CODE_L2(12);
break;
case 16:
CODE_L2(16);
break;
case 20:
CODE_L2(20);
break;
case 24:
CODE_L2(24);
break;
case 28:
CODE_L2(28);
break;
case 32:
CODE_L2(32);
break;
// FIXME: larger sizes require too many registers - we need the
// MM implementation working
default:
FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer "
"not currently supported");
}
#undef RUN_CODE
#undef CODE_L2
CUDA_TEST_ERROR();
}
} } // namespace

faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu

@@ -1,597 +0,0 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/impl/PQCodeDistances.cuh>
#include <faiss/gpu/impl/PQCodeLoad.cuh>
#include <faiss/gpu/impl/IVFUtils.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Float16.cuh>
#include <faiss/gpu/utils/LoadStoreOperators.cuh>
#include <faiss/gpu/utils/NoTypeTensor.cuh>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/gpu/utils/HostTensor.cuh>
namespace faiss { namespace gpu {
// This must be kept in sync with PQCodeDistances.cu
bool isSupportedNoPrecomputedSubDimSize(int dims) {
switch (dims) {
case 1:
case 2:
case 3:
case 4:
case 6:
case 8:
case 10:
case 12:
case 16:
case 20:
case 24:
case 28:
case 32:
return true;
default:
// FIXME: larger sizes require too many registers - we need the
// MM implementation working
return false;
}
}
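// Loads the (sub quantizer, code) distance table for a single (query, probe)
// pair into shared memory, using vectorized loads when the table size allows
// aligned access.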
template <typename LookupT, typename LookupVecT>
struct LoadCodeDistances {
static inline __device__ void load(LookupT* smem,
LookupT* codes,
int numCodes) {
constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT);
// We can only use the vector type if the data is guaranteed to be
// aligned. The codes are innermost, so if it is evenly divisible,
// then any slice will be aligned.
if (numCodes % kWordSize == 0) {
// Load the data by float4 for efficiency, and then handle any remainder
// limitVec is the number of whole vec words we can load, in terms
// of whole blocks performing the load
constexpr int kUnroll = 2;
int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x);
limitVec *= kUnroll * blockDim.x;
LookupVecT* smemV = (LookupVecT*) smem;
LookupVecT* codesV = (LookupVecT*) codes;
for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) {
LookupVecT vals[kUnroll];
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] =
LoadStore<LookupVecT>::load(&codesV[i + j * blockDim.x]);
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
LoadStore<LookupVecT>::store(&smemV[i + j * blockDim.x], vals[j]);
}
}
// This is where we start loading the remainder that does not evenly
// fit into kUnroll x blockDim.x
int remainder = limitVec * kWordSize;
for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) {
smem[i] = codes[i];
}
} else {
// Potential unaligned load
constexpr int kUnroll = 4;
int limit = utils::roundDown(numCodes, kUnroll * blockDim.x);
int i = threadIdx.x;
for (; i < limit; i += kUnroll * blockDim.x) {
LookupT vals[kUnroll];
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] = codes[i + j * blockDim.x];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
smem[i + j * blockDim.x] = vals[j];
}
}
for (; i < numCodes; i += blockDim.x) {
smem[i] = codes[i];
}
}
}
};
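// One block scans a single (query, probe) pair's inverted list: the code
// distance table is staged in shared memory, then each thread walks the list
// with a block-wide stride, summing the table entries selected by each
// sub-quantizer byte of the packed code.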
template <int NumSubQuantizers, typename LookupT, typename LookupVecT>
__global__ void
pqScanNoPrecomputedMultiPass(Tensor<float, 2, true> queries,
Tensor<float, 3, true> pqCentroids,
Tensor<int, 2, true> topQueryToCentroid,
Tensor<LookupT, 4, true> codeDistances,
void** listCodes,
int* listLengths,
Tensor<int, 2, true> prefixSumOffsets,
Tensor<float, 1, true> distance) {
const auto codesPerSubQuantizer = pqCentroids.getSize(2);
// Where the pq code -> residual distance is stored
extern __shared__ char smemCodeDistances[];
LookupT* codeDist = (LookupT*) smemCodeDistances;
// Each block handles a single query
auto queryId = blockIdx.y;
auto probeId = blockIdx.x;
// This is where we start writing out data
// We ensure that before the array (at offset -1), there is a 0 value
int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1);
float* distanceOut = distance[outBase].data();
auto listId = topQueryToCentroid[queryId][probeId];
// Safety guard in case NaNs in input cause no list ID to be generated
if (listId == -1) {
return;
}
unsigned char* codeList = (unsigned char*) listCodes[listId];
int limit = listLengths[listId];
constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 :
(NumSubQuantizers / 4);
unsigned int code32[kNumCode32];
unsigned int nextCode32[kNumCode32];
// We double-buffer the code loading, which improves memory utilization
if (threadIdx.x < limit) {
LoadCode32<NumSubQuantizers>::load(code32, codeList, threadIdx.x);
}
LoadCodeDistances<LookupT, LookupVecT>::load(
codeDist,
codeDistances[queryId][probeId].data(),
codeDistances.getSize(2) * codeDistances.getSize(3));
// Prevent WAR dependencies
__syncthreads();
// Each thread handles one code element in the list, with a
// block-wide stride
for (int codeIndex = threadIdx.x;
codeIndex < limit;
codeIndex += blockDim.x) {
// Prefetch next codes
if (codeIndex + blockDim.x < limit) {
LoadCode32<NumSubQuantizers>::load(
nextCode32, codeList, codeIndex + blockDim.x);
}
float dist = 0.0f;
#pragma unroll
for (int word = 0; word < kNumCode32; ++word) {
constexpr int kBytesPerCode32 =
NumSubQuantizers < 4 ? NumSubQuantizers : 4;
if (kBytesPerCode32 == 1) {
auto code = code32[0];
dist = ConvertTo<float>::to(codeDist[code]);
} else {
#pragma unroll
for (int byte = 0; byte < kBytesPerCode32; ++byte) {
auto code = getByte(code32[word], byte * 8, 8);
auto offset =
codesPerSubQuantizer * (word * kBytesPerCode32 + byte);
dist += ConvertTo<float>::to(codeDist[offset + code]);
}
}
}
// Write out intermediate distance result
// We do not maintain indices here, in order to reduce global
// memory traffic. Those are recovered in the final selection step.
distanceOut[codeIndex] = dist;
// Rotate buffers
#pragma unroll
for (int word = 0; word < kNumCode32; ++word) {
code32[word] = nextCode32[word];
}
}
}
void
runMultiPassTile(Tensor<float, 2, true>& queries,
Tensor<float, 2, true>& centroids,
Tensor<float, 3, true>& pqCentroidsInnermostCode,
NoTypeTensor<4, true>& codeDistances,
Tensor<int, 2, true>& topQueryToCentroid,
bool useFloat16Lookup,
int bytesPerCode,
int numSubQuantizers,
int numSubQuantizerCodes,
thrust::device_vector<void*>& listCodes,
thrust::device_vector<void*>& listIndices,
IndicesOptions indicesOptions,
thrust::device_vector<int>& listLengths,
Tensor<char, 1, true>& thrustMem,
Tensor<int, 2, true>& prefixSumOffsets,
Tensor<float, 1, true>& allDistances,
Tensor<float, 3, true>& heapDistances,
Tensor<int, 3, true>& heapIndices,
int k,
faiss::MetricType metric,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices,
cudaStream_t stream) {
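// One query tile: compute the per-(query, probe) code distance tables, scan
// the inverted lists into raw distances, then run the two-pass k-selection.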
// We only support two metrics at the moment
FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT ||
metric == MetricType::METRIC_L2);
bool l2Distance = metric == MetricType::METRIC_L2;
// Calculate offset lengths, so we know where to write out
// intermediate results
runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
thrustMem, stream);
// Calculate residual code distances, since this is without
// precomputed codes
runPQCodeDistances(pqCentroidsInnermostCode,
queries,
centroids,
topQueryToCentroid,
codeDistances,
l2Distance,
useFloat16Lookup,
stream);
// Convert all codes to a distance, and write out (distance,
// index) values for all intermediate results
{
auto kThreadsPerBlock = 256;
auto grid = dim3(topQueryToCentroid.getSize(1),
topQueryToCentroid.getSize(0));
auto block = dim3(kThreadsPerBlock);
// pq centroid distances
auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
smem *= numSubQuantizers * numSubQuantizerCodes;
FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \
do { \
auto codeDistancesT = codeDistances.toTensor<LOOKUP_T>(); \
\
pqScanNoPrecomputedMultiPass<NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T> \
<<<grid, block, smem, stream>>>( \
queries, \
pqCentroidsInnermostCode, \
topQueryToCentroid, \
codeDistancesT, \
listCodes.data().get(), \
listLengths.data().get(), \
prefixSumOffsets, \
allDistances); \
} while (0)
#define RUN_PQ(NUM_SUB_Q) \
do { \
if (useFloat16Lookup) { \
RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \
} else { \
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
} \
} while (0)
switch (bytesPerCode) {
case 1:
RUN_PQ(1);
break;
case 2:
RUN_PQ(2);
break;
case 3:
RUN_PQ(3);
break;
case 4:
RUN_PQ(4);
break;
case 8:
RUN_PQ(8);
break;
case 12:
RUN_PQ(12);
break;
case 16:
RUN_PQ(16);
break;
case 20:
RUN_PQ(20);
break;
case 24:
RUN_PQ(24);
break;
case 28:
RUN_PQ(28);
break;
case 32:
RUN_PQ(32);
break;
case 40:
RUN_PQ(40);
break;
case 48:
RUN_PQ(48);
break;
case 56:
RUN_PQ(56);
break;
case 64:
RUN_PQ(64);
break;
case 96:
RUN_PQ(96);
break;
default:
FAISS_ASSERT(false);
break;
}
#undef RUN_PQ
#undef RUN_PQ_OPT
}
CUDA_TEST_ERROR();
// k-select the output in chunks, to increase parallelism
runPass1SelectLists(prefixSumOffsets,
allDistances,
topQueryToCentroid.getSize(1),
k,
!l2Distance, // L2 distance chooses smallest
heapDistances,
heapIndices,
stream);
// k-select final output
auto flatHeapDistances = heapDistances.downcastInner<2>();
auto flatHeapIndices = heapIndices.downcastInner<2>();
runPass2SelectLists(flatHeapDistances,
flatHeapIndices,
listIndices,
indicesOptions,
prefixSumOffsets,
topQueryToCentroid,
k,
!l2Distance, // L2 distance chooses smallest
outDistances,
outIndices,
stream);
}
void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
Tensor<float, 2, true>& centroids,
Tensor<float, 3, true>& pqCentroidsInnermostCode,
Tensor<int, 2, true>& topQueryToCentroid,
bool useFloat16Lookup,
int bytesPerCode,
int numSubQuantizers,
int numSubQuantizerCodes,
thrust::device_vector<void*>& listCodes,
thrust::device_vector<void*>& listIndices,
IndicesOptions indicesOptions,
thrust::device_vector<int>& listLengths,
int maxListLength,
int k,
faiss::MetricType metric,
// output
Tensor<float, 2, true>& outDistances,
// output
Tensor<long, 2, true>& outIndices,
GpuResources* res) {
constexpr int kMinQueryTileSize = 8;
constexpr int kMaxQueryTileSize = 128;
constexpr int kThrustMemSize = 16384;
int nprobe = topQueryToCentroid.getSize(1);
auto& mem = res->getMemoryManagerCurrentDevice();
auto stream = res->getDefaultStreamCurrentDevice();
// Make a reservation for Thrust to do its dirty work (global memory
// cross-block reduction space); hopefully this is large enough.
DeviceTensor<char, 1, true> thrustMem1(
mem, {kThrustMemSize}, stream);
DeviceTensor<char, 1, true> thrustMem2(
mem, {kThrustMemSize}, stream);
DeviceTensor<char, 1, true>* thrustMem[2] =
{&thrustMem1, &thrustMem2};
// How much temporary storage is available?
// If possible, we'd like to fit within the space available.
size_t sizeAvailable = mem.getSizeAvailable();
// We run two passes of heap selection
// This is the size of the first-level heap passes
constexpr int kNProbeSplit = 8;
int pass2Chunks = std::min(nprobe, kNProbeSplit);
size_t sizeForFirstSelectPass =
pass2Chunks * k * (sizeof(float) + sizeof(int));
// How much temporary storage we need per query
size_t sizePerQuery =
2 * // streams
((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets
nprobe * maxListLength * sizeof(float) + // allDistances
// residual distances
nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
sizeForFirstSelectPass);
int queryTileSize = (int) (sizeAvailable / sizePerQuery);
if (queryTileSize < kMinQueryTileSize) {
queryTileSize = kMinQueryTileSize;
} else if (queryTileSize > kMaxQueryTileSize) {
queryTileSize = kMaxQueryTileSize;
}
// FIXME: we should adjust queryTileSize to deal with this, since
// indexing is in int32
FAISS_ASSERT(queryTileSize * nprobe * maxListLength <
std::numeric_limits<int>::max());
// Temporary memory buffers
// Make sure there is space prior to the start which will be 0, and
// will handle the boundary condition without branches
DeviceTensor<int, 1, true> prefixSumOffsetSpace1(
mem, {queryTileSize * nprobe + 1}, stream);
DeviceTensor<int, 1, true> prefixSumOffsetSpace2(
mem, {queryTileSize * nprobe + 1}, stream);
DeviceTensor<int, 2, true> prefixSumOffsets1(
prefixSumOffsetSpace1[1].data(),
{queryTileSize, nprobe});
DeviceTensor<int, 2, true> prefixSumOffsets2(
prefixSumOffsetSpace2[1].data(),
{queryTileSize, nprobe});
DeviceTensor<int, 2, true>* prefixSumOffsets[2] =
{&prefixSumOffsets1, &prefixSumOffsets2};
// Make sure the element before prefixSumOffsets is 0, since we
// depend upon simple, boundary-less indexing to get proper results
CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(),
0,
sizeof(int),
stream));
CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(),
0,
sizeof(int),
stream));
int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float);
int totalCodeDistancesSize =
queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
codeDistanceTypeSize;
DeviceTensor<char, 1, true> codeDistances1Mem(
mem, {totalCodeDistancesSize}, stream);
NoTypeTensor<4, true> codeDistances1(
codeDistances1Mem.data(),
codeDistanceTypeSize,
{queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
DeviceTensor<char, 1, true> codeDistances2Mem(
mem, {totalCodeDistancesSize}, stream);
NoTypeTensor<4, true> codeDistances2(
codeDistances2Mem.data(),
codeDistanceTypeSize,
{queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
NoTypeTensor<4, true>* codeDistances[2] =
{&codeDistances1, &codeDistances2};
DeviceTensor<float, 1, true> allDistances1(
mem, {queryTileSize * nprobe * maxListLength}, stream);
DeviceTensor<float, 1, true> allDistances2(
mem, {queryTileSize * nprobe * maxListLength}, stream);
DeviceTensor<float, 1, true>* allDistances[2] =
{&allDistances1, &allDistances2};
DeviceTensor<float, 3, true> heapDistances1(
mem, {queryTileSize, pass2Chunks, k}, stream);
DeviceTensor<float, 3, true> heapDistances2(
mem, {queryTileSize, pass2Chunks, k}, stream);
DeviceTensor<float, 3, true>* heapDistances[2] =
{&heapDistances1, &heapDistances2};
DeviceTensor<int, 3, true> heapIndices1(
mem, {queryTileSize, pass2Chunks, k}, stream);
DeviceTensor<int, 3, true> heapIndices2(
mem, {queryTileSize, pass2Chunks, k}, stream);
DeviceTensor<int, 3, true>* heapIndices[2] =
{&heapIndices1, &heapIndices2};
auto streams = res->getAlternateStreamsCurrentDevice();
streamWait(streams, {stream});
int curStream = 0;
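// Walk the queries in tiles, alternating between the two sets of temporary
// buffers and the two alternate streams so successive tiles can proceed
// independently.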
for (int query = 0; query < queries.getSize(0); query += queryTileSize) {
int numQueriesInTile =
std::min(queryTileSize, queries.getSize(0) - query);
auto prefixSumOffsetsView =
prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile);
auto codeDistancesView =
codeDistances[curStream]->narrowOutermost(0, numQueriesInTile);
auto coarseIndicesView =
topQueryToCentroid.narrowOutermost(query, numQueriesInTile);
auto queryView =
queries.narrowOutermost(query, numQueriesInTile);
auto heapDistancesView =
heapDistances[curStream]->narrowOutermost(0, numQueriesInTile);
auto heapIndicesView =
heapIndices[curStream]->narrowOutermost(0, numQueriesInTile);
auto outDistanceView =
outDistances.narrowOutermost(query, numQueriesInTile);
auto outIndicesView =
outIndices.narrowOutermost(query, numQueriesInTile);
runMultiPassTile(queryView,
centroids,
pqCentroidsInnermostCode,
codeDistancesView,
coarseIndicesView,
useFloat16Lookup,
bytesPerCode,
numSubQuantizers,
numSubQuantizerCodes,
listCodes,
listIndices,
indicesOptions,
listLengths,
*thrustMem[curStream],
prefixSumOffsetsView,
*allDistances[curStream],
heapDistancesView,
heapIndicesView,
k,
metric,
outDistanceView,
outIndicesView,
streams[curStream]);
curStream = (curStream + 1) % 2;
}
streamWait({stream}, streams);
}
} } // namespace