11 #include "PQScanMultiPassPrecomputed.cuh"
12 #include "../GpuResources.h"
13 #include "PQCodeLoad.cuh"
14 #include "IVFUtils.cuh"
15 #include "../utils/ConversionOperators.cuh"
16 #include "../utils/DeviceTensor.cuh"
17 #include "../utils/DeviceUtils.h"
18 #include "../utils/Float16.cuh"
19 #include "../utils/LoadStoreOperators.cuh"
20 #include "../utils/MathOperators.cuh"
21 #include "../utils/StaticUtils.h"
namespace faiss { namespace gpu {
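// Loads the precomputed term 2 (per-list) and term 3 (per-query) lookup
// tables into shared memory, summing them elementwise so the scan kernel
// below only performs a single table lookup per sub-quantizer code.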
template <typename LookupT, typename LookupVecT>
inline __device__ void
loadPrecomputedTerm(LookupT* smem,
                    LookupT* term2Start,
                    LookupT* term3Start,
                    int numCodes) {
  constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT);
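  // Vectorized loads are only safe if the table size is a multiple of the
  // vector word size; the codes are innermost, so any slice is then aligned.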
  if (numCodes % kWordSize == 0) {
    constexpr int kUnroll = 2;

    // limitVec is the number of vector words loadable in whole
    // kUnroll x blockDim.x strides
    int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x);
    limitVec *= kUnroll * blockDim.x;

    LookupVecT* smemV = (LookupVecT*) smem;
    LookupVecT* term2StartV = (LookupVecT*) term2Start;
    LookupVecT* term3StartV = (LookupVecT*) term3Start;
    for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) {
      LookupVecT vals[kUnroll];

#pragma unroll
      for (int j = 0; j < kUnroll; ++j) {
        vals[j] =
          LoadStore<LookupVecT>::load(&term2StartV[i + j * blockDim.x]);
      }

#pragma unroll
      for (int j = 0; j < kUnroll; ++j) {
        LookupVecT q =
          LoadStore<LookupVecT>::load(&term3StartV[i + j * blockDim.x]);

        vals[j] = Math<LookupVecT>::add(vals[j], q);
      }

#pragma unroll
      for (int j = 0; j < kUnroll; ++j) {
        LoadStore<LookupVecT>::store(&smemV[i + j * blockDim.x], vals[j]);
      }
    }
    // Handle the remainder that does not fit a full kUnroll x blockDim.x pass
    int remainder = limitVec * kWordSize;

    for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) {
      smem[i] = Math<LookupT>::add(term2Start[i], term3Start[i]);
    }
  } else {
    // Potentially unaligned data; fall back to scalar loads
    constexpr int kUnroll = 4;

    int limit = utils::roundDown(numCodes, kUnroll * blockDim.x);

    int i = threadIdx.x;
    for (; i < limit; i += kUnroll * blockDim.x) {
      LookupT vals[kUnroll];

#pragma unroll
      for (int j = 0; j < kUnroll; ++j) {
        vals[j] = term2Start[i + j * blockDim.x];
      }

#pragma unroll
      for (int j = 0; j < kUnroll; ++j) {
        vals[j] = Math<LookupT>::add(vals[j], term3Start[i + j * blockDim.x]);
      }

#pragma unroll
      for (int j = 0; j < kUnroll; ++j) {
        smem[i + j * blockDim.x] = vals[j];
      }
    }

    for (; i < numCodes; i += blockDim.x) {
      smem[i] = Math<LookupT>::add(term2Start[i], term3Start[i]);
    }
  }
}
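// Scans the codes of one IVF list against one query using the precomputed
// term lookup tables. Each (query, probe) pair maps to one CUDA block.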
template <int NumSubQuantizers, typename LookupT, typename LookupVecT>
__global__ void
pqScanPrecomputedMultiPass(Tensor<float, 2, true> queries,
                           Tensor<float, 2, true> precompTerm1,
                           Tensor<LookupT, 3, true> precompTerm2,
                           Tensor<LookupT, 3, true> precompTerm3,
                           Tensor<int, 2, true> topQueryToCentroid,
                           void** listCodes,
                           int* listLengths,
                           Tensor<int, 2, true> prefixSumOffsets,
                           Tensor<float, 1, true> distance) {
  // Shared-memory storage for precomputed term 2 + term 3, laid out as
  // (sub quantizer)(code id)
  extern __shared__ char smemTerm23[];
  LookupT* term23 = (LookupT*) smemTerm23;

  // Each block handles a single (query, probe) pair
  auto queryId = blockIdx.y;
  auto probeId = blockIdx.x;
  auto codesPerSubQuantizer = precompTerm2.getSize(2);
  auto precompTermSize = precompTerm2.getSize(1) * codesPerSubQuantizer;

  // Where we start writing out data; the element before the array
  // (at offset -1) is guaranteed to be 0
  int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1);
  float* distanceOut = distance[outBase].data();

  auto listId = topQueryToCentroid[queryId][probeId];
  unsigned char* codeList = (unsigned char*) listCodes[listId];
  int limit = listLengths[listId];

  // Number of 32-bit words each packed code occupies
  constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 :
    (NumSubQuantizers / 4);
  unsigned int code32[kNumCode32];
  unsigned int nextCode32[kNumCode32];
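  // Double-buffer the code loads: kick off the first load here, and
  // prefetch the next stride inside the loop below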
  if (threadIdx.x < limit) {
    LoadCode32<NumSubQuantizers>::load(code32, codeList, threadIdx.x);
  }

  // Load precomputed terms: term 1 into a register, terms 2 + 3 into smem
  float term1 = precompTerm1[queryId][probeId];
  loadPrecomputedTerm<LookupT, LookupVecT>(term23,
                                           precompTerm2[listId].data(),
                                           precompTerm3[queryId].data(),
                                           precompTermSize);

  // Prevent WAR dependencies on the shared-memory table
  __syncthreads();
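  // Each thread handles one code in the list, striding by the block size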
  for (int codeIndex = threadIdx.x;
       codeIndex < limit;
       codeIndex += blockDim.x) {
    // Prefetch the next code while computing the current distance
    if (codeIndex + blockDim.x < limit) {
      LoadCode32<NumSubQuantizers>::load(
        nextCode32, codeList, codeIndex + blockDim.x);
    }
    float dist = term1;

#pragma unroll
    for (int word = 0; word < kNumCode32; ++word) {
      constexpr int kBytesPerCode32 =
        NumSubQuantizers < 4 ? NumSubQuantizers : 4;

      if (kBytesPerCode32 == 1) {
        auto code = code32[0];
        dist = ConvertTo<float>::to(term23[code]);
      } else {
#pragma unroll
        for (int byte = 0; byte < kBytesPerCode32; ++byte) {
          auto code = getByte(code32[word], byte * 8, 8);

          auto offset =
            codesPerSubQuantizer * (word * kBytesPerCode32 + byte);

          dist += ConvertTo<float>::to(term23[offset + code]);
        }
      }
    }
    // Write out the intermediate distance; the index is implicit in the
    // write position and recovered by the later select passes
    distanceOut[codeIndex] = dist;

    // Rotate the double buffer
#pragma unroll
    for (int word = 0; word < kNumCode32; ++word) {
      code32[word] = nextCode32[word];
    }
  }
}
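// Runs one tile of queries end to end: list offset calculation, the scan
// kernel above, then two k-selection passes over the intermediate distances.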
void
runMultiPassTile(Tensor<float, 2, true>& queries,
                 Tensor<float, 2, true>& precompTerm1,
                 NoTypeTensor<3, true>& precompTerm2,
                 NoTypeTensor<3, true>& precompTerm3,
                 Tensor<int, 2, true>& topQueryToCentroid,
                 bool useFloat16Lookup,
                 int bytesPerCode,
                 int numSubQuantizers,
                 int numSubQuantizerCodes,
                 thrust::device_vector<void*>& listCodes,
                 thrust::device_vector<void*>& listIndices,
                 IndicesOptions indicesOptions,
                 thrust::device_vector<int>& listLengths,
                 Tensor<char, 1, true>& thrustMem,
                 Tensor<int, 2, true>& prefixSumOffsets,
                 Tensor<float, 1, true>& allDistances,
                 Tensor<float, 3, true>& heapDistances,
                 Tensor<int, 3, true>& heapIndices,
                 int k,
                 Tensor<float, 2, true>& outDistances,
                 Tensor<long, 2, true>& outIndices,
                 cudaStream_t stream) {
  // Calculate offsets into the flat distance array for each (query, probe)
  runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
                     thrustMem, stream);
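  // Convert all scanned codes to distances, writing a result for every
  // (query, probe, code) into allDistances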
  {
    auto kThreadsPerBlock = 256;

    auto grid = dim3(topQueryToCentroid.getSize(1),
                     topQueryToCentroid.getSize(0));
    auto block = dim3(kThreadsPerBlock);

    // Shared memory for the precomputed term 2 + 3 table
    auto smem = sizeof(float);
#ifdef FAISS_USE_FLOAT16
    if (useFloat16Lookup) {
      smem = sizeof(half);
    }
#endif
    smem *= numSubQuantizers * numSubQuantizerCodes;
    FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T)                  \
    do {                                                               \
      auto precompTerm2T = precompTerm2.toTensor<LOOKUP_T>();          \
      auto precompTerm3T = precompTerm3.toTensor<LOOKUP_T>();          \
                                                                       \
      pqScanPrecomputedMultiPass<NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T>    \
        <<<grid, block, smem, stream>>>(                               \
          queries,                                                     \
          precompTerm1,                                                \
          precompTerm2T,                                               \
          precompTerm3T,                                               \
          topQueryToCentroid,                                          \
          listCodes.data().get(),                                      \
          listLengths.data().get(),                                    \
          prefixSumOffsets,                                            \
          allDistances);                                               \
    } while (0)
#ifdef FAISS_USE_FLOAT16
#define RUN_PQ(NUM_SUB_Q)                       \
    do {                                        \
      if (useFloat16Lookup) {                   \
        RUN_PQ_OPT(NUM_SUB_Q, half, Half8);     \
      } else {                                  \
        RUN_PQ_OPT(NUM_SUB_Q, float, float4);   \
      }                                         \
    } while (0)
#else
#define RUN_PQ(NUM_SUB_Q)                       \
    do {                                        \
      RUN_PQ_OPT(NUM_SUB_Q, float, float4);     \
    } while (0)
#endif // FAISS_USE_FLOAT16
    switch (bytesPerCode) {
      case 1:
        RUN_PQ(1);
        break;
      case 2:
        RUN_PQ(2);
        break;
      case 3:
        RUN_PQ(3);
        break;
      case 4:
        RUN_PQ(4);
        break;
      case 8:
        RUN_PQ(8);
        break;
      case 12:
        RUN_PQ(12);
        break;
      case 16:
        RUN_PQ(16);
        break;
      case 20:
        RUN_PQ(20);
        break;
      case 24:
        RUN_PQ(24);
        break;
      case 28:
        RUN_PQ(28);
        break;
      case 32:
        RUN_PQ(32);
        break;
      case 40:
        RUN_PQ(40);
        break;
      case 48:
        RUN_PQ(48);
        break;
      case 56:
        RUN_PQ(56);
        break;
      case 64:
        RUN_PQ(64);
        break;
      case 96:
        RUN_PQ(96);
        break;
      default:
        FAISS_ASSERT(false);
        break;
    }

#undef RUN_PQ
#undef RUN_PQ_OPT
  }

  CUDA_VERIFY(cudaGetLastError());
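  // First k-select pass: select within each chunk of probe results,
  // producing per-chunk heaps that the second pass merges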
  runPass1SelectLists(prefixSumOffsets,
                      allDistances,
                      topQueryToCentroid.getSize(1),
                      k,
                      false, // L2 distance: smallest is best
                      heapDistances,
                      heapIndices,
                      stream);

  // Second k-select pass across the per-chunk heaps, resolving final
  // result indices through the list indices
  auto flatHeapDistances = heapDistances.downcastInner<2>();
  auto flatHeapIndices = heapIndices.downcastInner<2>();

  runPass2SelectLists(flatHeapDistances,
                      flatHeapIndices,
                      listIndices,
                      indicesOptions,
                      prefixSumOffsets,
                      topQueryToCentroid,
                      k,
                      false, // L2 distance: smallest is best
                      outDistances,
                      outIndices,
                      stream);

  CUDA_VERIFY(cudaGetLastError());
}
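// Entry point: sizes temporary memory, then processes the query set in
// tiles, double-buffering the work across two streams.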
void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
                                   Tensor<float, 2, true>& precompTerm1,
                                   NoTypeTensor<3, true>& precompTerm2,
                                   NoTypeTensor<3, true>& precompTerm3,
                                   Tensor<int, 2, true>& topQueryToCentroid,
                                   bool useFloat16Lookup,
                                   int bytesPerCode,
                                   int numSubQuantizers,
                                   int numSubQuantizerCodes,
                                   thrust::device_vector<void*>& listCodes,
                                   thrust::device_vector<void*>& listIndices,
                                   IndicesOptions indicesOptions,
                                   thrust::device_vector<int>& listLengths,
                                   int maxListLength,
                                   int k,
                                   // output
                                   Tensor<float, 2, true>& outDistances,
                                   // output
                                   Tensor<long, 2, true>& outIndices,
                                   GpuResources* res) {
  constexpr int kMinQueryTileSize = 8;
  constexpr int kMaxQueryTileSize = 128;
  constexpr int kThrustMemSize = 16384;
  int nprobe = topQueryToCentroid.getSize(1);

  auto& mem = res->getMemoryManagerCurrentDevice();
  auto stream = res->getDefaultStreamCurrentDevice();

  DeviceTensor<char, 1, true> thrustMem1(
    mem, {kThrustMemSize}, stream);
  DeviceTensor<char, 1, true> thrustMem2(
    mem, {kThrustMemSize}, stream);
  DeviceTensor<char, 1, true>* thrustMem[2] =
    {&thrustMem1, &thrustMem2};
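  // Size the query tile from available temporary memory: each in-flight
  // tile needs prefix-sum offsets, the full intermediate distance array,
  // and first-pass heap storage, double-buffered across two streams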
  size_t sizeAvailable = mem.getSizeAvailable();

  // The k-selection runs in two passes; this is the chunking factor
  // for the first-level pass
  constexpr int kNProbeSplit = 8;
  int pass2Chunks = std::min(nprobe, kNProbeSplit);

  size_t sizeForFirstSelectPass =
    pass2Chunks * k * (sizeof(float) + sizeof(int));
  size_t sizePerQuery =
    2 * // # of streams
    ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets
     nprobe * maxListLength * sizeof(float) + // allDistances
     sizeForFirstSelectPass);
  int queryTileSize = (int) (sizeAvailable / sizePerQuery);

  if (queryTileSize < kMinQueryTileSize) {
    queryTileSize = kMinQueryTileSize;
  } else if (queryTileSize > kMaxQueryTileSize) {
    queryTileSize = kMaxQueryTileSize;
  }

  FAISS_ASSERT(queryTileSize * nprobe * maxListLength <=
               std::numeric_limits<int>::max());
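  // Allocate one extra leading element per offset buffer so that
  // prefixSumOffsets[0][0] can always read a 0 at offset -1, avoiding
  // a boundary branch in the scan kernel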
  DeviceTensor<int, 1, true> prefixSumOffsetSpace1(
    mem, {queryTileSize * nprobe + 1}, stream);
  DeviceTensor<int, 1, true> prefixSumOffsetSpace2(
    mem, {queryTileSize * nprobe + 1}, stream);

  DeviceTensor<int, 2, true> prefixSumOffsets1(
    prefixSumOffsetSpace1[1].data(),
    {queryTileSize, nprobe});
  DeviceTensor<int, 2, true> prefixSumOffsets2(
    prefixSumOffsetSpace2[1].data(),
    {queryTileSize, nprobe});
  DeviceTensor<int, 2, true>* prefixSumOffsets[2] =
    {&prefixSumOffsets1, &prefixSumOffsets2};
  // Zero the leading element that the kernels read at offset -1
  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(),
                              0,
                              sizeof(int),
                              stream));
  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(),
                              0,
                              sizeof(int),
                              stream));
  DeviceTensor<float, 1, true> allDistances1(
    mem, {queryTileSize * nprobe * maxListLength}, stream);
  DeviceTensor<float, 1, true> allDistances2(
    mem, {queryTileSize * nprobe * maxListLength}, stream);
  DeviceTensor<float, 1, true>* allDistances[2] =
    {&allDistances1, &allDistances2};

  DeviceTensor<float, 3, true> heapDistances1(
    mem, {queryTileSize, pass2Chunks, k}, stream);
  DeviceTensor<float, 3, true> heapDistances2(
    mem, {queryTileSize, pass2Chunks, k}, stream);
  DeviceTensor<float, 3, true>* heapDistances[2] =
    {&heapDistances1, &heapDistances2};

  DeviceTensor<int, 3, true> heapIndices1(
    mem, {queryTileSize, pass2Chunks, k}, stream);
  DeviceTensor<int, 3, true> heapIndices2(
    mem, {queryTileSize, pass2Chunks, k}, stream);
  DeviceTensor<int, 3, true>* heapIndices[2] =
    {&heapIndices1, &heapIndices2};
  auto streams = res->getAlternateStreamsCurrentDevice();
  streamWait(streams, {stream});

  int curStream = 0;
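  // Process the query set in tiles, alternating between the two stream
  // buffers so that one tile's work can overlap the next tile's setup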
  for (int query = 0; query < queries.getSize(0); query += queryTileSize) {
    int numQueriesInTile =
      std::min(queryTileSize, queries.getSize(0) - query);

    auto prefixSumOffsetsView =
      prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile);

    auto coarseIndicesView =
      topQueryToCentroid.narrowOutermost(query, numQueriesInTile);
    auto queryView =
      queries.narrowOutermost(query, numQueriesInTile);
    auto term1View =
      precompTerm1.narrowOutermost(query, numQueriesInTile);
    auto term3View =
      precompTerm3.narrowOutermost(query, numQueriesInTile);

    auto heapDistancesView =
      heapDistances[curStream]->narrowOutermost(0, numQueriesInTile);
    auto heapIndicesView =
      heapIndices[curStream]->narrowOutermost(0, numQueriesInTile);

    auto outDistanceView =
      outDistances.narrowOutermost(query, numQueriesInTile);
    auto outIndicesView =
      outIndices.narrowOutermost(query, numQueriesInTile);
    runMultiPassTile(queryView,
                     term1View,
                     precompTerm2,
                     term3View,
                     coarseIndicesView,
                     useFloat16Lookup,
                     bytesPerCode,
                     numSubQuantizers,
                     numSubQuantizerCodes,
                     listCodes,
                     listIndices,
                     indicesOptions,
                     listLengths,
                     *thrustMem[curStream],
                     prefixSumOffsetsView,
                     *allDistances[curStream],
                     heapDistancesView,
                     heapIndicesView,
                     k,
                     outDistanceView,
                     outIndicesView,
                     streams[curStream]);

    curStream = (curStream + 1) % 2;
  }

  streamWait({stream}, streams);
}

} } // namespace faiss::gpu