Faiss
PQScanMultiPassNoPrecomputed.cu
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "PQScanMultiPassNoPrecomputed.cuh"
#include "../GpuResources.h"
#include "PQCodeDistances.cuh"
#include "PQCodeLoad.cuh"
#include "IVFUtils.cuh"
#include "../utils/ConversionOperators.cuh"
#include "../utils/DeviceTensor.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Float16.cuh"
#include "../utils/LoadStoreOperators.cuh"
#include "../utils/NoTypeTensor.cuh"
#include "../utils/StaticUtils.h"

#include "../utils/HostTensor.cuh"

namespace faiss { namespace gpu {

bool isSupportedNoPrecomputedSubDimSize(int dims) {
  switch (dims) {
    case 1:
    case 2:
    case 3:
    case 4:
    case 6:
    case 8:
    case 10:
    case 12:
    case 16:
    case 32:
      return true;
      break;
    default:
      // FIXME: larger sizes require too many registers - we need the
      // MM implementation working
      return false;
      break;
  }
}
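
// As a rough worked example: `dims` here is the per-sub-quantizer
// dimensionality, i.e. d / numSubQuantizers. Assuming d = 128 and 8
// sub-quantizers, each sub-quantizer covers 128 / 8 = 16 dimensions, which
// is in the supported list above; assuming d = 256 and 4 sub-quantizers,
// each covers 64 dimensions, which falls through to the unsupported default.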

template <typename LookupT, typename LookupVecT>
struct LoadCodeDistances {
  static inline __device__ void load(LookupT* smem,
                                     LookupT* codes,
                                     int numCodes) {
    constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT);

    // We can only use the vector type if the data is guaranteed to be
    // aligned. The codes are innermost, so if it is evenly divisible,
    // then any slice will be aligned.
    if (numCodes % kWordSize == 0) {
      // Load the data by float4 for efficiency, and then handle any remainder
      // limitVec is the number of whole vec words we can load, in terms
      // of whole blocks performing the load
      constexpr int kUnroll = 2;
      int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x);
      limitVec *= kUnroll * blockDim.x;

      LookupVecT* smemV = (LookupVecT*) smem;
      LookupVecT* codesV = (LookupVecT*) codes;

      for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) {
        LookupVecT vals[kUnroll];

#pragma unroll
        for (int j = 0; j < kUnroll; ++j) {
          vals[j] =
            LoadStore<LookupVecT>::load(&codesV[i + j * blockDim.x]);
        }

#pragma unroll
        for (int j = 0; j < kUnroll; ++j) {
          LoadStore<LookupVecT>::store(&smemV[i + j * blockDim.x], vals[j]);
        }
      }

      // This is where we start loading the remainder that does not evenly
      // fit into kUnroll x blockDim.x
      int remainder = limitVec * kWordSize;

      for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) {
        smem[i] = codes[i];
      }
    } else {
      // Potential unaligned load
      constexpr int kUnroll = 4;

      int limit = utils::roundDown(numCodes, kUnroll * blockDim.x);

      int i = threadIdx.x;
      for (; i < limit; i += kUnroll * blockDim.x) {
        LookupT vals[kUnroll];

#pragma unroll
        for (int j = 0; j < kUnroll; ++j) {
          vals[j] = codes[i + j * blockDim.x];
        }

#pragma unroll
        for (int j = 0; j < kUnroll; ++j) {
          smem[i + j * blockDim.x] = vals[j];
        }
      }

      for (; i < numCodes; i += blockDim.x) {
        smem[i] = codes[i];
      }
    }
  }
};
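
// Rough worked example for the vectorized path above, assuming
// LookupT = float, LookupVecT = float4 (so kWordSize = 4), blockDim.x = 256
// and numCodes = 8 sub-quantizers * 256 codes = 2048:
//   limitVec = 2048 / (2 * 4 * 256) = 1, then limitVec *= 2 * 256 -> 512,
// so the unrolled loop moves 512 float4 words (all 2048 floats), and the
// scalar remainder loop starting at remainder = 512 * 4 = 2048 has no work
// left. If numCodes were not a multiple of kWordSize, the unaligned branch
// would instead copy element by element.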

template <int NumSubQuantizers, typename LookupT, typename LookupVecT>
__global__ void
pqScanNoPrecomputedMultiPass(Tensor<float, 2, true> queries,
                             Tensor<float, 3, true> pqCentroids,
                             Tensor<int, 2, true> topQueryToCentroid,
                             Tensor<LookupT, 4, true> codeDistances,
                             void** listCodes,
                             int* listLengths,
                             Tensor<int, 2, true> prefixSumOffsets,
                             Tensor<float, 1, true> distance) {
  const auto codesPerSubQuantizer = pqCentroids.getSize(2);

  // Where the pq code -> residual distance is stored
  extern __shared__ char smemCodeDistances[];
  LookupT* codeDist = (LookupT*) smemCodeDistances;

  // Each block handles a single query
  auto queryId = blockIdx.y;
  auto probeId = blockIdx.x;

  // This is where we start writing out data
  // We ensure that before the array (at offset -1), there is a 0 value
  int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1);
  float* distanceOut = distance[outBase].data();

  auto listId = topQueryToCentroid[queryId][probeId];
  // Safety guard in case NaNs in input cause no list ID to be generated
  if (listId == -1) {
    return;
  }

  unsigned char* codeList = (unsigned char*) listCodes[listId];
  int limit = listLengths[listId];

  constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 :
    (NumSubQuantizers / 4);
  unsigned int code32[kNumCode32];
  unsigned int nextCode32[kNumCode32];

  // We double-buffer the code loading, which improves memory utilization
  if (threadIdx.x < limit) {
    LoadCode32<NumSubQuantizers>::load(code32, codeList, threadIdx.x);
  }

  LoadCodeDistances<LookupT, LookupVecT>::load(
    codeDist,
    codeDistances[queryId][probeId].data(),
    codeDistances.getSize(2) * codeDistances.getSize(3));
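
  // The shared-memory table loaded above is laid out as
  // [numSubQuantizers][codesPerSubQuantizer]: entry (sq, c) holds the
  // distance contribution of code c under sub-quantizer sq for this
  // (query, probe) residual. As a rough size example, assuming 8
  // sub-quantizers with 256 codes each stored as float, that is
  // 8 * 256 * 4 bytes = 8 KiB of shared memory per block.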

  // Prevent WAR dependencies
  __syncthreads();

  // Each thread handles one code element in the list, with a
  // block-wide stride
  for (int codeIndex = threadIdx.x;
       codeIndex < limit;
       codeIndex += blockDim.x) {
    // Prefetch next codes
    if (codeIndex + blockDim.x < limit) {
      LoadCode32<NumSubQuantizers>::load(
        nextCode32, codeList, codeIndex + blockDim.x);
    }

    float dist = 0.0f;

#pragma unroll
    for (int word = 0; word < kNumCode32; ++word) {
      constexpr int kBytesPerCode32 =
        NumSubQuantizers < 4 ? NumSubQuantizers : 4;

      if (kBytesPerCode32 == 1) {
        auto code = code32[0];
        dist = ConvertTo<float>::to(codeDist[code]);

      } else {
#pragma unroll
        for (int byte = 0; byte < kBytesPerCode32; ++byte) {
          auto code = getByte(code32[word], byte * 8, 8);

          auto offset =
            codesPerSubQuantizer * (word * kBytesPerCode32 + byte);

          dist += ConvertTo<float>::to(codeDist[offset + code]);
        }
      }
    }

    // Write out intermediate distance result
    // We do not maintain indices here, in order to reduce global
    // memory traffic. Those are recovered in the final selection step.
    distanceOut[codeIndex] = dist;

    // Rotate buffers
#pragma unroll
    for (int word = 0; word < kNumCode32; ++word) {
      code32[word] = nextCode32[word];
    }
  }
}
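
// Rough worked example for the kernel above, assuming NumSubQuantizers = 8
// and 256 codes per sub-quantizer: each encoded vector is 8 bytes, held in
// kNumCode32 = 2 packed 32-bit words. Byte b of word w is the code for
// sub-quantizer (w * 4 + b), so the table lookup offset is
// 256 * (w * 4 + b) + code, and the reported distance is the sum of the 8
// table entries. The nextCode32 buffer prefetches the codes for
// codeIndex + blockDim.x while the current distance is being accumulated,
// which is what the "double-buffer" comment refers to.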

void
runMultiPassTile(Tensor<float, 2, true>& queries,
                 Tensor<float, 2, true>& centroids,
                 Tensor<float, 3, true>& pqCentroidsInnermostCode,
                 NoTypeTensor<4, true>& codeDistances,
                 Tensor<int, 2, true>& topQueryToCentroid,
                 bool useFloat16Lookup,
                 int bytesPerCode,
                 int numSubQuantizers,
                 int numSubQuantizerCodes,
                 thrust::device_vector<void*>& listCodes,
                 thrust::device_vector<void*>& listIndices,
                 IndicesOptions indicesOptions,
                 thrust::device_vector<int>& listLengths,
                 Tensor<char, 1, true>& thrustMem,
                 Tensor<int, 2, true>& prefixSumOffsets,
                 Tensor<float, 1, true>& allDistances,
                 Tensor<float, 3, true>& heapDistances,
                 Tensor<int, 3, true>& heapIndices,
                 int k,
                 Tensor<float, 2, true>& outDistances,
                 Tensor<long, 2, true>& outIndices,
                 cudaStream_t stream) {
#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT(!useFloat16Lookup);
#endif

  // Calculate offset lengths, so we know where to write out
  // intermediate results
  runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
                     thrustMem, stream);

  // Calculate residual code distances, since this is without
  // precomputed codes
  runPQCodeDistances(pqCentroidsInnermostCode,
                     queries,
                     centroids,
                     topQueryToCentroid,
                     codeDistances,
                     useFloat16Lookup,
                     stream);

  // Convert all codes to a distance, and write out (distance,
  // index) values for all intermediate results
  {
    auto kThreadsPerBlock = 256;

    auto grid = dim3(topQueryToCentroid.getSize(1),
                     topQueryToCentroid.getSize(0));
    auto block = dim3(kThreadsPerBlock);

    // pq centroid distances
    auto smem = sizeof(float);
#ifdef FAISS_USE_FLOAT16
    if (useFloat16Lookup) {
      smem = sizeof(half);
    }
#endif
    smem *= numSubQuantizers * numSubQuantizerCodes;
    FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
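
    // Rough sizing example, assuming a device with 48 KiB of shared memory
    // per block: 32 sub-quantizers * 256 codes * sizeof(float) = 32 KiB
    // fits, while 64 sub-quantizers * 256 codes * sizeof(float) = 64 KiB
    // does not and would trip the assertion above unless float16 lookup
    // (32 KiB) is enabled.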

#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T)                  \
    do {                                                               \
      auto codeDistancesT = codeDistances.toTensor<LOOKUP_T>();        \
                                                                       \
      pqScanNoPrecomputedMultiPass<NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T>  \
        <<<grid, block, smem, stream>>>(                               \
          queries,                                                     \
          pqCentroidsInnermostCode,                                    \
          topQueryToCentroid,                                          \
          codeDistancesT,                                              \
          listCodes.data().get(),                                      \
          listLengths.data().get(),                                    \
          prefixSumOffsets,                                            \
          allDistances);                                               \
    } while (0)

#ifdef FAISS_USE_FLOAT16
#define RUN_PQ(NUM_SUB_Q)                       \
    do {                                        \
      if (useFloat16Lookup) {                   \
        RUN_PQ_OPT(NUM_SUB_Q, half, Half8);     \
      } else {                                  \
        RUN_PQ_OPT(NUM_SUB_Q, float, float4);   \
      }                                         \
    } while (0)
#else
#define RUN_PQ(NUM_SUB_Q)                       \
    do {                                        \
      RUN_PQ_OPT(NUM_SUB_Q, float, float4);     \
    } while (0)
#endif // FAISS_USE_FLOAT16
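
    // For instance, RUN_PQ(8) with float16 lookups disabled expands to
    // RUN_PQ_OPT(8, float, float4), which launches
    // pqScanNoPrecomputedMultiPass<8, float, float4> on a
    // (nprobe x numQueries) grid with the shared-memory size computed above;
    // the switch below simply maps the runtime bytesPerCode onto one of the
    // instantiated template sizes.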

    switch (bytesPerCode) {
      case 1:
        RUN_PQ(1);
        break;
      case 2:
        RUN_PQ(2);
        break;
      case 3:
        RUN_PQ(3);
        break;
      case 4:
        RUN_PQ(4);
        break;
      case 8:
        RUN_PQ(8);
        break;
      case 12:
        RUN_PQ(12);
        break;
      case 16:
        RUN_PQ(16);
        break;
      case 20:
        RUN_PQ(20);
        break;
      case 24:
        RUN_PQ(24);
        break;
      case 28:
        RUN_PQ(28);
        break;
      case 32:
        RUN_PQ(32);
        break;
      case 40:
        RUN_PQ(40);
        break;
      case 48:
        RUN_PQ(48);
        break;
      case 56:
        RUN_PQ(56);
        break;
      case 64:
        RUN_PQ(64);
        break;
      case 96:
        RUN_PQ(96);
        break;
      default:
        FAISS_ASSERT(false);
        break;
    }

#undef RUN_PQ
#undef RUN_PQ_OPT
  }

  // k-select the output in chunks, to increase parallelism
  runPass1SelectLists(prefixSumOffsets,
                      allDistances,
                      topQueryToCentroid.getSize(1),
                      k,
                      false, // L2 distance chooses smallest
                      heapDistances,
                      heapIndices,
                      stream);
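
  // With e.g. nprobe = 32 and k = 10, this first pass splits each query's 32
  // probed lists into pass2Chunks = 8 chunks and k-selects within each chunk,
  // leaving 8 * 10 = 80 candidates per query; the second pass below then
  // reduces those to the final k results and recovers the user indices.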

  // k-select final output
  auto flatHeapDistances = heapDistances.downcastInner<2>();
  auto flatHeapIndices = heapIndices.downcastInner<2>();

  runPass2SelectLists(flatHeapDistances,
                      flatHeapIndices,
                      listIndices,
                      indicesOptions,
                      prefixSumOffsets,
                      topQueryToCentroid,
                      k,
                      false, // L2 distance chooses smallest
                      outDistances,
                      outIndices,
                      stream);

  CUDA_VERIFY(cudaGetLastError());
}

void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                                     Tensor<float, 2, true>& centroids,
                                     Tensor<float, 3, true>& pqCentroidsInnermostCode,
                                     Tensor<int, 2, true>& topQueryToCentroid,
                                     bool useFloat16Lookup,
                                     int bytesPerCode,
                                     int numSubQuantizers,
                                     int numSubQuantizerCodes,
                                     thrust::device_vector<void*>& listCodes,
                                     thrust::device_vector<void*>& listIndices,
                                     IndicesOptions indicesOptions,
                                     thrust::device_vector<int>& listLengths,
                                     int maxListLength,
                                     int k,
                                     // output
                                     Tensor<float, 2, true>& outDistances,
                                     // output
                                     Tensor<long, 2, true>& outIndices,
                                     GpuResources* res) {
  constexpr int kMinQueryTileSize = 8;
  constexpr int kMaxQueryTileSize = 128;
  constexpr int kThrustMemSize = 16384;

  int nprobe = topQueryToCentroid.getSize(1);

  auto& mem = res->getMemoryManagerCurrentDevice();
  auto stream = res->getDefaultStreamCurrentDevice();

  // Make a reservation for Thrust to do its dirty work (global memory
  // cross-block reduction space); hopefully this is large enough.
  DeviceTensor<char, 1, true> thrustMem1(
    mem, {kThrustMemSize}, stream);
  DeviceTensor<char, 1, true> thrustMem2(
    mem, {kThrustMemSize}, stream);
  DeviceTensor<char, 1, true>* thrustMem[2] =
    {&thrustMem1, &thrustMem2};

  // How much temporary storage is available?
  // If possible, we'd like to fit within the space available.
  size_t sizeAvailable = mem.getSizeAvailable();

  // We run two passes of heap selection
  // This is the size of the first-level heap passes
  constexpr int kNProbeSplit = 8;
  int pass2Chunks = std::min(nprobe, kNProbeSplit);

  size_t sizeForFirstSelectPass =
    pass2Chunks * k * (sizeof(float) + sizeof(int));

  // How much temporary storage we need per query
  size_t sizePerQuery =
    2 * // streams
    ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets
     nprobe * maxListLength * sizeof(float) + // allDistances
     // residual distances
     nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
     sizeForFirstSelectPass);

  int queryTileSize = (int) (sizeAvailable / sizePerQuery);

  if (queryTileSize < kMinQueryTileSize) {
    queryTileSize = kMinQueryTileSize;
  } else if (queryTileSize > kMaxQueryTileSize) {
    queryTileSize = kMaxQueryTileSize;
  }

  // FIXME: we should adjust queryTileSize to deal with this, since
  // indexing is in int32
  FAISS_ASSERT(queryTileSize * nprobe * maxListLength <
               std::numeric_limits<int>::max());
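
  // Rough worked example, assuming nprobe = 32, maxListLength = 10000,
  // 8 sub-quantizers with 256 codes each, and k = 10 (so pass2Chunks = 8):
  //   sizeForFirstSelectPass = 8 * 10 * 8 bytes = 640 bytes
  //   sizePerQuery = 2 * (132 + 1,280,000 + 262,144 + 640) bytes ~= 2.9 MiB
  // so roughly 512 MiB of scratch space would allow a tile of ~173 queries,
  // clamped to kMaxQueryTileSize = 128; a very small reservation is clamped
  // up to kMinQueryTileSize = 8.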

  // Temporary memory buffers
  // Make sure there is space prior to the start which will be 0, and
  // will handle the boundary condition without branches
  DeviceTensor<int, 1, true> prefixSumOffsetSpace1(
    mem, {queryTileSize * nprobe + 1}, stream);
  DeviceTensor<int, 1, true> prefixSumOffsetSpace2(
    mem, {queryTileSize * nprobe + 1}, stream);

  DeviceTensor<int, 2, true> prefixSumOffsets1(
    prefixSumOffsetSpace1[1].data(),
    {queryTileSize, nprobe});
  DeviceTensor<int, 2, true> prefixSumOffsets2(
    prefixSumOffsetSpace2[1].data(),
    {queryTileSize, nprobe});
  DeviceTensor<int, 2, true>* prefixSumOffsets[2] =
    {&prefixSumOffsets1, &prefixSumOffsets2};

  // Make sure the element before prefixSumOffsets is 0, since we
  // depend upon simple, boundary-less indexing to get proper results
  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(),
                              0,
                              sizeof(int),
                              stream));
  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(),
                              0,
                              sizeof(int),
                              stream));
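
  // Layout note: the extra leading element (index 0 of the space, zeroed by
  // the memsets above) sits just before the 2-D view, so the kernel's read
  // at prefixSumOffsets[query][probe].data() - 1 is always valid.
  // Illustratively, assuming 1 query, nprobe = 3 and probed list lengths
  // {3, 5, 2}, the scanned buffer is [0 | 3, 8, 10], giving write offsets
  // 0, 3 and 8 for the three lists.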

  int codeDistanceTypeSize = sizeof(float);
#ifdef FAISS_USE_FLOAT16
  if (useFloat16Lookup) {
    codeDistanceTypeSize = sizeof(half);
  }
#else
  FAISS_ASSERT(!useFloat16Lookup);
  int codeSize = sizeof(float);
#endif

  int totalCodeDistancesSize =
    queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
    codeDistanceTypeSize;

  DeviceTensor<char, 1, true> codeDistances1Mem(
    mem, {totalCodeDistancesSize}, stream);
  NoTypeTensor<4, true> codeDistances1(
    codeDistances1Mem.data(),
    codeDistanceTypeSize,
    {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});

  DeviceTensor<char, 1, true> codeDistances2Mem(
    mem, {totalCodeDistancesSize}, stream);
  NoTypeTensor<4, true> codeDistances2(
    codeDistances2Mem.data(),
    codeDistanceTypeSize,
    {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});

  NoTypeTensor<4, true>* codeDistances[2] =
    {&codeDistances1, &codeDistances2};

  DeviceTensor<float, 1, true> allDistances1(
    mem, {queryTileSize * nprobe * maxListLength}, stream);
  DeviceTensor<float, 1, true> allDistances2(
    mem, {queryTileSize * nprobe * maxListLength}, stream);
  DeviceTensor<float, 1, true>* allDistances[2] =
    {&allDistances1, &allDistances2};

  DeviceTensor<float, 3, true> heapDistances1(
    mem, {queryTileSize, pass2Chunks, k}, stream);
  DeviceTensor<float, 3, true> heapDistances2(
    mem, {queryTileSize, pass2Chunks, k}, stream);
  DeviceTensor<float, 3, true>* heapDistances[2] =
    {&heapDistances1, &heapDistances2};

  DeviceTensor<int, 3, true> heapIndices1(
    mem, {queryTileSize, pass2Chunks, k}, stream);
  DeviceTensor<int, 3, true> heapIndices2(
    mem, {queryTileSize, pass2Chunks, k}, stream);
  DeviceTensor<int, 3, true>* heapIndices[2] =
    {&heapIndices1, &heapIndices2};

  auto streams = res->getAlternateStreamsCurrentDevice();
  streamWait(streams, {stream});

  int curStream = 0;
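
  // Tiles ping-pong between the two buffer sets and the two alternate
  // streams: tile 0 runs on streams[0] with the *1 buffers, tile 1 on
  // streams[1] with the *2 buffers, and so on, so successive tiles can
  // overlap. For example, assuming 1000 queries and queryTileSize = 128,
  // the loop below issues 8 tiles (the last with 104 queries), alternating
  // between the two sets. The streamWait calls before and after the loop
  // order this work relative to the caller's default stream.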

  for (int query = 0; query < queries.getSize(0); query += queryTileSize) {
    int numQueriesInTile =
      std::min(queryTileSize, queries.getSize(0) - query);

    auto prefixSumOffsetsView =
      prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile);

    auto codeDistancesView =
      codeDistances[curStream]->narrowOutermost(0, numQueriesInTile);
    auto coarseIndicesView =
      topQueryToCentroid.narrowOutermost(query, numQueriesInTile);
    auto queryView =
      queries.narrowOutermost(query, numQueriesInTile);

    auto heapDistancesView =
      heapDistances[curStream]->narrowOutermost(0, numQueriesInTile);
    auto heapIndicesView =
      heapIndices[curStream]->narrowOutermost(0, numQueriesInTile);

    auto outDistanceView =
      outDistances.narrowOutermost(query, numQueriesInTile);
    auto outIndicesView =
      outIndices.narrowOutermost(query, numQueriesInTile);

    runMultiPassTile(queryView,
                     centroids,
                     pqCentroidsInnermostCode,
                     codeDistancesView,
                     coarseIndicesView,
                     useFloat16Lookup,
                     bytesPerCode,
                     numSubQuantizers,
                     numSubQuantizerCodes,
                     listCodes,
                     listIndices,
                     indicesOptions,
                     listLengths,
                     *thrustMem[curStream],
                     prefixSumOffsetsView,
                     *allDistances[curStream],
                     heapDistancesView,
                     heapIndicesView,
                     k,
                     outDistanceView,
                     outIndicesView,
                     streams[curStream]);

    curStream = (curStream + 1) % 2;
  }

  streamWait({stream}, streams);
}

} } // namespace