IVFPQ.cu
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #include "IVFPQ.cuh"
12 #include "../GpuResources.h"
13 #include "BroadcastSum.cuh"
14 #include "Distance.cuh"
15 #include "FlatIndex.cuh"
16 #include "InvertedListAppend.cuh"
17 #include "L2Norm.cuh"
18 #include "PQCodeDistances.cuh"
19 #include "PQScanMultiPassNoPrecomputed.cuh"
20 #include "PQScanMultiPassPrecomputed.cuh"
21 #include "RemapIndices.h"
22 #include "VectorResidual.cuh"
23 #include "../utils/DeviceDefs.cuh"
24 #include "../utils/DeviceUtils.h"
25 #include "../utils/HostTensor.cuh"
26 #include "../utils/MatrixMult.cuh"
27 #include "../utils/NoTypeTensor.cuh"
28 #include "../utils/Transpose.cuh"
29 #include <limits>
30 #include <thrust/host_vector.h>
31 #include <unordered_map>
32 
33 namespace faiss { namespace gpu {
34 
35 IVFPQ::IVFPQ(GpuResources* resources,
36  FlatIndex* quantizer,
37  int numSubQuantizers,
38  int bitsPerSubQuantizer,
39  float* pqCentroidData,
40  IndicesOptions indicesOptions,
41  bool useFloat16LookupTables,
42  MemorySpace space) :
43  IVFBase(resources,
44  quantizer,
45  numSubQuantizers,
46  indicesOptions,
47  space),
48  numSubQuantizers_(numSubQuantizers),
49  bitsPerSubQuantizer_(bitsPerSubQuantizer),
50  numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
51  dimPerSubQuantizer_(dim_ / numSubQuantizers),
52  precomputedCodes_(false),
53  useFloat16LookupTables_(useFloat16LookupTables) {
54  FAISS_ASSERT(pqCentroidData);
55 
56  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
57  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);
58  FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_));
59 
60 #ifndef FAISS_USE_FLOAT16
61  FAISS_ASSERT(!useFloat16LookupTables_);
62 #endif
63 
64  setPQCentroids_(pqCentroidData);
65 }
66 
67 IVFPQ::~IVFPQ() {
68 }
69 
70 
71 bool
72 IVFPQ::isSupportedPQCodeLength(int size) {
73  switch (size) {
74  case 1:
75  case 2:
76  case 3:
77  case 4:
78  case 8:
79  case 12:
80  case 16:
81  case 20:
82  case 24:
83  case 28:
84  case 32:
85  case 40:
86  case 48:
87  case 56: // only supported with float16
88  case 64: // only supported with float16
89  case 96: // only supported with float16
90  return true;
91  default:
92  return false;
93  }
94 }
95 
96 bool
97 IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
98  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
99 }
100 
101 void
102 IVFPQ::setPrecomputedCodes(bool enable) {
103  if (precomputedCodes_ != enable) {
104  precomputedCodes_ = enable;
105 
106  if (precomputedCodes_) {
107  precomputeCodes_();
108  } else {
109  // Clear out old precomputed code data
110  precomputedCode_ = std::move(DeviceTensor<float, 3, true>());
111 
112 #ifdef FAISS_USE_FLOAT16
113  precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
114 #endif
115  }
116  }
117 }
118 
119 int
120 IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
121  Tensor<long, 1, true>& indices) {
122  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
123  FAISS_ASSERT(vecs.getSize(1) == dim_);
124 
125  FAISS_ASSERT(!quantizer_->getUseFloat16());
126  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
127  auto& mem = resources_->getMemoryManagerCurrentDevice();
128  auto stream = resources_->getDefaultStreamCurrentDevice();
129 
130  // Number of valid vectors that we actually add; we return this
131  int numAdded = 0;
132 
133  // We don't actually need this
134  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
135  // We use this
136  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
137  auto listIds = listIds2d.view<1>({vecs.getSize(0)});
138 
139  quantizer_->query(vecs, 1, listDistance, listIds2d, false);
140 
141  // Copy the lists that we wish to append to back to the CPU
142  // FIXME: really this can be into pinned memory and a true async
143  // copy on a different stream; we can start the copy early, but it's
144  // tiny
145  HostTensor<int, 1, true> listIdsHost(listIds, stream);
146 
147  // Calculate the residual for each closest centroid
148  DeviceTensor<float, 2, true> residuals(
149  mem, {vecs.getSize(0), vecs.getSize(1)}, stream);
150 
151  runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
152 
153  // Residuals are in the form
154  // (vec x numSubQuantizer x dimPerSubQuantizer)
155  // transpose to
156  // (numSubQuantizer x vec x dimPerSubQuantizer)
157  auto residualsView = residuals.view<3>(
158  {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
159 
160  DeviceTensor<float, 3, true> residualsTranspose(
161  mem,
162  {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
163  stream);
164 
165  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);
166 
167  // Get the product quantizer centroids in the form
168  // (numSubQuantizer x numSubQuantizerCodes x dimPerSubQuantizer)
169  // which is pqCentroidsMiddleCode_
170 
171  // We now have a batch operation to find the top-1 distances:
172  // batch size: numSubQuantizer
173  // centroids: (numSubQuantizerCodes x dimPerSubQuantizer)
174  // residuals: (vec x dimPerSubQuantizer)
175  // => (numSubQuantizer x vec x 1)
176 
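 // Illustrative example (values not from this file): with dim_ = 64 and
 // numSubQuantizers_ = 8, each residual splits into 8 slices of
 // dimPerSubQuantizer_ = 8 floats; each slice is matched (top-1 L2) against
 // its sub-quantizer's numSubQuantizerCodes_ = 2^bitsPerSubQuantizer_
 // centroids, and the 8 winning centroid ids form the PQ code for that vector.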
177  DeviceTensor<float, 3, true> closestSubQDistance(
178  mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
179  DeviceTensor<int, 3, true> closestSubQIndex(
180  mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
181 
182  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
183  auto closestSubQDistanceView = closestSubQDistance[subQ].view();
184  auto closestSubQIndexView = closestSubQIndex[subQ].view();
185 
186  auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
187  auto residualsTransposeView = residualsTranspose[subQ].view();
188 
189  runL2Distance(resources_,
190  pqCentroidsMiddleCodeView,
191  nullptr, // no transposed storage
192  nullptr, // no precomputed norms
193  residualsTransposeView,
194  1,
195  closestSubQDistanceView,
196  closestSubQIndexView,
197  // We don't care about distances
198  true,
199  // Much larger tile size, since these vectors are a
200  // lot smaller than query vectors
201  1024);
202  }
203 
204  // Now, we have the nearest sub-q centroid for each slice of the
205  // residual vector.
206  auto closestSubQIndexView = closestSubQIndex.view<2>(
207  {numSubQuantizers_, residuals.getSize(0)});
208 
209  // Transpose this for easy use
210  DeviceTensor<int, 2, true> encodings(
211  mem, {residuals.getSize(0), numSubQuantizers_}, stream);
212 
213  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);
214 
215  // Now we add the encoded vectors to the individual lists
216  // First, make sure that there is space available for adding the new
217  // encoded vectors and indices
218 
219  // list id -> # being added
220  std::unordered_map<int, int> assignCounts;
221 
222  // vector id -> offset in list
223  // (we already have vector id -> list id in listIds)
224  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});
225 
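 // For example, if vectors 0 and 3 are both assigned to list 7 and list 7
 // currently holds 5 encoded vectors, they receive offsets 5 and 6 and
 // assignCounts[7] becomes 2, which below grows list 7's storage by exactly
 // two code entries.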
226  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
227  int listId = listIdsHost[i];
228 
229  // The vector being added could be invalid (contains NaNs, etc.)
230  if (listId < 0) {
231  listOffsetHost[i] = -1;
232  continue;
233  }
234 
235  FAISS_ASSERT(listId < numLists_);
236  ++numAdded;
237 
238  int offset = deviceListData_[listId]->size() / bytesPerVector_;
239 
240  auto it = assignCounts.find(listId);
241  if (it != assignCounts.end()) {
242  offset += it->second;
243  it->second++;
244  } else {
245  assignCounts[listId] = 1;
246  }
247 
248  listOffsetHost[i] = offset;
249  }
250 
251  // If we didn't add anything (all invalid vectors), no need to
252  // continue
253  if (numAdded == 0) {
254  return 0;
255  }
256 
257  // We need to resize the data structures for the inverted lists on
258  // the GPUs, which means that they might need reallocation, which
259  // means that their base address may change. Figure out the new base
260  // addresses, and update those in a batch on the device
261  {
262  // Resize all of the lists that we are appending to
263  for (auto& counts : assignCounts) {
264  auto& codes = deviceListData_[counts.first];
265  codes->resize(codes->size() + counts.second * bytesPerVector_,
266  stream);
267  int newNumVecs = (int) (codes->size() / bytesPerVector_);
268 
269  auto& indices = deviceListIndices_[counts.first];
270  if ((indicesOptions_ == INDICES_32_BIT) ||
271  (indicesOptions_ == INDICES_64_BIT)) {
272  size_t indexSize =
273  (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);
274 
275  indices->resize(indices->size() + counts.second * indexSize, stream);
276  } else if (indicesOptions_ == INDICES_CPU) {
277  // indices are stored on the CPU side
278  FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());
279 
280  auto& userIndices = listOffsetToUserIndex_[counts.first];
281  userIndices.resize(newNumVecs);
282  } else {
283  // indices are not stored on the GPU or CPU side
284  FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
285  }
286 
287  // This is used by the multi-pass query to decide how much scratch
288  // space to allocate for intermediate results
289  maxListLength_ = std::max(maxListLength_, newNumVecs);
290  }
291 
292  // Update all pointers and sizes on the device for lists that we
293  // appended to
294  {
295  std::vector<int> listIds(assignCounts.size());
296  int i = 0;
297  for (auto& counts : assignCounts) {
298  listIds[i++] = counts.first;
299  }
300 
301  updateDeviceListInfo_(listIds, stream);
302  }
303  }
304 
305  // If we're maintaining the indices on the CPU side, update our
306  // map. We already resized our map above.
307  if (indicesOptions_ == INDICES_CPU) {
308  // We need to maintain the indices on the CPU side
309  HostTensor<long, 1, true> hostIndices(indices, stream);
310 
311  for (int i = 0; i < hostIndices.getSize(0); ++i) {
312  int listId = listIdsHost[i];
313 
314  // The vector being added could be invalid (contains NaNs, etc.)
315  if (listId < 0) {
316  continue;
317  }
318 
319  int offset = listOffsetHost[i];
320 
321  FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
322  auto& userIndices = listOffsetToUserIndex_[listId];
323 
324  FAISS_ASSERT(offset < userIndices.size());
325  userIndices[offset] = hostIndices[i];
326  }
327  }
328 
329  // We similarly need to actually append the new encoded vectors
330  {
331  DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);
332 
333  // This kernel will handle appending each encoded vector + index to
334  // the appropriate list
335  runIVFPQInvertedListAppend(listIds,
336  listOffset,
337  encodings,
338  indices,
339  deviceListDataPointers_,
340  deviceListIndexPointers_,
341  indicesOptions_,
342  stream);
343  }
344 
345  return numAdded;
346 }
347 
348 void
349 IVFPQ::addCodeVectorsFromCpu(int listId,
350  const void* codes,
351  const long* indices,
352  size_t numVecs) {
353  // This list must already exist
354  FAISS_ASSERT(listId < deviceListData_.size());
355  auto stream = resources_->getDefaultStreamCurrentDevice();
356 
357  // If there's nothing to add, then there's nothing we have to do
358  if (numVecs == 0) {
359  return;
360  }
361 
362  size_t lengthInBytes = numVecs * bytesPerVector_;
363 
364  auto& listCodes = deviceListData_[listId];
365  auto prevCodeData = listCodes->data();
366 
367  // We only have int32 length representations on the GPU for each
368  // list; the length is in units of sizeof(char)
369  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
370  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
371  (size_t) std::numeric_limits<int>::max());
372 
373  listCodes->append((unsigned char*) codes,
374  lengthInBytes,
375  stream,
376  true /* exact reserved size */);
377 
378  // Handle the indices as well
379  addIndicesFromCpu_(listId, indices, numVecs);
380 
381  // This list address may have changed due to vector resizing, but
382  // only bother updating it on the device if it has changed
383  if (prevCodeData != listCodes->data()) {
384  deviceListDataPointers_[listId] = listCodes->data();
385  }
386 
387  // And our size has changed too
388  int listLength = listCodes->size() / bytesPerVector_;
389  deviceListLengths_[listId] = listLength;
390 
391  // We update this as well, since the multi-pass algorithm uses it
392  maxListLength_ = std::max(maxListLength_, listLength);
393 
394  // device_vector add is potentially happening on a different stream
395  // than our default stream
396  if (stream != 0) {
397  streamWait({stream}, {0});
398  }
399 }
400 
401 void
402 IVFPQ::setPQCentroids_(float* data) {
403  size_t pqSize =
404  numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;
405 
406  // Make sure the data is on the host
407  // FIXME: why are we doing this?
408  thrust::host_vector<float> hostMemory;
409  hostMemory.insert(hostMemory.end(), data, data + pqSize);
410 
411  HostTensor<float, 3, true> pqHost(
410 
412  hostMemory.data(),
413  {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
414  DeviceTensor<float, 3, true> pqDevice(
415  pqHost,
416  resources_->getDefaultStreamCurrentDevice());
417 
418  DeviceTensor<float, 3, true> pqDeviceTranspose(
419  {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
420  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose,
421  resources_->getDefaultStreamCurrentDevice());
422 
423  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);
424 
425  // Also maintain the PQ centroids in the form
426  // (sub q)(code id)(sub dim)
427  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
428  {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
429  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
430  resources_->getDefaultStreamCurrentDevice());
431 
432  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
433 }
434 
435 void
436 IVFPQ::precomputeCodes_() {
437  //
438  // d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
439  //        ---------------   ---------------------------       -------
440  //            term 1                  term 2                   term 3
441  //
442 
443  // Terms 1 and 3 are available only at query time. We compute term 2
444  // here.
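 // Derivation: writing the database vector as y = y_C + y_R (coarse
 // centroid plus PQ residual reconstruction),
 //   d = || x - y ||^2 = || (x - y_C) - y_R ||^2
 //     = || x - y_C ||^2 + || y_R ||^2 - 2 * (x - y_C | y_R)
 //     = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
 // Term 2 depends only on the coarse centroid and the PQ sub-centroid,
 // so it can be tabulated once per (coarse centroid, sub q, code id).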
445  FAISS_ASSERT(!quantizer_->getUseFloat16());
446  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
447 
448  // Compute ||y_R||^2 by treating
449  // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
450  auto pqCentroidsMiddleCodeView =
451  pqCentroidsMiddleCode_.view<2>(
452  {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
453  DeviceTensor<float, 1, true> subQuantizerNorms(
454  {numSubQuantizers_ * numSubQuantizerCodes_});
455 
456  runL2Norm(pqCentroidsMiddleCodeView, subQuantizerNorms, true,
457  resources_->getDefaultStreamCurrentDevice());
458 
459  // Compute 2 * (y_C|y_R) via batch matrix multiplication
460  // batch size (sub q) x {(centroid id)(sub dim) x (code id)(sub dim)'}
461  // => (sub q) x {(centroid id)(code id)}
462  // => (sub q)(centroid id)(code id)
463 
464  // View (centroid id)(dim) as
465  // (centroid id)(sub q)(dim)
466  // Transpose (centroid id)(sub q)(sub dim) to
467  // (sub q)(centroid id)(sub dim)
468  auto centroidView = coarseCentroids.view<3>(
469  {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
470  DeviceTensor<float, 3, true> centroidsTransposed(
471  {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});
472 
473  runTransposeAny(centroidView, 0, 1, centroidsTransposed,
474  resources_->getDefaultStreamCurrentDevice());
475 
476  DeviceTensor<float, 3, true> coarsePQProduct(
477  {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});
478 
479  runIteratedMatrixMult(coarsePQProduct, false,
480  centroidsTransposed, false,
481  pqCentroidsMiddleCode_, true,
482  2.0f, 0.0f,
483  resources_->getBlasHandleCurrentDevice(),
484  resources_->getDefaultStreamCurrentDevice());
485 
486  // Transpose (sub q)(centroid id)(code id) to
487  // (centroid id)(sub q)(code id)
488  DeviceTensor<float, 3, true> coarsePQProductTransposed(
489  {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
490  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
491  resources_->getDefaultStreamCurrentDevice());
492 
493  // View (centroid id)(sub q)(code id) as
494  // (centroid id)(sub q * code id)
495  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
496  {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});
497 
498  // Sum || y_R ||^2 + 2 * (y_C|y_R)
499  // i.e., add norms (sub q * code id)
500  // along columns of inner product (centroid id)(sub q * code id)
501  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
502  resources_->getDefaultStreamCurrentDevice());
503 
504 #ifdef FAISS_USE_FLOAT16
505  if (useFloat16LookupTables_) {
506  precomputedCodeHalf_ = toHalf(resources_,
507  resources_->getDefaultStreamCurrentDevice(),
508  coarsePQProductTransposed);
509  return;
510  }
511 #endif
512 
513  // We added into the view, so `coarsePQProductTransposed` is now our
514  // precomputed term 2.
515  precomputedCode_ = std::move(coarsePQProductTransposed);
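 // precomputedCode_ has shape
 // (num coarse centroids) x numSubQuantizers_ x numSubQuantizerCodes_,
 // so at query time each probed coarse centroid selects its
 // (sub q)(code id) slice as term 2.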
516 }
517 
518 void
519 IVFPQ::query(Tensor<float, 2, true>& queries,
520  int nprobe,
521  int k,
522  Tensor<float, 2, true>& outDistances,
523  Tensor<long, 2, true>& outIndices) {
524  // Validate these at a top level
525  FAISS_ASSERT(nprobe <= 1024);
526  FAISS_ASSERT(k <= 1024);
527 
528  auto& mem = resources_->getMemoryManagerCurrentDevice();
529  auto stream = resources_->getDefaultStreamCurrentDevice();
530  nprobe = std::min(nprobe, quantizer_->getSize());
531 
532  FAISS_ASSERT(queries.getSize(1) == dim_);
533  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
534  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
535 
536  // Reserve space for the closest coarse centroids
537  DeviceTensor<float, 2, true>
538  coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
539  DeviceTensor<int, 2, true>
540  coarseIndices(mem, {queries.getSize(0), nprobe}, stream);
541 
542  // Find the `nprobe` closest coarse centroids; we can use int
543  // indices both internally and externally
544  quantizer_->query(queries,
545  nprobe,
546  coarseDistances,
547  coarseIndices,
548  true);
549 
550  if (precomputedCodes_) {
551  runPQPrecomputedCodes_(queries,
552  coarseDistances,
553  coarseIndices,
554  k,
555  outDistances,
556  outIndices);
557  } else {
558  runPQNoPrecomputedCodes_(queries,
559  coarseDistances,
560  coarseIndices,
561  k,
562  outDistances,
563  outIndices);
564  }
565 
566  // If the GPU isn't storing indices (they are on the CPU side), we
567  // need to perform the re-mapping here
568  // FIXME: we might ultimately be calling this function with inputs
569  // from the CPU, these are unnecessary copies
570  if (indicesOptions_ == INDICES_CPU) {
571  HostTensor<long, 2, true> hostOutIndices(outIndices, stream);
572 
573  ivfOffsetToUserIndex(hostOutIndices.data(),
574  numLists_,
575  hostOutIndices.getSize(0),
576  hostOutIndices.getSize(1),
577  listOffsetToUserIndex_);
578 
579  // Copy back to GPU, since the input to this function is on the
580  // GPU
581  outIndices.copyFrom(hostOutIndices, stream);
582  }
583 }
584 
585 std::vector<unsigned char>
586 IVFPQ::getListCodes(int listId) const {
587  FAISS_ASSERT(listId < deviceListData_.size());
588 
589  return deviceListData_[listId]->copyToHost<unsigned char>(
590  resources_->getDefaultStreamCurrentDevice());
591 }
592 
593 Tensor<float, 3, true>
594 IVFPQ::getPQCentroids() {
595  return pqCentroidsMiddleCode_;
596 }
597 
598 void
599 IVFPQ::runPQPrecomputedCodes_(
600  Tensor<float, 2, true>& queries,
601  DeviceTensor<float, 2, true>& coarseDistances,
602  DeviceTensor<int, 2, true>& coarseIndices,
603  int k,
604  Tensor<float, 2, true>& outDistances,
605  Tensor<long, 2, true>& outIndices) {
606  auto& mem = resources_->getMemoryManagerCurrentDevice();
607  auto stream = resources_->getDefaultStreamCurrentDevice();
608 
609  // Compute precomputed code term 3, - 2 * (x|y_R)
610  // This is done via batch MM
611  // {sub q} x {(query id)(sub dim) * (code id)(sub dim)'} =>
612  // {sub q} x {(query id)(code id)}
613  DeviceTensor<float, 3, true> term3Transposed(
614  mem,
615  {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
616  stream);
617 
618  // These allocations within are only temporary, so release them when
619  // we're done to maximize free space
620  {
621  auto querySubQuantizerView = queries.view<3>(
622  {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
623  DeviceTensor<float, 3, true> queriesTransposed(
624  mem,
625  {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
626  stream);
627  runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);
628 
629  DeviceTensor<float, 3, true> term3(
630  mem,
631  {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
632  stream);
633 
634  runIteratedMatrixMult(term3, false,
635  queriesTransposed, false,
636  pqCentroidsMiddleCode_, true,
637  -2.0f, 0.0f,
638  resources_->getBlasHandleCurrentDevice(),
639  stream);
640 
641  runTransposeAny(term3, 0, 1, term3Transposed, stream);
642  }
643 
644  NoTypeTensor<3, true> term2;
645  NoTypeTensor<3, true> term3;
646 #ifdef FAISS_USE_FLOAT16
647  DeviceTensor<half, 3, true> term3Half;
648 
649  if (useFloat16LookupTables_) {
650  term3Half = toHalf(resources_, stream, term3Transposed);
651  term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
652  term3 = NoTypeTensor<3, true>(term3Half);
653  }
654 #endif
655 
656  if (!useFloat16LookupTables_) {
657  term2 = NoTypeTensor<3, true>(precomputedCode_);
658  term3 = NoTypeTensor<3, true>(term3Transposed);
659  }
660 
661  runPQScanMultiPassPrecomputed(queries,
662  coarseDistances, // term 1
663  term2, // term 2
664  term3, // term 3
665  coarseIndices,
666  useFloat16LookupTables_,
667  bytesPerVector_,
668  numSubQuantizers_,
669  numSubQuantizerCodes_,
670  deviceListDataPointers_,
671  deviceListIndexPointers_,
672  indicesOptions_,
673  deviceListLengths_,
674  maxListLength_,
675  k,
676  outDistances,
677  outIndices,
678  resources_);
679 }
680 
681 void
682 IVFPQ::runPQNoPrecomputedCodes_(
683  Tensor<float, 2, true>& queries,
684  DeviceTensor<float, 2, true>& coarseDistances,
685  DeviceTensor<int, 2, true>& coarseIndices,
686  int k,
687  Tensor<float, 2, true>& outDistances,
688  Tensor<long, 2, true>& outIndices) {
689  FAISS_ASSERT(!quantizer_->getUseFloat16());
690  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
691 
692  runPQScanMultiPassNoPrecomputed(queries,
693  coarseCentroids,
694  pqCentroidsInnermostCode_,
695  coarseIndices,
696  useFloat16LookupTables_,
697  bytesPerVector_,
698  numSubQuantizers_,
699  numSubQuantizerCodes_,
700  deviceListDataPointers_,
701  deviceListIndexPointers_,
702  indicesOptions_,
703  deviceListLengths_,
704  maxListLength_,
705  k,
706  outDistances,
707  outIndices,
708  resources_);
709 }
710 
711 } } // namespace