Faiss
IVFPQ.cu
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved.
11 
12 #include "IVFPQ.cuh"
13 #include "../GpuResources.h"
14 #include "BroadcastSum.cuh"
15 #include "Distance.cuh"
16 #include "FlatIndex.cuh"
17 #include "InvertedListAppend.cuh"
18 #include "L2Norm.cuh"
19 #include "PQCodeDistances.cuh"
20 #include "PQScanMultiPassNoPrecomputed.cuh"
21 #include "PQScanMultiPassPrecomputed.cuh"
22 #include "RemapIndices.h"
23 #include "VectorResidual.cuh"
24 #include "../utils/DeviceDefs.cuh"
25 #include "../utils/DeviceUtils.h"
26 #include "../utils/HostTensor.cuh"
27 #include "../utils/MatrixMult.cuh"
28 #include "../utils/NoTypeTensor.cuh"
29 #include "../utils/Transpose.cuh"
30 #include <limits>
31 #include <thrust/host_vector.h>
32 #include <unordered_map>
33 
34 namespace faiss { namespace gpu {
35 
36 IVFPQ::IVFPQ(GpuResources* resources,
37  FlatIndex* quantizer,
38  int numSubQuantizers,
39  int bitsPerSubQuantizer,
40  float* pqCentroidData,
41  IndicesOptions indicesOptions,
42  bool useFloat16LookupTables) :
43  IVFBase(resources,
44  quantizer,
45  numSubQuantizers,
46  indicesOptions),
47  numSubQuantizers_(numSubQuantizers),
48  bitsPerSubQuantizer_(bitsPerSubQuantizer),
49  numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
50  dimPerSubQuantizer_(dim_ / numSubQuantizers),
51  precomputedCodes_(false),
52  useFloat16LookupTables_(useFloat16LookupTables) {
53  FAISS_ASSERT(pqCentroidData);
54 
55  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
56  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);
57  FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_));
58 
59 #ifndef FAISS_USE_FLOAT16
60  FAISS_ASSERT(!useFloat16LookupTables_);
61 #endif
62 
63  setPQCentroids_(pqCentroidData);
64 }
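// Note: pqCentroidData is interpreted as a row-major host array of shape
// (numSubQuantizers x numSubQuantizerCodes x dimPerSubQuantizer); the value
// for (subQ, code, subDim) is read from
//   pqCentroidData[(subQ * numSubQuantizerCodes + code) * dimPerSubQuantizer
//                  + subDim]
// which is the layout consumed by setPQCentroids_() below.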
65 
66 IVFPQ::~IVFPQ() {
67 }
68 
69 
70 bool
71 IVFPQ::isSupportedPQCodeLength(int size) {
72  switch (size) {
73  case 1:
74  case 2:
75  case 3:
76  case 4:
77  case 8:
78  case 12:
79  case 16:
80  case 20:
81  case 24:
82  case 28:
83  case 32:
84  case 40:
85  case 48:
86  case 56: // only supported with float16
87  case 64: // only supported with float16
88  case 96: // only supported with float16
89  return true;
90  default:
91  return false;
92  }
93 }
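// Note: `size` is the encoded size of a vector in bytes; with at most 8 bits
// per sub-quantizer code (asserted in the constructor) this equals
// numSubQuantizers_, which IVFBase tracks as bytesPerVector_.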
94 
95 bool
96 IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
97  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
98 }
99 
100 void
101 IVFPQ::setPrecomputedCodes(bool enable) {
102  if (precomputedCodes_ != enable) {
103  precomputedCodes_ = enable;
104 
105  if (precomputedCodes_) {
106  precomputeCodes_();
107  } else {
108  // Clear out old precomputed code data
109  precomputedCode_ = std::move(DeviceTensor<float, 3, true>());
110 
111 #ifdef FAISS_USE_FLOAT16
112  precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
113 #endif
114  }
115  }
116 }
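// Note: enabling precomputed codes trades memory for query speed;
// precomputeCodes_() below materializes a
// (numCoarseCentroids x numSubQuantizers x numSubQuantizerCodes) table of
// float (or half, with float16 lookup tables) values, which can be sizable
// for quantizers with many coarse centroids.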
117 
118 int
119 IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
120  Tensor<long, 1, true>& indices) {
121  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
122  FAISS_ASSERT(vecs.getSize(1) == dim_);
123 
124  FAISS_ASSERT(!quantizer_->getUseFloat16());
125  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
126  auto& mem = resources_->getMemoryManagerCurrentDevice();
127  auto stream = resources_->getDefaultStreamCurrentDevice();
128 
129  // Number of valid vectors that we actually add; we return this
130  int numAdded = 0;
131 
132  // We don't actually need this
133  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
134  // We use this
135  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
136  auto listIds = listIds2d.view<1>({vecs.getSize(0)});
137 
138  quantizer_->query(vecs, 1, listDistance, listIds2d, false);
139 
140  // Copy the lists that we wish to append to back to the CPU
141  // FIXME: really this can be into pinned memory and a true async
142  // copy on a different stream; we can start the copy early, but it's
143  // tiny
144  HostTensor<int, 1, true> listIdsHost(listIds, stream);
145 
146  // Calculate the residual for each closest centroid
147  DeviceTensor<float, 2, true> residuals(
148  mem, {vecs.getSize(0), vecs.getSize(1)}, stream);
149 
150  runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
151 
152  // Residuals are in the form
153  // (vec x numSubQuantizer x dimPerSubQuantizer)
154  // transpose to
155  // (numSubQuantizer x vec x dimPerSubQuantizer)
156  auto residualsView = residuals.view<3>(
157  {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
158 
159  DeviceTensor<float, 3, true> residualsTranspose(
160  mem,
161  {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
162  stream);
163 
164  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);
165 
166  // Get the product quantizer centroids in the form
167  // (numSubQuantizer x numSubQuantizerCodes x dimPerSubQuantizer)
168  // which is pqCentroidsMiddleCode_
169 
170  // We now have a batch operation to find the top-1 distances:
171  // batch size: numSubQuantizer
172  // centroids: (numSubQuantizerCodes x dimPerSubQuantizer)
173  // residuals: (vec x dimPerSubQuantizer)
174  // => (numSubQuantizer x vec x 1)
175 
176  DeviceTensor<float, 3, true> closestSubQDistance(
177  mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
178  DeviceTensor<int, 3, true> closestSubQIndex(
179  mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
180 
181  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
182  auto closestSubQDistanceView = closestSubQDistance[subQ].view();
183  auto closestSubQIndexView = closestSubQIndex[subQ].view();
184 
185  auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
186  auto residualsTransposeView = residualsTranspose[subQ].view();
187 
188  runL2Distance(resources_,
189  pqCentroidsMiddleCodeView,
190  nullptr, // no transposed storage
191  nullptr, // no precomputed norms
192  residualsTransposeView,
193  1,
194  closestSubQDistanceView,
195  closestSubQIndexView,
196  // We don't care about distances
197  true,
198  // Much larger tile size, since these vectors are a
199  // lot smaller than query vectors
200  1024);
201  }
202 
203  // Now, we have the nearest sub-q centroid for each slice of the
204  // residual vector.
205  auto closestSubQIndexView = closestSubQIndex.view<2>(
206  {numSubQuantizers_, residuals.getSize(0)});
207 
208  // Transpose this for easy use
209  DeviceTensor<int, 2, true> encodings(
210  mem, {residuals.getSize(0), numSubQuantizers_}, stream);
211 
212  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);
213 
214  // Now we add the encoded vectors to the individual lists
215  // First, make sure that there is space available for adding the new
216  // encoded vectors and indices
217 
218  // list id -> # being added
219  std::unordered_map<int, int> assignCounts;
220 
221  // vector id -> offset in list
222  // (we already have vector id -> list id in listIds)
223  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});
224 
225  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
226  int listId = listIdsHost[i];
227 
228  // The vector being added could be invalid (contains NaNs, etc.)
229  if (listId < 0) {
230  listOffsetHost[i] = -1;
231  continue;
232  }
233 
234  FAISS_ASSERT(listId < numLists_);
235  ++numAdded;
236 
237  int offset = deviceListData_[listId]->size() / bytesPerVector_;
238 
239  auto it = assignCounts.find(listId);
240  if (it != assignCounts.end()) {
241  offset += it->second;
242  it->second++;
243  } else {
244  assignCounts[listId] = 1;
245  }
246 
247  listOffsetHost[i] = offset;
248  }
249 
250  // If we didn't add anything (all invalid vectors), no need to
251  // continue
252  if (numAdded == 0) {
253  return 0;
254  }
255 
256  // We need to resize the data structures for the inverted lists on
257  // the GPUs, which means that they might need reallocation, which
258  // means that their base address may change. Figure out the new base
259  // addresses, and update those in a batch on the device
260  {
261  // Resize all of the lists that we are appending to
262  for (auto& counts : assignCounts) {
263  auto& codes = deviceListData_[counts.first];
264  codes->resize(codes->size() + counts.second * bytesPerVector_,
265  stream);
266  int newNumVecs = (int) (codes->size() / bytesPerVector_);
267 
268  auto& indices = deviceListIndices_[counts.first];
269  if ((indicesOptions_ == INDICES_32_BIT) ||
270  (indicesOptions_ == INDICES_64_BIT)) {
271  size_t indexSize =
272  (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);
273 
274  indices->resize(indices->size() + counts.second * indexSize, stream);
275  } else if (indicesOptions_ == INDICES_CPU) {
276  // indices are stored on the CPU side
277  FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());
278 
279  auto& userIndices = listOffsetToUserIndex_[counts.first];
280  userIndices.resize(newNumVecs);
281  } else {
282  // indices are not stored on the GPU or CPU side
283  FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
284  }
285 
286  // This is used by the multi-pass query to decide how much scratch
287  // space to allocate for intermediate results
288  maxListLength_ = std::max(maxListLength_, newNumVecs);
289  }
290 
291  // Update all pointers and sizes on the device for lists that we
292  // appended to
293  {
294  std::vector<int> listIds(assignCounts.size());
295  int i = 0;
296  for (auto& counts : assignCounts) {
297  listIds[i++] = counts.first;
298  }
299 
300  updateDeviceListInfo_(listIds, stream);
301  }
302  }
303 
304  // If we're maintaining the indices on the CPU side, update our
305  // map. We already resized our map above.
306  if (indicesOptions_ == INDICES_CPU) {
307  // We need to maintain the indices on the CPU side
308  HostTensor<long, 1, true> hostIndices(indices, stream);
309 
310  for (int i = 0; i < hostIndices.getSize(0); ++i) {
311  int listId = listIdsHost[i];
312 
313  // The vector being added could be invalid (contains NaNs, etc.)
314  if (listId < 0) {
315  continue;
316  }
317 
318  int offset = listOffsetHost[i];
319 
320  FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
321  auto& userIndices = listOffsetToUserIndex_[listId];
322 
323  FAISS_ASSERT(offset < userIndices.size());
324  userIndices[offset] = hostIndices[i];
325  }
326  }
327 
328  // We similarly need to actually append the new encoded vectors
329  {
330  DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);
331 
332  // This kernel will handle appending each encoded vector + index to
333  // the appropriate list
334  runIVFPQInvertedListAppend(listIds,
335  listOffset,
336  encodings,
337  indices,
338  deviceListDataPointers_,
339  deviceListIndexPointers_,
340  indicesOptions_,
341  stream);
342  }
343 
344  return numAdded;
345 }
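// For reference, a minimal CPU sketch of the per-vector encoding performed
// above: split the residual into sub-vectors and, for each sub-quantizer,
// pick the code whose centroid is closest in L2. The helper below is
// hypothetical (it is not part of this file and is not called by the GPU
// path, which batches the same computation via runL2Distance); centroids are
// assumed to be laid out as (subQ)(code)(subDim), like pqCentroidsMiddleCode_.
static inline void
encodeResidualCpuSketch(const float* residual,      // [numSubQ * dimPerSubQ]
                        const float* pqCentroids,   // [numSubQ][codes][dimPerSubQ]
                        int numSubQuantizers,
                        int numSubQuantizerCodes,
                        int dimPerSubQuantizer,
                        unsigned char* codesOut) {   // [numSubQ]
  for (int subQ = 0; subQ < numSubQuantizers; ++subQ) {
    const float* sub = residual + subQ * dimPerSubQuantizer;

    float bestDist = std::numeric_limits<float>::max();
    int bestCode = 0;

    for (int code = 0; code < numSubQuantizerCodes; ++code) {
      const float* centroid = pqCentroids +
        (subQ * numSubQuantizerCodes + code) * dimPerSubQuantizer;

      // Squared L2 distance between this residual slice and the centroid
      float dist = 0.0f;
      for (int d = 0; d < dimPerSubQuantizer; ++d) {
        float diff = sub[d] - centroid[d];
        dist += diff * diff;
      }

      if (dist < bestDist) {
        bestDist = dist;
        bestCode = code;
      }
    }

    // One byte per sub-quantizer, matching bitsPerSubQuantizer_ <= 8
    codesOut[subQ] = (unsigned char) bestCode;
  }
}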
346 
347 void
348 IVFPQ::addCodeVectorsFromCpu(int listId,
349  const void* codes,
350  const long* indices,
351  size_t numVecs) {
352  // This list must already exist
353  FAISS_ASSERT(listId < deviceListData_.size());
354  auto stream = resources_->getDefaultStreamCurrentDevice();
355 
356  // If there's nothing to add, then there's nothing we have to do
357  if (numVecs == 0) {
358  return;
359  }
360 
361  size_t lengthInBytes = numVecs * bytesPerVector_;
362 
363  auto& listCodes = deviceListData_[listId];
364  auto prevCodeData = listCodes->data();
365 
366  // We only have an int32 length representation on the GPU per
367  // list; the length is measured in bytes (sizeof(char))
368  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
369  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
370  (size_t) std::numeric_limits<int>::max());
371 
372  listCodes->append((unsigned char*) codes,
373  lengthInBytes,
374  stream,
375  true /* exact reserved size */);
376 
377  // Handle the indices as well
378  addIndicesFromCpu_(listId, indices, numVecs);
379 
380  // This list address may have changed due to vector resizing, but
381  // only bother updating it on the device if it has changed
382  if (prevCodeData != listCodes->data()) {
383  deviceListDataPointers_[listId] = listCodes->data();
384  }
385 
386  // And our size has changed too
387  int listLength = listCodes->size() / bytesPerVector_;
388  deviceListLengths_[listId] = listLength;
389 
390  // We update this as well, since the multi-pass algorithm uses it
391  maxListLength_ = std::max(maxListLength_, listLength);
392 
393  // device_vector add is potentially happening on a different stream
394  // than our default stream
395  if (resources_->getDefaultStreamCurrentDevice() != 0) {
396  streamWait({stream}, {0});
397  }
398 }
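// Note: `codes` is expected to contain numVecs * bytesPerVector_ bytes, i.e.
// one byte per sub-quantizer code per vector, which is appended verbatim to
// the list's encoded data.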
399 
400 void
401 IVFPQ::setPQCentroids_(float* data) {
402  size_t pqSize =
403  numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;
404 
405  // Make sure the data is on the host
406  // FIXME: why are we doing this?
407  thrust::host_vector<float> hostMemory;
408  hostMemory.insert(hostMemory.end(), data, data + pqSize);
409 
409 
410  HostTensor<float, 3, true> pqHost(
411  hostMemory.data(),
412  {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
413  DeviceTensor<float, 3, true> pqDevice(
414  pqHost,
415  resources_->getDefaultStreamCurrentDevice());
416 
417  DeviceTensor<float, 3, true> pqDeviceTranspose(
418  {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
419  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose,
420  resources_->getDefaultStreamCurrentDevice());
421 
422  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);
423 
424  // Also maintain the PQ centroids in the form
425  // (sub q)(code id)(sub dim)
426  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
427  {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
428  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
429  resources_->getDefaultStreamCurrentDevice());
430 
431  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
432 }
433 
434 void
435 IVFPQ::precomputeCodes_() {
436  //
437  //    d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
438  //        ---------------   ---------------------------       -------
439  //            term 1                  term 2                  term 3
440  //
441 
442  // Terms 1 and 3 are available only at query time. We compute term 2
443  // here.
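  // (This is just the expansion of the residual decomposition y = y_C + y_R:
  //    || x - y ||^2 = || x - y_C - y_R ||^2
  //                  = || x - y_C ||^2 + || y_R ||^2 - 2 * (x - y_C | y_R)
  //                  = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R).)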
444  FAISS_ASSERT(!quantizer_->getUseFloat16());
445  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
446 
447  // Compute ||y_R||^2 by treating
448  // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
449  auto pqCentroidsMiddleCodeView =
450  pqCentroidsMiddleCode_.view<2>(
451  {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
452  DeviceTensor<float, 1, true> subQuantizerNorms(
453  {numSubQuantizers_ * numSubQuantizerCodes_});
454 
455  runL2Norm(pqCentroidsMiddleCodeView, subQuantizerNorms, true,
456  resources_->getDefaultStreamCurrentDevice());
457 
458  // Compute 2 * (y_C|y_R) via batch matrix multiplication
459  // batch size (sub q) x {(centroid id)(sub dim) x (code id)(sub dim)'}
460  // => (sub q) x {(centroid id)(code id)}
461  // => (sub q)(centroid id)(code id)
462 
463  // View (centroid id)(dim) as
464  // (centroid id)(sub q)(dim)
465  // Transpose (centroid id)(sub q)(sub dim) to
466  // (sub q)(centroid id)(sub dim)
467  auto centroidView = coarseCentroids.view<3>(
468  {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
469  DeviceTensor<float, 3, true> centroidsTransposed(
470  {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});
471 
472  runTransposeAny(centroidView, 0, 1, centroidsTransposed,
473  resources_->getDefaultStreamCurrentDevice());
474 
475  DeviceTensor<float, 3, true> coarsePQProduct(
476  {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});
477 
478  runIteratedMatrixMult(coarsePQProduct, false,
479  centroidsTransposed, false,
480  pqCentroidsMiddleCode_, true,
481  2.0f, 0.0f,
482  resources_->getBlasHandleCurrentDevice(),
483  resources_->getDefaultStreamCurrentDevice());
484 
485  // Transpose (sub q)(centroid id)(code id) to
486  // (centroid id)(sub q)(code id)
487  DeviceTensor<float, 3, true> coarsePQProductTransposed(
488  {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
489  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
490  resources_->getDefaultStreamCurrentDevice());
491 
492  // View (centroid id)(sub q)(code id) as
493  // (centroid id)(sub q * code id)
494  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
495  {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});
496 
497  // Sum || y_R ||^2 + 2 * (y_C|y_R)
498  // i.e., add norms (sub q * code id)
499  // along columns of inner product (centroid id)(sub q * code id)
500  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
501  resources_->getDefaultStreamCurrentDevice());
502 
503 #ifdef FAISS_USE_FLOAT16
504  if (useFloat16LookupTables_) {
505  precomputedCodeHalf_ = toHalf(resources_,
506  resources_->getDefaultStreamCurrentDevice(),
507  coarsePQProductTransposed);
508  return;
509  }
510 #endif
511 
512  // We added into the view, so `coarsePQProductTransposed` is now our
513  // precomputed term 2.
514  precomputedCode_ = std::move(coarsePQProductTransposed);
515 }
516 
517 void
518 IVFPQ::query(Tensor<float, 2, true>& queries,
519  int nprobe,
520  int k,
521  Tensor<float, 2, true>& outDistances,
522  Tensor<long, 2, true>& outIndices) {
523  // Validate these at a top level
524  FAISS_ASSERT(nprobe <= 1024);
525  FAISS_ASSERT(k <= 1024);
526 
527  auto& mem = resources_->getMemoryManagerCurrentDevice();
528  auto stream = resources_->getDefaultStreamCurrentDevice();
529  nprobe = std::min(nprobe, quantizer_->getSize());
530 
531  FAISS_ASSERT(queries.getSize(1) == dim_);
532  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
533  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
534 
535  // Reserve space for the closest coarse centroids
536  DeviceTensor<float, 2, true>
537  coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
538  DeviceTensor<int, 2, true>
539  coarseIndices(mem, {queries.getSize(0), nprobe}, stream);
540 
541  // Find the `nprobe` closest coarse centroids; we can use int
542  // indices both internally and externally
543  quantizer_->query(queries,
544  nprobe,
545  coarseDistances,
546  coarseIndices,
547  true);
548 
549  if (precomputedCodes_) {
550  runPQPrecomputedCodes_(queries,
551  coarseDistances,
552  coarseIndices,
553  k,
554  outDistances,
555  outIndices);
556  } else {
557  runPQNoPrecomputedCodes_(queries,
558  coarseDistances,
559  coarseIndices,
560  k,
561  outDistances,
562  outIndices);
563  }
564 
565  // If the GPU isn't storing indices (they are on the CPU side), we
566  // need to perform the re-mapping here
567  // FIXME: we might ultimately be calling this function with inputs
568  // from the CPU, these are unnecessary copies
569  if (indicesOptions_ == INDICES_CPU) {
570  HostTensor<long, 2, true> hostOutIndices(outIndices, stream);
571 
572  ivfOffsetToUserIndex(hostOutIndices.data(),
573  numLists_,
574  hostOutIndices.getSize(0),
575  hostOutIndices.getSize(1),
576  listOffsetToUserIndex_);
577 
578  // Copy back to GPU, since the input to this function is on the
579  // GPU
580  outIndices.copyFrom(hostOutIndices, stream);
581  }
582 }
583 
584 std::vector<unsigned char>
585 IVFPQ::getListCodes(int listId) const {
586  FAISS_ASSERT(listId < deviceListData_.size());
587 
588  return deviceListData_[listId]->copyToHost<unsigned char>(
589  resources_->getDefaultStreamCurrentDevice());
590 }
591 
592 Tensor<float, 3, true>
593 IVFPQ::getPQCentroids() {
594  return pqCentroidsMiddleCode_;
595 }
596 
597 void
598 IVFPQ::runPQPrecomputedCodes_(
599  Tensor<float, 2, true>& queries,
600  DeviceTensor<float, 2, true>& coarseDistances,
601  DeviceTensor<int, 2, true>& coarseIndices,
602  int k,
603  Tensor<float, 2, true>& outDistances,
604  Tensor<long, 2, true>& outIndices) {
605  auto& mem = resources_->getMemoryManagerCurrentDevice();
606  auto stream = resources_->getDefaultStreamCurrentDevice();
607 
608  // Compute precomputed code term 3, - 2 * (x|y_R)
609  // This is done via batch MM
610  // {sub q} x {(query id)(sub dim) * (code id)(sub dim)'} =>
611  // {sub q} x {(query id)(code id)}
612  DeviceTensor<float, 3, true> term3Transposed(
613  mem,
614  {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
615  stream);
616 
617  // These allocations within are only temporary, so release them when
618  // we're done to maximize free space
619  {
620  auto querySubQuantizerView = queries.view<3>(
621  {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
622  DeviceTensor<float, 3, true> queriesTransposed(
623  mem,
624  {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
625  stream);
626  runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);
627 
628  DeviceTensor<float, 3, true> term3(
629  mem,
630  {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
631  stream);
632 
633  runIteratedMatrixMult(term3, false,
634  queriesTransposed, false,
635  pqCentroidsMiddleCode_, true,
636  -2.0f, 0.0f,
637  resources_->getBlasHandleCurrentDevice(),
638  stream);
639 
640  runTransposeAny(term3, 0, 1, term3Transposed, stream);
641  }
642 
643  NoTypeTensor<3, true> term2;
644  NoTypeTensor<3, true> term3;
645 #ifdef FAISS_USE_FLOAT16
646  DeviceTensor<half, 3, true> term3Half;
647 
648  if (useFloat16LookupTables_) {
649  term3Half = toHalf(resources_, stream, term3Transposed);
650  term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
651  term3 = NoTypeTensor<3, true>(term3Half);
652  }
653 #endif
654 
655  if (!useFloat16LookupTables_) {
656  term2 = NoTypeTensor<3, true>(precomputedCode_);
657  term3 = NoTypeTensor<3, true>(term3Transposed);
658  }
659 
660  runPQScanMultiPassPrecomputed(queries,
661  coarseDistances, // term 1
662  term2, // term 2
663  term3, // term 3
664  coarseIndices,
665  useFloat16LookupTables_,
666  bytesPerVector_,
667  numSubQuantizers_,
668  numSubQuantizerCodes_,
669  deviceListDataPointers_,
670  deviceListIndexPointers_,
671  indicesOptions_,
672  deviceListLengths_,
673  maxListLength_,
674  k,
675  outDistances,
676  outIndices,
677  resources_);
678 }
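// Note: at scan time the distance to an encoded vector with codes c_1..c_M is
// then assembled as
//   term1 + sum_subQ ( term2[list][subQ][c_subQ] + term3[query][subQ][c_subQ] )
// which is the per-entry table lookup that runPQScanMultiPassPrecomputed is
// built around.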
679 
680 void
681 IVFPQ::runPQNoPrecomputedCodes_(
682  Tensor<float, 2, true>& queries,
683  DeviceTensor<float, 2, true>& coarseDistances,
684  DeviceTensor<int, 2, true>& coarseIndices,
685  int k,
686  Tensor<float, 2, true>& outDistances,
687  Tensor<long, 2, true>& outIndices) {
688  FAISS_ASSERT(!quantizer_->getUseFloat16());
689  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
690 
691  runPQScanMultiPassNoPrecomputed(queries,
692  coarseCentroids,
693  pqCentroidsInnermostCode_,
694  coarseIndices,
695  useFloat16LookupTables_,
696  bytesPerVector_,
697  numSubQuantizers_,
698  numSubQuantizerCodes_,
699  deviceListDataPointers_,
700  deviceListIndexPointers_,
701  indicesOptions_,
702  deviceListLengths_,
703  maxListLength_,
704  k,
705  outDistances,
706  outIndices,
707  resources_);
708 }
709 
710 } } // namespace