// Faiss GPU — IVFPQ implementation (IVFPQ.cu)
// NOTE(review): Doxygen HTML page chrome removed from the top of this file.
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved.
11 
12 #include "IVFPQ.cuh"
13 #include "../GpuResources.h"
14 #include "BroadcastSum.cuh"
15 #include "Distance.cuh"
16 #include "FlatIndex.cuh"
17 #include "InvertedListAppend.cuh"
18 #include "L2Norm.cuh"
19 #include "PQCodeDistances.cuh"
20 #include "PQScanMultiPassNoPrecomputed.cuh"
21 #include "PQScanMultiPassPrecomputed.cuh"
22 #include "RemapIndices.h"
23 #include "VectorResidual.cuh"
24 #include "../utils/DeviceDefs.cuh"
25 #include "../utils/DeviceUtils.h"
26 #include "../utils/HostTensor.cuh"
27 #include "../utils/MatrixMult.cuh"
28 #include "../utils/NoTypeTensor.cuh"
29 #include "../utils/Transpose.cuh"
30 #include <limits>
31 #include <thrust/host_vector.h>
32 #include <unordered_map>
33 
34 namespace faiss { namespace gpu {
35 
/// Construct an IVFPQ index using an already-trained coarse quantizer.
///
/// @param resources             GPU resource holder (streams, memory, BLAS)
/// @param quantizer             coarse quantizer; must hold float32 vectors
/// @param numSubQuantizers      number of PQ sub-quantizers (bytes per code)
/// @param bitsPerSubQuantizer   bits per sub-quantizer code; <= 8
/// @param pqCentroidData        host pointer to PQ centroids laid out as
///                              (sub q)(code id)(sub dim)
/// @param indicesOptions        where/how user indices are stored
/// @param useFloat16LookupTables use half-precision distance lookup tables
IVFPQ::IVFPQ(GpuResources* resources,
             FlatIndex* quantizer,
             int numSubQuantizers,
             int bitsPerSubQuantizer,
             float* pqCentroidData,
             IndicesOptions indicesOptions,
             bool useFloat16LookupTables) :
    IVFBase(resources,
            quantizer,
            // one byte per sub-quantizer code => bytes per encoded vector
            numSubQuantizers,
            indicesOptions),
    numSubQuantizers_(numSubQuantizers),
    bitsPerSubQuantizer_(bitsPerSubQuantizer),
    numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
    dimPerSubQuantizer_(dim_ / numSubQuantizers),
    precomputedCodes_(false),
    useFloat16LookupTables_(useFloat16LookupTables) {
  FAISS_ASSERT(pqCentroidData);

  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);
  // The multi-pass scan kernels are only instantiated for certain code sizes
  FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_));

#ifndef FAISS_USE_FLOAT16
  // Half-precision lookup tables require a float16-enabled build
  FAISS_ASSERT(!useFloat16LookupTables_);
#endif

  setPQCentroids_(pqCentroidData);
}
65 
// All members are value types or owning containers; nothing to release here.
IVFPQ::~IVFPQ() {
}
68 
69 
// Returns true if we support PQ in this size (number of bytes per encoded
// vector, i.e. the number of sub-quantizers). The scan kernels are only
// instantiated for these specific sizes.
bool
IVFPQ::isSupportedPQCodeLength(int size) {
  switch (size) {
    case 1:
    case 2:
    case 3:
    case 4:
    case 8:
    case 12:
    case 16:
    case 20:
    case 24:
    case 28:
    case 32:
    case 40:
    case 48:
    case 56: // only supported with float16
    case 64: // only supported with float16
      return true;
    default:
      return false;
  }
}
93 
// Returns true if the dimension-per-sub-quantizer is supported by the
// no-precomputed-codes scan path; delegates to the kernel implementation.
bool
IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
}
98 
// Enable or disable pre-computed codes. Enabling computes the term-2
// lookup tables; disabling frees them.
void
IVFPQ::setPrecomputedCodes(bool enable) {
  if (precomputedCodes_ != enable) {
    precomputedCodes_ = enable;

    if (precomputedCodes_) {
      precomputeCodes_();
    } else {
      // Clear out old precomputed code data by assigning empty tensors
      // (assigning a temporary releases the prior allocation)
      precomputedCode_ = DeviceTensor<float, 3, true>();

#ifdef FAISS_USE_FLOAT16
      precomputedCodeHalf_ = DeviceTensor<half, 3, true>();
#endif
    }
  }
}
116 
// Classify `vecs` against the coarse quantizer, PQ-encode the residuals,
// and append each encoding (plus its user index) to the appropriate
// inverted list. Returns the number of vectors actually added; vectors
// the quantizer rejects (e.g. containing NaNs) are skipped.
int
IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We don't actually need this
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  // We use this
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the lists that we wish to append to back to the CPU
  // FIXME: really this can be into pinned memory and a true async
  // copy on a different stream; we can start the copy early, but it's
  // tiny
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Calculate the residual for each closest centroid
  DeviceTensor<float, 2, true> residuals(
    mem, {vecs.getSize(0), vecs.getSize(1)}, stream);

  runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);

  // Residuals are in the form
  // (vec x numSubQuantizer x dimPerSubQuantizer)
  // transpose to
  // (numSubQuantizer x vec x dimPerSubQuantizer)
  auto residualsView = residuals.view<3>(
    {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});

  DeviceTensor<float, 3, true> residualsTranspose(
    mem,
    {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
    stream);

  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);

  // Get the product quantizer centroids in the form
  // (numSubQuantizer x numSubQuantizerCodes x dimPerSubQuantizer)
  // which is pqCentroidsMiddleCode_

  // We now have a batch operation to find the top-1 distances:
  // batch size: numSubQuantizer
  // centroids: (numSubQuantizerCodes x dimPerSubQuantizer)
  // residuals: (vec x dimPerSubQuantizer)
  // => (numSubQuantizer x vec x 1)

  DeviceTensor<float, 3, true> closestSubQDistance(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
  DeviceTensor<int, 3, true> closestSubQIndex(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);

  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
    auto closestSubQDistanceView = closestSubQDistance[subQ].view();
    auto closestSubQIndexView = closestSubQIndex[subQ].view();

    auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
    auto residualsTransposeView = residualsTranspose[subQ].view();

    runL2Distance(resources_,
                  pqCentroidsMiddleCodeView,
                  nullptr, // no precomputed norms
                  residualsTransposeView,
                  1,
                  closestSubQDistanceView,
                  closestSubQIndexView,
                  // We don't care about distances
                  true,
                  // Much larger tile size, since these vectors are a
                  // lot smaller than query vectors
                  1024);
  }

  // Now, we have the nearest sub-q centroid for each slice of the
  // residual vector.
  auto closestSubQIndexView = closestSubQIndex.view<2>(
    {numSubQuantizers_, residuals.getSize(0)});

  // Transpose this for easy use
  DeviceTensor<int, 2, true> encodings(
    mem, {residuals.getSize(0), numSubQuantizers_}, stream);

  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);

  // Now we add the encoded vectors to the individual lists
  // First, make sure that there is space available for adding the new
  // encoded vectors and indices

  // list id -> # being added
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset in list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // Add vector could be invalid (contains NaNs etc)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }

  // If we didn't add anything (all invalid vectors), no need to
  // continue
  if (numAdded == 0) {
    return 0;
  }

  // We need to resize the data structures for the inverted lists on
  // the GPUs, which means that they might need reallocation, which
  // means that their base address may change. Figure out the new base
  // addresses, and update those in a batch on the device
  {
    // Resize all of the lists that we are appending to
    for (auto& counts : assignCounts) {
      auto& codes = deviceListData_[counts.first];
      codes->resize(codes->size() + counts.second * bytesPerVector_,
                    stream);
      int newNumVecs = (int) (codes->size() / bytesPerVector_);

      auto& indices = deviceListIndices_[counts.first];
      if ((indicesOptions_ == INDICES_32_BIT) ||
          (indicesOptions_ == INDICES_64_BIT)) {
        size_t indexSize =
          (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

        indices->resize(indices->size() + counts.second * indexSize, stream);
      } else if (indicesOptions_ == INDICES_CPU) {
        // indices are stored on the CPU side
        FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());

        auto& userIndices = listOffsetToUserIndex_[counts.first];
        userIndices.resize(newNumVecs);
      } else {
        // indices are not stored on the GPU or CPU side
        FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
      }

      // This is used by the multi-pass query to decide how much scratch
      // space to allocate for intermediate results
      maxListLength_ = std::max(maxListLength_, newNumVecs);
    }

    // Update all pointers and sizes on the device for lists that we
    // appended to
    {
      std::vector<int> listIds(assignCounts.size());
      int i = 0;
      for (auto& counts : assignCounts) {
        listIds[i++] = counts.first;
      }

      updateDeviceListInfo_(listIds, stream);
    }
  }

  // If we're maintaining the indices on the CPU side, update our
  // map. We already resized our map above.
  if (indicesOptions_ == INDICES_CPU) {
    // We need to maintain the indices on the CPU side
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // Add vector could be invalid (contains NaNs etc)
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // We similarly need to actually append the new encoded vectors
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // This kernel will handle appending each encoded vector + index to
    // the appropriate list
    runIVFPQInvertedListAppend(listIds,
                               listOffset,
                               encodings,
                               indices,
                               deviceListDataPointers_,
                               deviceListIndexPointers_,
                               indicesOptions_,
                               stream);
  }

  return numAdded;
}
344 
// Append `numVecs` already-PQ-encoded vectors (`codes`, bytesPerVector_
// bytes each) and their user `indices` from host memory directly onto an
// existing inverted list, updating device-side pointers and lengths.
void
IVFPQ::addCodeVectorsFromCpu(int listId,
                             const void* codes,
                             const long* indices,
                             size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listCodes = deviceListData_[listId];
  auto prevCodeData = listCodes->data();

  // We only have int32 length representations on the GPU per each
  // list; the length is in sizeof(char)
  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  listCodes->append((unsigned char*) codes,
                    lengthInBytes,
                    stream,
                    true /* exact reserved size */);

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // This list address may have changed due to vector resizing, but
  // only bother updating it on the device if it has changed
  if (prevCodeData != listCodes->data()) {
    deviceListDataPointers_[listId] = listCodes->data();
  }

  // And our size has changed too
  int listLength = listCodes->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // device_vector add is potentially happening on a different stream
  // than our default stream
  if (resources_->getDefaultStreamCurrentDevice() != 0) {
    streamWait({stream}, {0});
  }
}
397 
// Copy the host PQ centroids, laid out as (sub q)(code id)(sub dim), to the
// device in the two layouts the query paths need:
//  - pqCentroidsInnermostCode_: (sub q)(sub dim)(code id)
//  - pqCentroidsMiddleCode_:    (sub q)(code id)(sub dim)
void
IVFPQ::setPQCentroids_(float* data) {
  size_t pqSize =
    numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;

  // Make sure the data is on the host
  // FIXME: why are we doing this?
  thrust::host_vector<float> hostMemory;
  hostMemory.insert(hostMemory.end(), data, data + pqSize);

  HostTensor<float, 3, true> pqHost(
    hostMemory.data(),
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> pqDevice(
    pqHost,
    resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> pqDeviceTranspose(
    {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);

  // Also maintain the PQ centroids in the form
  // (sub q)(code id)(sub dim)
  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
}
431 
// Precompute the "term 2" lookup tables used by the precomputed-code query
// path; terms 1 and 3 (below) depend on the query and are computed at
// query time.
void
IVFPQ::precomputeCodes_() {
  //
  //    d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
  //        ---------------   ---------------------------   -------
  //            term 1                 term 2                term 3
  //

  // Terms 1 and 3 are available only at query time. We compute term 2
  // here.
  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  // Compute ||y_R||^2 by treating
  // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
  auto pqCentroidsMiddleCodeView =
    pqCentroidsMiddleCode_.view<2>(
      {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 1, true> subQuantizerNorms(
    {numSubQuantizers_ * numSubQuantizerCodes_});

  runL2Norm(pqCentroidsMiddleCodeView, subQuantizerNorms, true,
            resources_->getDefaultStreamCurrentDevice());

  // Compute 2 * (y_C|y_R) via batch matrix multiplication
  // batch size (sub q) x {(centroid id)(sub dim) x (code id)(sub dim)'}
  // => (sub q) x {(centroid id)(code id)}
  // => (sub q)(centroid id)(code id)

  // View (centroid id)(dim) as
  // (centroid id)(sub q)(dim)
  // Transpose (centroid id)(sub q)(sub dim) to
  // (sub q)(centroid id)(sub dim)
  auto centroidView = coarseCentroids.view<3>(
    {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> centroidsTransposed(
    {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});

  runTransposeAny(centroidView, 0, 1, centroidsTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> coarsePQProduct(
    {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});

  // alpha = 2.0f folds the "2 *" of term 2 into the GEMM itself
  runIteratedMatrixMult(coarsePQProduct, false,
                        centroidsTransposed, false,
                        pqCentroidsMiddleCode_, true,
                        2.0f, 0.0f,
                        resources_->getBlasHandleCurrentDevice(),
                        resources_->getDefaultStreamCurrentDevice());

  // Transpose (sub q)(centroid id)(code id) to
  // (centroid id)(sub q)(code id)
  DeviceTensor<float, 3, true> coarsePQProductTransposed(
    {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  // View (centroid id)(sub q)(code id) as
  // (centroid id)(sub q * code id)
  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
    {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});

  // Sum || y_R ||^2 + 2 * (y_C|y_R)
  // i.e., add norms (sub q * code id)
  // along columns of inner product (centroid id)(sub q * code id)
  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
                     resources_->getDefaultStreamCurrentDevice());

#ifdef FAISS_USE_FLOAT16
  // When using half-precision lookup tables, store term 2 as half instead
  if (useFloat16LookupTables_) {
    precomputedCodeHalf_ = toHalf(resources_,
                                  resources_->getDefaultStreamCurrentDevice(),
                                  coarsePQProductTransposed);
    return;
  }
#endif

  // We added into the view, so `coarsePQProductTransposed` is now our
  // precomputed term 2.
  precomputedCode_ = std::move(coarsePQProductTransposed);
}
514 
// Find the `k` nearest neighbors of each query vector, scanning the
// `nprobe` closest inverted lists. Dispatches to the precomputed-code or
// no-precomputed-code scan path, then remaps IVF offsets to user indices
// if those are kept on the CPU.
void
IVFPQ::query(Tensor<float, 2, true>& queries,
             int nprobe,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<long, 2, true>& outIndices) {
  // Validate these at a top level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the closest coarse centroids
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the `nprobe` closest coarse centroids; we can use int
  // indices both internally and externally
  quantizer_->query(queries,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
                    true);

  if (precomputedCodes_) {
    runPQPrecomputedCodes_(queries,
                           coarseDistances,
                           coarseIndices,
                           k,
                           outDistances,
                           outIndices);
  } else {
    runPQNoPrecomputedCodes_(queries,
                             coarseDistances,
                             coarseIndices,
                             k,
                             outDistances,
                             outIndices);
  }

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}
581 
// Return the raw PQ codes of a particular inverted list, copied back to
// the CPU (bytesPerVector_ bytes per encoded vector).
std::vector<unsigned char>
IVFPQ::getListCodes(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());

  return deviceListData_[listId]->copyToHost<unsigned char>(
    resources_->getDefaultStreamCurrentDevice());
}
589 
// Return the PQ centroids in (sub q)(code id)(sub dim) layout.
Tensor<float, 3, true>
IVFPQ::getPQCentroids() {
  return pqCentroidsMiddleCode_;
}
594 
// Query path using precomputed codes: computes term 3 (-2 * (x|y_R)) per
// query via batch GEMM, combines it with the precomputed term 2 and the
// coarse distances (term 1) in the multi-pass scan kernel.
void
IVFPQ::runPQPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Compute precomputed code term 3, - 2 * (x|y_R)
  // This is done via batch MM
  // {sub q} x {(query id)(sub dim) * (code id)(sub dim)'} =>
  // {sub q} x {(query id)(code id)}
  DeviceTensor<float, 3, true> term3Transposed(
    mem,
    {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
    stream);

  // These allocations within are only temporary, so release them when
  // we're done to maximize free space
  {
    auto querySubQuantizerView = queries.view<3>(
      {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
    DeviceTensor<float, 3, true> queriesTransposed(
      mem,
      {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
      stream);
    runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);

    DeviceTensor<float, 3, true> term3(
      mem,
      {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
      stream);

    // alpha = -2.0f folds the "- 2 *" of term 3 into the GEMM itself
    runIteratedMatrixMult(term3, false,
                          queriesTransposed, false,
                          pqCentroidsMiddleCode_, true,
                          -2.0f, 0.0f,
                          resources_->getBlasHandleCurrentDevice(),
                          stream);

    runTransposeAny(term3, 0, 1, term3Transposed, stream);
  }

  // Type-erased views let the scan kernel take either float or half tables
  NoTypeTensor<3, true> term2;
  NoTypeTensor<3, true> term3;
#ifdef FAISS_USE_FLOAT16
  DeviceTensor<half, 3, true> term3Half;

  if (useFloat16LookupTables_) {
    term3Half = toHalf(resources_, stream, term3Transposed);
    term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
    term3 = NoTypeTensor<3, true>(term3Half);
  }
#endif

  if (!useFloat16LookupTables_) {
    term2 = NoTypeTensor<3, true>(precomputedCode_);
    term3 = NoTypeTensor<3, true>(term3Transposed);
  }

  runPQScanMultiPassPrecomputed(queries,
                                coarseDistances, // term 1
                                term2, // term 2
                                term3, // term 3
                                coarseIndices,
                                useFloat16LookupTables_,
                                bytesPerVector_,
                                numSubQuantizers_,
                                numSubQuantizerCodes_,
                                deviceListDataPointers_,
                                deviceListIndexPointers_,
                                indicesOptions_,
                                deviceListLengths_,
                                maxListLength_,
                                k,
                                outDistances,
                                outIndices,
                                resources_);
}
677 
// Query path without precomputed codes: the multi-pass scan kernel computes
// the distance lookup tables from the residuals of the queries against the
// coarse centroids on the fly.
void
IVFPQ::runPQNoPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  FAISS_ASSERT(!quantizer_->getUseFloat16());
  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();

  runPQScanMultiPassNoPrecomputed(queries,
                                  coarseCentroids,
                                  pqCentroidsInnermostCode_,
                                  coarseIndices,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
                                  numSubQuantizerCodes_,
                                  deviceListDataPointers_,
                                  deviceListIndexPointers_,
                                  indicesOptions_,
                                  deviceListLengths_,
                                  maxListLength_,
                                  k,
                                  outDistances,
                                  outIndices,
                                  resources_);
}
707 
708 } } // namespace
/*
 * NOTE(review): the following is the Doxygen cross-reference index that the
 * HTML export appended after the code; preserved here as a comment so the
 * file remains valid C++:
 *
 *   const int numLists_ — Number of inverted lists we maintain. (IVFBase.cuh:90)
 *   int maxListLength_ — Maximum list length seen. (IVFBase.cuh:111)
 *   void addCodeVectorsFromCpu(int listId, const void* codes,
 *     const long* indices, size_t numVecs) — (IVFPQ.cu:346)
 *   int getSize() const — Returns the number of vectors we contain. (FlatIndex.cu:40)
 *   std::vector<std::vector<long>> listOffsetToUserIndex_ — (IVFBase.cuh:123)
 *   FlatIndex — Holder of GPU resources for a particular flat index. (FlatIndex.cuh:23)
 *   __host__ __device__ DataPtrType data() — Returns a raw pointer to the
 *     start of our data. (Tensor.cuh:162)
 *   IVFBase — Base inverted list functionality for IVFFlat and IVFPQ. (IVFBase.cuh:27)
 *   IVFPQ(GpuResources* resources, FlatIndex* quantizer, int numSubQuantizers,
 *     int bitsPerSubQuantizer, float* pqCentroidData,
 *     IndicesOptions indicesOptions, bool useFloat16LookupTables) — (IVFPQ.cu:36)
 *   thrust::device_vector<int> deviceListLengths_ — (IVFBase.cuh:108)
 *   static bool isSupportedPQCodeLength(int size) — Returns true if we support
 *     PQ in this size. (IVFPQ.cu:71)
 *   thrust::device_vector<void*> deviceListIndexPointers_ — (IVFBase.cuh:104)
 *   int classifyAndAddVectors(Tensor<float, 2, true>& vecs,
 *     Tensor<long, 1, true>& indices) — (IVFPQ.cu:118)
 *   void query(Tensor<float, 2, true>& queries, int nprobe, int k,
 *     Tensor<float, 2, true>& outDistances,
 *     Tensor<long, 2, true>& outIndices) — (IVFPQ.cu:516)
 *   Tensor<float, 3, true> getPQCentroids() — (IVFPQ.cu:591)
 *   FlatIndex* quantizer_ — Quantizer object. (IVFBase.cuh:84)
 *   void setPrecomputedCodes(bool enable) — Enable or disable pre-computed
 *     codes. (IVFPQ.cu:100)
 *   std::vector<unsigned char> getListCodes(int listId) const — Return the
 *     list codes of a particular list back to the CPU. (IVFPQ.cu:583)
 *   thrust::device_vector<void*> deviceListDataPointers_ — (IVFBase.cuh:100)
 *   GpuResources* resources_ — Collection of GPU resources that we use. (IVFBase.cuh:81)
 *   Tensor — Our tensor type. (Tensor.cuh:31)
 *   __host__ __device__ Tensor<T, SubDim, Contig, IndexT, PtrTraits>
 *     view(DataPtrType at) — (Tensor-inl.cuh:526)
 *   const int bytesPerVector_ — Number of bytes per vector in the list. (IVFBase.cuh:93)
 *   void updateDeviceListInfo_(cudaStream_t stream) — Update all device-side
 *     list pointer and size information. (IVFBase.cu:136)
 *   __host__ __device__ IndexT getSize(int i) const — (Tensor.cuh:210)
 *   __host__ void copyFrom(Tensor<T, Dim, Contig, IndexT, PtrTraits>& t,
 *     cudaStream_t stream) — Copies a tensor into ourselves; sizes must
 *     match. (Tensor-inl.cuh:101)
 *   Tensor<float, 2, true>& getVectorsFloat32Ref() — Returns a reference to
 *     our vectors currently in use. (FlatIndex.cu:61)
 *   const IndicesOptions indicesOptions_ — How are user indices stored on the
 *     GPU? (IVFBase.cuh:96)
 *   std::vector<std::unique_ptr<DeviceVector<unsigned char>>> deviceListData_ — (IVFBase.cuh:117)
 *   const int dim_ — Expected dimensionality of the vectors. (IVFBase.cuh:87)
 *   void addIndicesFromCpu_(int listId, const long* indices, size_t numVecs) —
 *     Shared function to copy indices from CPU to GPU. (IVFBase.cu:243)
 *   static bool isSupportedNoPrecomputedSubDimSize(int dims) — (IVFPQ.cu:95)
 */