Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
GpuIndexIVFPQ.cu
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 
9 #include "GpuIndexIVFPQ.h"
10 #include "../IndexFlat.h"
11 #include "../IndexIVFPQ.h"
12 #include "../ProductQuantizer.h"
13 #include "GpuIndexFlat.h"
14 #include "GpuResources.h"
15 #include "impl/IVFPQ.cuh"
16 #include "utils/CopyUtils.cuh"
17 #include "utils/DeviceUtils.h"
18 
19 #include <limits>
20 
21 namespace faiss { namespace gpu {
22 
// Construct a GPU IVFPQ index as a copy of an existing CPU faiss::IndexIVFPQ.
// PQ parameters (sub-quantizers, bits per code) are taken from the source
// index inside copyFrom(); they are zero-initialized here.
GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources,
                             const faiss::IndexIVFPQ* index,
                             GpuIndexIVFPQConfig config) :
    GpuIndexIVF(resources,
                index->d,
                index->metric_type,
                index->nlist,
                config),
    ivfpqConfig_(config),
    subQuantizers_(0),
    bitsPerCode_(0),
    reserveMemoryVecs_(0),
    index_(nullptr) {
#ifndef FAISS_USE_FLOAT16
  // Float16 lookup tables require a float16-enabled build
  FAISS_ASSERT(!ivfpqConfig_.useFloat16LookupTables);
#endif

  copyFrom(index);
}
42 
// Construct an empty, untrained GPU IVFPQ index with the given dimensionality,
// number of inverted lists, and PQ parameters. The GPU-side IVFPQ structure is
// only created later, in train().
GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources,
                             int dims,
                             int nlist,
                             int subQuantizers,
                             int bitsPerCode,
                             faiss::MetricType metric,
                             GpuIndexIVFPQConfig config) :
    GpuIndexIVF(resources,
                dims,
                metric,
                nlist,
                config),
    ivfpqConfig_(config),
    subQuantizers_(subQuantizers),
    bitsPerCode_(bitsPerCode),
    reserveMemoryVecs_(0),
    index_(nullptr) {
#ifndef FAISS_USE_FLOAT16
  // Float16 lookup tables require a float16-enabled build
  FAISS_ASSERT(!config.useFloat16LookupTables);
#endif

  verifySettings_();

  // FIXME make IP work fully
  FAISS_ASSERT(this->metric_type == faiss::METRIC_L2);

  // We haven't trained ourselves, so don't construct the PQ index yet
  this->is_trained = false;
}
72 
// Release the GPU-side IVFPQ implementation. index_ is nullptr for an
// untrained index, which delete handles safely.
GpuIndexIVFPQ::~GpuIndexIVFPQ() {
  delete index_;
}
76 
// Replace our state with a copy of the given CPU IndexIVFPQ: coarse quantizer
// (via GpuIndexIVF::copyFrom), PQ parameters and centroids, and all inverted
// list codes/ids. Throws for unsupported configurations (inner product,
// per-list sizes beyond int range).
void
GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
  DeviceScope scope(device_);

  // FIXME: support this
  FAISS_THROW_IF_NOT_MSG(index->metric_type == faiss::METRIC_L2,
                         "inner product unsupported");
  GpuIndexIVF::copyFrom(index);

  // Clear out our old data
  delete index_;
  index_ = nullptr;

  subQuantizers_ = index->pq.M;
  bitsPerCode_ = index->pq.nbits;

  // We only support this
  FAISS_ASSERT(index->pq.nbits == 8);
  FAISS_ASSERT(index->by_residual);
  FAISS_ASSERT(index->polysemous_ht == 0);

  verifySettings_();

  // The other index might not be trained
  if (!index->is_trained) {
    return;
  }

  // Otherwise, we can populate ourselves from the other index
  this->is_trained = true;

  // Copy our lists as well
  // The product quantizer must have data in it
  FAISS_ASSERT(index->pq.centroids.size() > 0);
  index_ = new IVFPQ(resources_,
                     quantizer_->getGpuData(),
                     subQuantizers_,
                     bitsPerCode_,
                     (float*) index->pq.centroids.data(),
                     ivfpqConfig_.indicesOptions,
                     ivfpqConfig_.useFloat16LookupTables,
                     memorySpace_);
  // Doesn't make sense to reserve memory here
  index_->setPrecomputedCodes(ivfpqConfig_.usePrecomputedTables);

  // Copy database vectors, if any
  const InvertedLists* ivf = index->invlists;
  size_t nlist = ivf ? ivf->nlist : 0;
  for (size_t i = 0; i < nlist; ++i) {
    size_t list_size = ivf->list_size(i);

    // GPU index can only support max int entries per list
    FAISS_THROW_IF_NOT_FMT(list_size <=
                           (size_t) std::numeric_limits<int>::max(),
                           "GPU inverted list can only support "
                           "%zu entries; %zu found",
                           (size_t) std::numeric_limits<int>::max(),
                           list_size);

    index_->addCodeVectorsFromCpu(
      i, ivf->get_codes(i), ivf->get_ids(i), list_size);
  }
}
140 
// Copy our state into the given CPU IndexIVFPQ: coarse quantizer (via
// GpuIndexIVF::copyTo), PQ parameters/centroids and inverted lists. Fields we
// do not implement on the GPU (polysemous training, scan thresholds) are
// reset to their defaults. Requires that indices were retained on the GPU.
void
GpuIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const {
  DeviceScope scope(device_);

  // We must have the indices in order to copy to ourselves
  FAISS_THROW_IF_NOT_MSG(ivfpqConfig_.indicesOptions != INDICES_IVF,
                         "Cannot copy to CPU as GPU index doesn't retain "
                         "indices (INDICES_IVF)");

  GpuIndexIVF::copyTo(index);

  //
  // IndexIVFPQ information
  //
  index->by_residual = true;
  index->use_precomputed_table = 0;
  index->code_size = subQuantizers_;
  index->pq = faiss::ProductQuantizer(this->d, subQuantizers_, bitsPerCode_);

  index->do_polysemous_training = false;
  index->polysemous_training = nullptr;

  index->scan_table_threshold = 0;
  index->max_codes = 0;
  index->polysemous_ht = 0;
  index->precomputed_table.clear();

  // Give the CPU index a fresh array-based inverted list storage that it owns
  InvertedLists* ivf = new ArrayInvertedLists(nlist_, index->code_size);

  index->replace_invlists(ivf, true);

  if (index_) {
    // Copy the inverted lists
    for (int i = 0; i < nlist_; ++i) {
      auto ids = getListIndices(i);
      auto codes = getListCodes(i);
      index->invlists->add_entries(i, ids.size(), ids.data(), codes.data());
    }

    // Copy PQ centroids
    auto devPQCentroids = index_->getPQCentroids();
    index->pq.centroids.resize(devPQCentroids.numElements());

    fromDevice<float, 3>(devPQCentroids,
                         index->pq.centroids.data(),
                         resources_->getDefaultStream(device_));

    // Rebuild the precomputed table on the CPU side if we were using one
    if (ivfpqConfig_.usePrecomputedTables) {
      index->precompute_table();
    }
  }
}
194 
// Reserve GPU memory in the inverted lists for this many vectors. If the
// index is not yet trained, the reservation is remembered and applied when
// the IVFPQ structure is created in trainResidualQuantizer_().
void
GpuIndexIVFPQ::reserveMemory(size_t numVecs) {
  reserveMemoryVecs_ = numVecs;
  if (index_) {
    DeviceScope scope(device_);
    index_->reserveMemory(numVecs);
  }
}
203 
// Enable or disable precomputed lookup tables, propagating the setting to the
// GPU implementation when it exists. verifySettings_() re-checks that the
// configuration remains valid (some dimension/sub-quantizer combinations are
// only supported with precomputed tables).
void
GpuIndexIVFPQ::setPrecomputedCodes(bool enable) {
  ivfpqConfig_.usePrecomputedTables = enable;
  if (index_) {
    DeviceScope scope(device_);
    index_->setPrecomputedCodes(enable);
  }

  verifySettings_();
}
214 
// Are precomputed lookup tables currently enabled?
bool
GpuIndexIVFPQ::getPrecomputedCodes() const {
  return ivfpqConfig_.usePrecomputedTables;
}
219 
// Return the number of PQ sub-quantizers we are using.
int
GpuIndexIVFPQ::getNumSubQuantizers() const {
  return subQuantizers_;
}
224 
// Return the number of bits per PQ code.
int
GpuIndexIVFPQ::getBitsPerCode() const {
  return bitsPerCode_;
}
229 
// Return the number of centroids per sub-quantizer (2^bitsPerCode_).
int
GpuIndexIVFPQ::getCentroidsPerSubQuantizer() const {
  return utils::pow2(bitsPerCode_);
}
234 
// Shrink over-allocated GPU inverted-list storage; returns the number of
// bytes reclaimed, or 0 if the index is untrained (nothing allocated yet).
size_t
GpuIndexIVFPQ::reclaimMemory() {
  if (index_) {
    DeviceScope scope(device_);
    return index_->reclaimMemory();
  }

  return 0;
}
244 
// Remove all stored vectors from the inverted lists; training state and PQ
// centroids are retained. For an untrained index, ntotal must already be 0.
void
GpuIndexIVFPQ::reset() {
  if (index_) {
    DeviceScope scope(device_);

    index_->reset();
    this->ntotal = 0;
  } else {
    FAISS_ASSERT(this->ntotal == 0);
  }
}
256 
// Train the product quantizer on residuals (vector minus its assigned coarse
// centroid) and construct the GPU-side IVFPQ structure. The PQ itself is
// trained on the CPU; only assignment uses the (GPU) coarse quantizer.
void
GpuIndexIVFPQ::trainResidualQuantizer_(Index::idx_t n, const float* x) {
  // Code largely copied from faiss::IndexIVFPQ
  // FIXME: GPUize more of this

  // Cap the training set: 64 points per PQ centroid is sufficient
  n = std::min(n, (Index::idx_t) (1 << bitsPerCode_) * 64);

  if (this->verbose) {
    printf("computing residuals\n");
  }

  // Assign each training vector to its nearest coarse centroid
  std::vector<Index::idx_t> assign(n);
  quantizer_->assign(n, x, assign.data());

  // Residual of each vector w.r.t. its coarse centroid
  std::vector<float> residuals(n * d);

  for (idx_t i = 0; i < n; i++) {
    quantizer_->compute_residual(x + i * d, &residuals[i * d], assign[i]);
  }

  if (this->verbose) {
    printf("training %d x %d product quantizer on %ld vectors in %dD\n",
           subQuantizers_, getCentroidsPerSubQuantizer(), n, this->d);
  }

  // Just use the CPU product quantizer to determine sub-centroids
  faiss::ProductQuantizer pq(this->d, subQuantizers_, bitsPerCode_);
  pq.verbose = this->verbose;
  pq.train(n, residuals.data());

  index_ = new IVFPQ(resources_,
                     quantizer_->getGpuData(),
                     subQuantizers_,
                     bitsPerCode_,
                     pq.centroids.data(),
                     ivfpqConfig_.indicesOptions,
                     ivfpqConfig_.useFloat16LookupTables,
                     memorySpace_);
  // Apply any reservation requested before training
  if (reserveMemoryVecs_) {
    index_->reserveMemory(reserveMemoryVecs_);
  }

  index_->setPrecomputedCodes(ivfpqConfig_.usePrecomputedTables);
}
300 
// Train the coarse quantizer and then the residual product quantizer on the
// given vectors. A no-op (beyond consistency asserts) if already trained.
void
GpuIndexIVFPQ::train(Index::idx_t n, const float* x) {
  DeviceScope scope(device_);

  if (this->is_trained) {
    // Already trained: the quantizer and GPU structures must be consistent
    FAISS_ASSERT(quantizer_->is_trained);
    FAISS_ASSERT(quantizer_->ntotal == nlist_);
    FAISS_ASSERT(index_);
    return;
  }

  FAISS_ASSERT(!index_);

  trainQuantizer_(n, x);
  trainResidualQuantizer_(n, x);

  FAISS_ASSERT(index_);

  this->is_trained = true;
}
321 
// Called from GpuIndex for add/add_with_ids; x and xids are already resident
// on the GPU and the device is already set by the caller.
void
GpuIndexIVFPQ::addImpl_(int n,
                        const float* x,
                        const Index::idx_t* xids) {
  // Device is already set in GpuIndex::add
  FAISS_ASSERT(index_);
  FAISS_ASSERT(n > 0);

  // Data is already resident on the GPU
  Tensor<float, 2, true> data(const_cast<float*>(x), {n, (int) this->d});

  static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
  Tensor<long, 1, true> labels(const_cast<long*>(xids), {n});

  // Not all vectors may be able to be added (some may contain NaNs etc)
  index_->classifyAndAddVectors(data, labels);

  // but keep the ntotal based on the total number of vectors that we attempted
  // to add
  ntotal += n;
}
343 
// Called from GpuIndex for search; all pointers are already resident on the
// GPU and the device is already set by the caller. Writes n*k distances and
// labels.
void
GpuIndexIVFPQ::searchImpl_(int n,
                           const float* x,
                           int k,
                           float* distances,
                           Index::idx_t* labels) const {
  // Device is already set in GpuIndex::search
  FAISS_ASSERT(index_);
  FAISS_ASSERT(n > 0);

  // Data is already resident on the GPU
  Tensor<float, 2, true> queries(const_cast<float*>(x), {n, (int) this->d});
  Tensor<float, 2, true> outDistances(distances, {n, k});

  static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
  Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});

  index_->query(queries, nprobe_, k, outDistances, outLabels);
}
363 
// Return the number of vectors currently stored in a particular inverted list.
int
GpuIndexIVFPQ::getListLength(int listId) const {
  FAISS_ASSERT(index_);
  return index_->getListLength(listId);
}
369 
// Return the PQ codes of a particular inverted list, copied back to the CPU.
std::vector<unsigned char>
GpuIndexIVFPQ::getListCodes(int listId) const {
  FAISS_ASSERT(index_);
  DeviceScope scope(device_);

  return index_->getListCodes(listId);
}
377 
// Return the vector indices of a particular inverted list, copied back to the
// CPU.
std::vector<long>
GpuIndexIVFPQ::getListIndices(int listId) const {
  FAISS_ASSERT(index_);
  DeviceScope scope(device_);

  return index_->getListIndices(listId);
}
385 
// Validate the current configuration against the restrictions of the GPU
// implementation, throwing a descriptive error on violation: list count,
// bits per code, dimension divisibility, supported code lengths, shared
// memory capacity for the distance lookup tables, and (without precomputed
// tables) supported dims-per-sub-quantizer.
void
GpuIndexIVFPQ::verifySettings_() const {
  // Our implementation has these restrictions:

  // Must have some number of lists
  FAISS_THROW_IF_NOT_MSG(nlist_ > 0, "nlist must be >0");

  // up to a single byte per code
  FAISS_THROW_IF_NOT_FMT(bitsPerCode_ <= 8,
                         "Bits per code must be <= 8 (passed %d)", bitsPerCode_);

  // Sub-quantizers must evenly divide dimensions available
  FAISS_THROW_IF_NOT_FMT(this->d % subQuantizers_ == 0,
                         "Number of sub-quantizers (%d) must be an "
                         "even divisor of the number of dimensions (%d)",
                         subQuantizers_, this->d);

  // The number of bytes per encoded vector must be one we support
  FAISS_THROW_IF_NOT_FMT(IVFPQ::isSupportedPQCodeLength(subQuantizers_),
                         "Number of bytes per encoded vector / sub-quantizers (%d) "
                         "is not supported",
                         subQuantizers_);

  // We must have enough shared memory on the current device to store
  // our lookup distances
  int lookupTableSize = sizeof(float);
#ifdef FAISS_USE_FLOAT16
  if (ivfpqConfig_.useFloat16LookupTables) {
    lookupTableSize = sizeof(half);
  }
#endif

  // 64 bytes per code is only supported with usage of float16, at 2^8
  // codes per subquantizer
  size_t requiredSmemSize =
    lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_);
  size_t smemPerBlock = getMaxSharedMemPerBlock(device_);

  FAISS_THROW_IF_NOT_FMT(requiredSmemSize
                         <= getMaxSharedMemPerBlock(device_),
                         "Device %d has %zu bytes of shared memory, while "
                         "%d bits per code and %d sub-quantizers requires %zu "
                         "bytes. Consider useFloat16LookupTables and/or "
                         "reduce parameters",
                         device_, smemPerBlock, bitsPerCode_, subQuantizers_,
                         requiredSmemSize);

  // If precomputed codes are disabled, we have an extra limitation in
  // terms of the number of dimensions per subquantizer
  FAISS_THROW_IF_NOT_FMT(ivfpqConfig_.usePrecomputedTables ||
                         IVFPQ::isSupportedNoPrecomputedSubDimSize(
                           this->d / subQuantizers_),
                         "Number of dimensions per sub-quantizer (%d) "
                         "is not currently supported without precomputed codes. "
                         "Only 1, 2, 3, 4, 6, 8, 10, 12, 16, 20, 24, 28, 32 dims "
                         "per sub-quantizer are currently supported with no "
                         "precomputed codes. "
                         "Precomputed codes supports any number of dimensions, but "
                         "will involve memory overheads.",
                         this->d / subQuantizers_);

  // TODO: fully implement METRIC_INNER_PRODUCT
  FAISS_THROW_IF_NOT_MSG(this->metric_type == faiss::METRIC_L2,
                         "METRIC_INNER_PRODUCT is currently unsupported");
}
451 
452 } } // namespace
std::vector< long > getListIndices(int listId) const
void precompute_table()
build precomputed table
Definition: IndexIVFPQ.cpp:363
size_t nbits
number of bits per quantization index
simple (default) implementation as an array of inverted lists
PolysemousTraining * polysemous_training
if NULL, use default
Definition: IndexIVFPQ.h:40
void addImpl_(int n, const float *x, const Index::idx_t *ids) override
Called from GpuIndex for add/add_with_ids.
virtual const idx_t * get_ids(size_t list_no) const =0
GpuIndexIVFPQ(GpuResources *resources, const faiss::IndexIVFPQ *index, GpuIndexIVFPQConfig config=GpuIndexIVFPQConfig())
int getListLength(int listId) const
Definition: IVFBase.cu:198
FlatIndex * getGpuData()
For internal access.
Definition: GpuIndexFlat.h:99
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:34
virtual size_t list_size(size_t list_no) const =0
get the size of a list
void reserveMemory(size_t numVecs)
Reserve GPU memory in our inverted lists for this number of vectors.
Definition: IVFBase.cu:43
int getListLength(int listId) const
bool do_polysemous_training
reorder PQ centroids after training?
Definition: IndexIVFPQ.h:39
size_t scan_table_threshold
use table computation or on-the-fly?
Definition: IndexIVFPQ.h:43
std::vector< float > precomputed_table
Definition: IndexIVFPQ.h:59
int polysemous_ht
Hamming thresh for polysemous filtering.
Definition: IndexIVFPQ.h:44
int getBitsPerCode() const
Return the number of bits per PQ code.
virtual cudaStream_t getDefaultStream(int device)=0
int d
vector dimension
Definition: Index.h:66
long idx_t
all indices are this type
Definition: Index.h:62
void train(Index::idx_t n, const float *x) override
static bool isSupportedPQCodeLength(int size)
Returns true if we support PQ in this size.
Definition: IVFPQ.cu:70
int nprobe_
Number of inverted list probes per query.
Definition: GpuIndexIVF.h:84
void reserveMemory(size_t numVecs)
Reserve GPU memory in our inverted lists for this number of vectors.
int classifyAndAddVectors(Tensor< float, 2, true > &vecs, Tensor< long, 1, true > &indices)
Definition: IVFPQ.cu:118
void query(Tensor< float, 2, true > &queries, int nprobe, int k, Tensor< float, 2, true > &outDistances, Tensor< long, 2, true > &outIndices)
Definition: IVFPQ.cu:516
const int device_
The GPU device we are resident on.
Definition: GpuIndex.h:126
void copyFrom(const faiss::IndexIVFPQ *index)
Tensor< float, 3, true > getPQCentroids()
Definition: IVFPQ.cu:591
GpuResources * resources_
Manages streams, cuBLAS handles and scratch memory for devices.
Definition: GpuIndex.h:123
void copyTo(faiss::IndexIVF *index) const
Copy what we have to the CPU equivalent.
Definition: GpuIndexIVF.cu:153
void replace_invlists(InvertedLists *il, bool own=false)
replace the inverted lists, old one is deallocated if own_invlists
Definition: IndexIVF.cpp:735
int nlist_
Number of inverted lists that we manage.
Definition: GpuIndexIVF.h:81
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:67
void searchImpl_(int n, const float *x, int k, float *distances, Index::idx_t *labels) const override
Called from GpuIndex for search.
bool verbose
verbosity level
Definition: Index.h:68
void setPrecomputedCodes(bool enable)
Enable or disable pre-computed codes.
Definition: IVFPQ.cu:100
std::vector< unsigned char > getListCodes(int listId) const
Return the list codes of a particular list back to the CPU.
Definition: IVFPQ.cu:583
void copyTo(faiss::IndexIVFPQ *index) const
Our tensor type.
Definition: Tensor.cuh:28
size_t nlist
number of possible key values
Definition: InvertedLists.h:34
const MemorySpace memorySpace_
The memory space of our primary storage on the GPU.
Definition: GpuIndex.h:129
bool by_residual
Encode residual or plain vector?
Definition: IndexIVFPQ.h:35
GpuIndexFlat * quantizer_
Quantizer for inverted lists.
Definition: GpuIndexIVF.h:87
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:74
ProductQuantizer pq
produces the codes
Definition: IndexIVFPQ.h:37
InvertedLists * invlists
Acess to the actual data.
Definition: IndexIVF.h:92
size_t M
number of subquantizers
int getNumSubQuantizers() const
Return the number of sub-quantizers we are using.
std::vector< long > getListIndices(int listId) const
Return the list indices of a particular list back to the CPU.
Definition: IVFBase.cu:205
int getCentroidsPerSubQuantizer() const
Return the number of centroids per PQ code (2^bits per code)
void setPrecomputedCodes(bool enable)
Enable or disable pre-computed codes.
virtual const uint8_t * get_codes(size_t list_no) const =0
void copyFrom(const faiss::IndexIVF *index)
Copy what we need from the CPU equivalent.
Definition: GpuIndexIVF.cu:79
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:71
void compute_residual(const float *x, float *residual, idx_t key) const
Definition: Index.cpp:86
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVF.h:98
bool getPrecomputedCodes() const
Are pre-computed codes enabled?
IndicesOptions indicesOptions
Index storage options for the GPU.
Definition: GpuIndexIVF.h:29
size_t reclaimMemory()
Definition: IVFBase.cu:103
Implementing class for IVFPQ on the GPU.
Definition: IVFPQ.cuh:17
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:95
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:44
int use_precomputed_table
if by_residual, build precompute tables
Definition: IndexIVFPQ.h:54
std::vector< unsigned char > getListCodes(int listId) const
std::vector< float > centroids
Centroid table, size M * ksub * dsub.
static bool isSupportedNoPrecomputedSubDimSize(int dims)
Definition: IVFPQ.cu:95