Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/tmp/faiss/IndexIVF.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // -*- c++ -*-
10 
#include "IndexIVF.h"

#include <cstdio>
#include <memory>

#include "utils.h"
#include "hamming.h"

#include "FaissAssert.h"
#include "IndexFlat.h"
#include "AuxIndexStructures.h"
21 
22 namespace faiss {
23 
24 using ScopedIds = InvertedLists::ScopedIds;
25 using ScopedCodes = InvertedLists::ScopedCodes;
26 
27 /*****************************************
28  * Level1Quantizer implementation
29  ******************************************/
30 
31 
32 Level1Quantizer::Level1Quantizer (Index * quantizer, size_t nlist):
33  quantizer (quantizer),
34  nlist (nlist),
35  quantizer_trains_alone (0),
36  own_fields (false),
37  clustering_index (nullptr)
38 {
39  // here we set a low # iterations because this is typically used
40  // for large clusterings (nb this is not used for the MultiIndex,
41  // for which quantizer_trains_alone = true)
42  cp.niter = 10;
43 }
44 
45 Level1Quantizer::Level1Quantizer ():
46  quantizer (nullptr),
47  nlist (0),
48  quantizer_trains_alone (0), own_fields (false),
49  clustering_index (nullptr)
50 {}
51 
52 Level1Quantizer::~Level1Quantizer ()
53 {
54  if (own_fields) delete quantizer;
55 }
56 
57 void Level1Quantizer::train_q1 (size_t n, const float *x, bool verbose, MetricType metric_type)
58 {
59  size_t d = quantizer->d;
60  if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
61  if (verbose)
62  printf ("IVF quantizer does not need training.\n");
63  } else if (quantizer_trains_alone == 1) {
64  if (verbose)
65  printf ("IVF quantizer trains alone...\n");
66  quantizer->train (n, x);
67  quantizer->verbose = verbose;
68  FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
69  "nlist not consistent with quantizer size");
70  } else if (quantizer_trains_alone == 0) {
71  if (verbose)
72  printf ("Training level-1 quantizer on %ld vectors in %ldD\n",
73  n, d);
74 
75  Clustering clus (d, nlist, cp);
76  quantizer->reset();
77  if (clustering_index) {
78  clus.train (n, x, *clustering_index);
79  quantizer->add (nlist, clus.centroids.data());
80  } else {
81  clus.train (n, x, *quantizer);
82  }
83  quantizer->is_trained = true;
84  } else if (quantizer_trains_alone == 2) {
85  if (verbose)
86  printf (
87  "Training L2 quantizer on %ld vectors in %ldD%s\n",
88  n, d,
89  clustering_index ? "(user provided index)" : "");
90  FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
91  Clustering clus (d, nlist, cp);
92  if (!clustering_index) {
93  IndexFlatL2 assigner (d);
94  clus.train(n, x, assigner);
95  } else {
96  clus.train(n, x, *clustering_index);
97  }
98  if (verbose)
99  printf ("Adding centroids to quantizer\n");
100  quantizer->add (nlist, clus.centroids.data());
101  }
102 }
103 
104 
105 
106 /*****************************************
107  * IndexIVF implementation
108  ******************************************/
109 
110 
111 IndexIVF::IndexIVF (Index * quantizer, size_t d,
112  size_t nlist, size_t code_size,
113  MetricType metric):
114  Index (d, metric),
115  Level1Quantizer (quantizer, nlist),
116  invlists (new ArrayInvertedLists (nlist, code_size)),
117  own_invlists (true),
118  code_size (code_size),
119  nprobe (1),
120  max_codes (0),
121  maintain_direct_map (false)
122 {
123  FAISS_THROW_IF_NOT (d == quantizer->d);
124  is_trained = quantizer->is_trained && (quantizer->ntotal == nlist);
125  // Spherical by default if the metric is inner_product
126  if (metric_type == METRIC_INNER_PRODUCT) {
127  cp.spherical = true;
128  }
129 
130 }
131 
132 IndexIVF::IndexIVF ():
133  invlists (nullptr), own_invlists (false),
134  code_size (0),
135  nprobe (1), max_codes (0),
136  maintain_direct_map (false)
137 {}
138 
139 void IndexIVF::add (idx_t n, const float * x)
140 {
141  add_with_ids (n, x, nullptr);
142 }
143 
144 void IndexIVF::make_direct_map (bool new_maintain_direct_map)
145 {
146  // nothing to do
147  if (new_maintain_direct_map == maintain_direct_map)
148  return;
149 
150  if (new_maintain_direct_map) {
151  direct_map.resize (ntotal, -1);
152  for (size_t key = 0; key < nlist; key++) {
153  size_t list_size = invlists->list_size (key);
154  ScopedIds idlist (invlists, key);
155 
156  for (long ofs = 0; ofs < list_size; ofs++) {
157  FAISS_THROW_IF_NOT_MSG (
158  0 <= idlist [ofs] && idlist[ofs] < ntotal,
159  "direct map supported only for seuquential ids");
160  direct_map [idlist [ofs]] = key << 32 | ofs;
161  }
162  }
163  } else {
164  direct_map.clear ();
165  }
166  maintain_direct_map = new_maintain_direct_map;
167 }
168 
169 
170 void IndexIVF::search (idx_t n, const float *x, idx_t k,
171  float *distances, idx_t *labels) const
172 {
173  long * idx = new long [n * nprobe];
174  ScopeDeleter<long> del (idx);
175  float * coarse_dis = new float [n * nprobe];
176  ScopeDeleter<float> del2 (coarse_dis);
177 
178  quantizer->search (n, x, nprobe, coarse_dis, idx);
179 
180  invlists->prefetch_lists (idx, n * nprobe);
181 
182  search_preassigned (n, x, k, idx, coarse_dis,
183  distances, labels, false);
184 
185 }
186 
187 
188 
189 void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
190  const idx_t *keys,
191  const float *coarse_dis ,
192  float *distances, idx_t *labels,
193  bool store_pairs,
194  const IVFSearchParameters *params) const
195 {
196  long nprobe = params ? params->nprobe : this->nprobe;
197  long max_codes = params ? params->max_codes : this->max_codes;
198 
199  size_t nlistv = 0, ndis = 0, nheap = 0;
200 
201  using HeapForIP = CMin<float, idx_t>;
202  using HeapForL2 = CMax<float, idx_t>;
203 
204 #pragma omp parallel reduction(+: nlistv, ndis, nheap)
205  {
206  InvertedListScanner *scanner = get_InvertedListScanner(store_pairs);
208 #pragma omp for
209  for (size_t i = 0; i < n; i++) {
210  // loop over queries
211  const float * xi = x + i * d;
212  scanner->set_query (xi);
213  const long * keysi = keys + i * nprobe;
214  float * simi = distances + i * k;
215  long * idxi = labels + i * k;
216 
217  if (metric_type == METRIC_INNER_PRODUCT) {
218  heap_heapify<HeapForIP> (k, simi, idxi);
219  } else {
220  heap_heapify<HeapForL2> (k, simi, idxi);
221  }
222 
223  long nscan = 0;
224 
225  // loop over probes
226  for (size_t ik = 0; ik < nprobe; ik++) {
227  long key = keysi[ik]; /* select the list */
228  if (key < 0) {
229  // not enough centroids for multiprobe
230  continue;
231  }
232  FAISS_THROW_IF_NOT_FMT (key < (long) nlist,
233  "Invalid key=%ld at ik=%ld nlist=%ld\n",
234  key, ik, nlist);
235 
236 
237  size_t list_size = invlists->list_size(key);
238 
239  // don't waste time on empty lists
240  if (list_size == 0) {
241  continue;
242  }
243 
244  scanner->set_list (key, coarse_dis[i * nprobe + ik]);
245 
246  nlistv++;
247 
248 
249  InvertedLists::ScopedCodes scodes (invlists, key);
250  const Index::idx_t * ids = store_pairs ? nullptr :
251  invlists->get_ids (key);
252 
253  nheap += scanner->scan_codes (list_size, scodes.get(),
254  ids, simi, idxi, k);
255 
256  if (ids) {
257  invlists->release_ids (ids);
258  }
259 
260  nscan += list_size;
261  if (max_codes && nscan >= max_codes)
262  break;
263  }
264 
265  ndis += nscan;
266  if (metric_type == METRIC_INNER_PRODUCT) {
267  heap_reorder<HeapForIP> (k, simi, idxi);
268  } else {
269  heap_reorder<HeapForL2> (k, simi, idxi);
270  }
271 
272  } // parallel for
273  } // parallel
274 
275  indexIVF_stats.nq += n;
276  indexIVF_stats.nlist += nlistv;
277  indexIVF_stats.ndis += ndis;
278  indexIVF_stats.nheap_updates += nheap;
279 
280 }
281 
282 
283 
284 void IndexIVF::reconstruct (idx_t key, float* recons) const
285 {
286  FAISS_THROW_IF_NOT_MSG (direct_map.size() == ntotal,
287  "direct map is not initialized");
288  long list_no = direct_map[key] >> 32;
289  long offset = direct_map[key] & 0xffffffff;
290  reconstruct_from_offset (list_no, offset, recons);
291 }
292 
293 
294 void IndexIVF::reconstruct_n (idx_t i0, idx_t ni, float* recons) const
295 {
296  FAISS_THROW_IF_NOT (ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));
297 
298  for (long list_no = 0; list_no < nlist; list_no++) {
299  size_t list_size = invlists->list_size (list_no);
300  ScopedIds idlist (invlists, list_no);
301 
302  for (long offset = 0; offset < list_size; offset++) {
303  long id = idlist[offset];
304  if (!(id >= i0 && id < i0 + ni)) {
305  continue;
306  }
307 
308  float* reconstructed = recons + (id - i0) * d;
309  reconstruct_from_offset (list_no, offset, reconstructed);
310  }
311  }
312 }
313 
314 
315 void IndexIVF::search_and_reconstruct (idx_t n, const float *x, idx_t k,
316  float *distances, idx_t *labels,
317  float *recons) const
318 {
319  long * idx = new long [n * nprobe];
320  ScopeDeleter<long> del (idx);
321  float * coarse_dis = new float [n * nprobe];
322  ScopeDeleter<float> del2 (coarse_dis);
323 
324  quantizer->search (n, x, nprobe, coarse_dis, idx);
325 
326  invlists->prefetch_lists (idx, n * nprobe);
327 
328  // search_preassigned() with `store_pairs` enabled to obtain the list_no
329  // and offset into `codes` for reconstruction
330  search_preassigned (n, x, k, idx, coarse_dis,
331  distances, labels, true /* store_pairs */);
332  for (idx_t i = 0; i < n; ++i) {
333  for (idx_t j = 0; j < k; ++j) {
334  idx_t ij = i * k + j;
335  idx_t key = labels[ij];
336  float* reconstructed = recons + ij * d;
337  if (key < 0) {
338  // Fill with NaNs
339  memset(reconstructed, -1, sizeof(*reconstructed) * d);
340  } else {
341  int list_no = key >> 32;
342  int offset = key & 0xffffffff;
343 
344  // Update label to the actual id
345  labels[ij] = invlists->get_single_id (list_no, offset);
346 
347  reconstruct_from_offset (list_no, offset, reconstructed);
348  }
349  }
350  }
351 }
352 
354  long /*list_no*/,
355  long /*offset*/,
356  float* /*recons*/) const {
357  FAISS_THROW_MSG ("reconstruct_from_offset not implemented");
358 }
359 
361 {
362  direct_map.clear ();
363  invlists->reset ();
364  ntotal = 0;
365 }
366 
367 
369 {
370  FAISS_THROW_IF_NOT_MSG (!maintain_direct_map,
371  "direct map remove not implemented");
372 
373  std::vector<long> toremove(nlist);
374 
375 #pragma omp parallel for
376  for (long i = 0; i < nlist; i++) {
377  long l0 = invlists->list_size (i), l = l0, j = 0;
378  ScopedIds idsi (invlists, i);
379  while (j < l) {
380  if (sel.is_member (idsi[j])) {
381  l--;
382  invlists->update_entry (
383  i, j,
384  invlists->get_single_id (i, l),
385  ScopedCodes (invlists, i, l).get());
386  } else {
387  j++;
388  }
389  }
390  toremove[i] = l0 - l;
391  }
392  // this will not run well in parallel on ondisk because of possible shrinks
393  long nremove = 0;
394  for (long i = 0; i < nlist; i++) {
395  if (toremove[i] > 0) {
396  nremove += toremove[i];
397  invlists->resize(
398  i, invlists->list_size(i) - toremove[i]);
399  }
400  }
401  ntotal -= nremove;
402  return nremove;
403 }
404 
405 
406 
407 
408 void IndexIVF::train (idx_t n, const float *x)
409 {
410  if (verbose)
411  printf ("Training level-1 quantizer\n");
412 
413  train_q1 (n, x, verbose, metric_type);
414 
415  if (verbose)
416  printf ("Training IVF residual\n");
417 
418  train_residual (n, x);
419  is_trained = true;
420 
421 }
422 
423 void IndexIVF::train_residual(idx_t /*n*/, const float* /*x*/) {
424  if (verbose)
425  printf("IndexIVF: no residual training\n");
426  // does nothing by default
427 }
428 
429 
430 
432 {
433  std::vector<int> hist (nlist);
434  for (int i = 0; i < nlist; i++) {
435  hist[i] = invlists->list_size(i);
436  }
437  return faiss::imbalance_factor (nlist, hist.data());
438 }
439 
441 {
442  std::vector<int> sizes(40);
443  for (int i = 0; i < nlist; i++) {
444  for (int j = 0; j < sizes.size(); j++) {
445  if ((invlists->list_size(i) >> j) == 0) {
446  sizes[j]++;
447  break;
448  }
449  }
450  }
451  for (int i = 0; i < sizes.size(); i++) {
452  if (sizes[i]) {
453  printf ("list size in < %d: %d instances\n",
454  1 << i, sizes[i]);
455  }
456  }
457 
458 }
459 
460 
462 {
463  // minimal sanity checks
464  FAISS_THROW_IF_NOT (other.d == d);
465  FAISS_THROW_IF_NOT (other.nlist == nlist);
466  FAISS_THROW_IF_NOT (other.code_size == code_size);
467  FAISS_THROW_IF_NOT_MSG (typeid (*this) == typeid (other),
468  "can only merge indexes of the same type");
469 }
470 
471 
472 void IndexIVF::merge_from (IndexIVF &other, idx_t add_id)
473 {
475  FAISS_THROW_IF_NOT_MSG ((!maintain_direct_map &&
476  !other.maintain_direct_map),
477  "direct map copy not implemented");
478 
479  invlists->merge_from (other.invlists, add_id);
480 
481  ntotal += other.ntotal;
482  other.ntotal = 0;
483 }
484 
485 
487 {
488  //FAISS_THROW_IF_NOT (ntotal == 0);
489  FAISS_THROW_IF_NOT (il->nlist == nlist &&
490  il->code_size == code_size);
491  if (own_invlists) {
492  delete invlists;
493  }
494  invlists = il;
495  own_invlists = own;
496 }
497 
498 
499 void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
500  long a1, long a2) const
501 {
502 
503  FAISS_THROW_IF_NOT (nlist == other.nlist);
504  FAISS_THROW_IF_NOT (code_size == other.code_size);
505  FAISS_THROW_IF_NOT (!other.maintain_direct_map);
506  FAISS_THROW_IF_NOT_FMT (
507  subset_type == 0 || subset_type == 1 || subset_type == 2,
508  "subset type %d not implemented", subset_type);
509 
510  size_t accu_n = 0;
511  size_t accu_a1 = 0;
512  size_t accu_a2 = 0;
513 
514  InvertedLists *oivf = other.invlists;
515 
516  for (long list_no = 0; list_no < nlist; list_no++) {
517  size_t n = invlists->list_size (list_no);
518  ScopedIds ids_in (invlists, list_no);
519 
520  if (subset_type == 0) {
521  for (long i = 0; i < n; i++) {
522  idx_t id = ids_in[i];
523  if (a1 <= id && id < a2) {
524  oivf->add_entry (list_no,
525  invlists->get_single_id (list_no, i),
526  ScopedCodes (invlists, list_no, i).get());
527  other.ntotal++;
528  }
529  }
530  } else if (subset_type == 1) {
531  for (long i = 0; i < n; i++) {
532  idx_t id = ids_in[i];
533  if (id % a1 == a2) {
534  oivf->add_entry (list_no,
535  invlists->get_single_id (list_no, i),
536  ScopedCodes (invlists, list_no, i).get());
537  other.ntotal++;
538  }
539  }
540  } else if (subset_type == 2) {
541  // see what is allocated to a1 and to a2
542  size_t next_accu_n = accu_n + n;
543  size_t next_accu_a1 = next_accu_n * a1 / ntotal;
544  size_t i1 = next_accu_a1 - accu_a1;
545  size_t next_accu_a2 = next_accu_n * a2 / ntotal;
546  size_t i2 = next_accu_a2 - accu_a2;
547 
548  for (long i = i1; i < i2; i++) {
549  oivf->add_entry (list_no,
550  invlists->get_single_id (list_no, i),
551  ScopedCodes (invlists, list_no, i).get());
552  }
553 
554  other.ntotal += i2 - i1;
555  accu_a1 = next_accu_a1;
556  accu_a2 = next_accu_a2;
557  }
558  accu_n += n;
559  }
560  FAISS_ASSERT(accu_n == ntotal);
561 
562 }
563 
564 
565 IndexIVF::~IndexIVF()
566 {
567  if (own_invlists) {
568  delete invlists;
569  }
570 }
571 
572 
573 void IndexIVFStats::reset()
574 {
575  memset ((void*)this, 0, sizeof (*this));
576 }
577 
578 
579 IndexIVFStats indexIVF_stats;
580 
581 
582 } // namespace faiss
virtual void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const
Definition: IndexIVF.cpp:189
simple (default) implementation as an array of inverted lists
void check_compatible_for_merge(const IndexIVF &other) const
Definition: IndexIVF.cpp:461
virtual const idx_t * get_ids(size_t list_no) const =0
double imbalance_factor() const
1 = perfectly balanced, >1: imbalanced
Definition: IndexIVF.cpp:431
void search_and_reconstruct(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, float *recons) const override
Definition: IndexIVF.cpp:315
virtual void reset()=0
removes all elements from the database.
virtual void copy_subset_to(IndexIVF &other, int subset_type, long a1, long a2) const
Definition: IndexIVF.cpp:499
virtual void reconstruct_from_offset(long list_no, long offset, float *recons) const
Definition: IndexIVF.cpp:353
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:98
virtual size_t list_size(size_t list_no) const =0
get the size of a list
void reconstruct(idx_t key, float *recons) const override
Definition: IndexIVF.cpp:284
virtual void train(idx_t n, const float *x)
Definition: Index.cpp:24
virtual void add_with_ids(idx_t n, const float *x, const long *xids)
Definition: Index.cpp:42
virtual void train_residual(idx_t n, const float *x)
Definition: IndexIVF.cpp:423
double imbalance_factor(int n, int k, const long *assign)
a balanced assignment has a IF of 1
Definition: utils.cpp:1292
void merge_from(InvertedLists *oivf, size_t add_id)
move all entries from oivf (empty on output)
virtual idx_t get_single_id(size_t list_no, size_t offset) const
int d
vector dimension
Definition: Index.h:66
size_t code_size
code size per vector in bytes
Definition: InvertedLists.h:36
void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:408
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
Definition: IndexIVF.cpp:294
virtual void add(idx_t n, const float *x)=0
virtual void set_list(idx_t list_no, float coarse_dis)=0
following codes come from this inverted list
virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t *code)
add one entry to an inverted list
virtual size_t scan_codes(size_t n, const uint8_t *codes, const idx_t *ids, float *distances, idx_t *labels, size_t k) const =0
long idx_t
all indices are this type
Definition: Index.h:64
void replace_invlists(InvertedLists *il, bool own=false)
replace the inverted lists, old one is deallocated if own_invlists
Definition: IndexIVF.cpp:486
ClusteringParameters cp
to override default clustering params
Definition: IndexIVF.h:44
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:67
bool verbose
verbosity level
Definition: Index.h:68
void reset() override
removes all elements from the database.
Definition: IndexIVF.cpp:360
std::vector< float > centroids
centroids (k * d)
Definition: Clustering.h:62
virtual void prefetch_lists(const long *list_nos, int nlist) const
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
Index * clustering_index
to override index used during clustering
Definition: IndexIVF.h:45
void train_q1(size_t n, const float *x, bool verbose, MetricType metric_type)
Trains the level-1 (coarse) quantizer only; train_residual is called separately by IndexIVF::train.
Definition: IndexIVF.cpp:57
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:62
size_t nlist
number of possible key values
Definition: InvertedLists.h:35
void make_direct_map(bool new_maintain_direct_map=true)
Definition: IndexIVF.cpp:144
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:74
void print_stats() const
display some stats about the inverted lists
Definition: IndexIVF.cpp:440
InvertedLists * invlists
Access to the actual data.
Definition: IndexIVF.h:93
void add(idx_t n, const float *x) override
Calls add_with_ids with NULL ids.
Definition: IndexIVF.cpp:139
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
Definition: Clustering.cpp:64
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVF.h:63
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:33
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:71
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
Definition: IndexIVF.cpp:368
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVF.h:99
virtual InvertedListScanner * get_InvertedListScanner(bool store_pairs=false) const
get a scanner for this index (store_pairs means ignore labels)
Definition: IndexIVF.h:167
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
Definition: IndexIVF.h:102
bool spherical
do we want normalized centroids?
Definition: Clustering.h:28
bool own_fields
whether object owns the quantizer
Definition: IndexIVF.h:42
virtual void set_query(const float *query_vector)=0
from now on we handle this query.
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
Definition: IndexIVF.cpp:170
virtual void release_ids(const idx_t *ids) const
release ids returned by get_ids
virtual void merge_from(IndexIVF &other, idx_t add_id)
Definition: IndexIVF.cpp:472
size_t nlist
number of possible key values
Definition: IndexIVF.h:34
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:96
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:45