Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/IndexIVF.cpp
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 /* Copyright 2004-present Facebook. All Rights Reserved.
11  Inverted list structure.
12 */
13 
14 #include "IndexIVF.h"
15 
16 #include <cstdio>
17 
18 #include "utils.h"
19 #include "hamming.h"
20 
21 #include "FaissAssert.h"
22 #include "IndexFlat.h"
23 #include "AuxIndexStructures.h"
24 
25 namespace faiss {
26 
27 /*****************************************
28  * IndexIVF implementation
29  ******************************************/
30 
31 
32 IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
33  MetricType metric):
34  Index (d, metric),
35  nlist (nlist),
36  nprobe (1),
37  quantizer (quantizer),
38  quantizer_trains_alone (false),
39  own_fields (false),
40  ids (nlist),
41  maintain_direct_map (false)
42 {
43  FAISS_ASSERT (d == quantizer->d);
44  is_trained = quantizer->is_trained && (quantizer->ntotal == nlist);
45  // Spherical by default if the metric is inner_product
46  if (metric_type == METRIC_INNER_PRODUCT) {
47  cp.spherical = true;
48  }
49  // here we set a low # iterations because this is typically used
50  // for large clusterings (nb this is not used for the MultiIndex,
51  // for which quantizer_trains_alone = true)
52  cp.niter = 10;
53  cp.verbose = verbose;
54 
55 }
56 
57 IndexIVF::IndexIVF ():
58  nlist (0), nprobe (1), quantizer (nullptr),
59  quantizer_trains_alone (false), own_fields (false),
60  maintain_direct_map (false)
61 {}
62 
63 
64 void IndexIVF::add (idx_t n, const float * x)
65 {
66  add_with_ids (n, x, nullptr);
67 }
68 
70 {
71  if (maintain_direct_map) return;
72 
73  direct_map.resize (ntotal, -1);
74  for (size_t key = 0; key < nlist; key++) {
75  const std::vector<long> & idlist = ids[key];
76 
77  for (long ofs = 0; ofs < idlist.size(); ofs++) {
78  direct_map [idlist [ofs]] =
79  key << 32 | ofs;
80  }
81  }
82 
83  maintain_direct_map = true;
84 }
85 
86 
88 {
89  ntotal = 0;
90  direct_map.clear();
91  for (size_t i = 0; i < ids.size(); i++)
92  ids[i].clear();
93 }
94 
95 
96 void IndexIVF::train (idx_t n, const float *x)
97 {
98  if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
99  if (verbose)
100  printf ("IVF quantizer does not need training.\n");
101  } else if (quantizer_trains_alone) {
102  if (verbose)
103  printf ("IVF quantizer trains alone...\n");
104  quantizer->train (n, x);
105  FAISS_ASSERT (quantizer->ntotal == nlist ||
106  !"nlist not consistent with quantizer size");
107  } else {
108  if (verbose)
109  printf ("Training IVF quantizer on %ld vectors in %dD\n",
110  n, d);
111 
112  Clustering clus (d, nlist, cp);
113 
114  quantizer->reset();
115  clus.train (n, x, *quantizer);
116  quantizer->is_trained = true;
117  }
118  if (verbose)
119  printf ("Training IVF residual\n");
120 
121  train_residual (n, x);
122  is_trained = true;
123 }
124 
125 void IndexIVF::train_residual (idx_t n, const float *x)
126 {
127  if (verbose)
128  printf ("IndexIVF: no residual training\n");
129  // does nothing by default
130 }
131 
132 
133 
135 {
136  std::vector<int> hist (nlist);
137  for (int i = 0; i < nlist; i++) {
138  hist[i] = ids[i].size();
139  }
140  return faiss::imbalance_factor (nlist, hist.data());
141 }
142 
144 {
145  std::vector<int> sizes(40);
146  for (int i = 0; i < nlist; i++) {
147  for (int j = 0; j < sizes.size(); j++) {
148  if ((ids[i].size() >> j) == 0) {
149  sizes[j]++;
150  break;
151  }
152  }
153  }
154  for (int i = 0; i < sizes.size(); i++) {
155  if (sizes[i]) {
156  printf ("list size in < %d: %d instances\n",
157  1 << i, sizes[i]);
158  }
159  }
160 
161 }
162 
163 void IndexIVF::merge_from (IndexIVF &other, idx_t add_id)
164 {
165  // minimal sanity checks
166  FAISS_ASSERT (other.d == d);
167  FAISS_ASSERT (other.nlist == nlist);
168  FAISS_ASSERT ((!maintain_direct_map && !other.maintain_direct_map) ||
169  !"direct map copy not implemented");
170  FAISS_ASSERT (typeid (*this) == typeid (other) ||
171  !"can only merge indexes of the same type");
172  for (long i = 0; i < nlist; i++) {
173  std::vector<idx_t> & src = other.ids[i];
174  std::vector<idx_t> & dest = ids[i];
175  for (long j = 0; j < src.size(); j++)
176  dest.push_back (src[j] + add_id);
177  src.clear();
178  }
179  merge_from_residuals (other);
180  ntotal += other.ntotal;
181  other.ntotal = 0;
182 }
183 
184 
185 
186 
187 IndexIVF::~IndexIVF()
188 {
189  if (own_fields) delete quantizer;
190 }
191 
192 
193 
194 /*****************************************
195  * IndexIVFFlat implementation
196  ******************************************/
197 
198 IndexIVFFlat::IndexIVFFlat (Index * quantizer,
199  size_t d, size_t nlist, MetricType metric):
200  IndexIVF (quantizer, d, nlist, metric)
201 {
202  vecs.resize (nlist);
203  set_typename();
204 }
205 
206 
208 {
209  std::stringstream s;
210  if (metric_type == METRIC_INNER_PRODUCT)
211  s << "IvfIP";
212  else if (metric_type == METRIC_L2)
213  s << "IvfL2";
214  else s << "??";
215  s << "[" << nlist << ":" << quantizer->index_typename << "]";
216  index_typename = s.str();
217 }
218 
219 
220 
221 
222 
223 
224 void IndexIVFFlat::add_with_ids (idx_t n, const float * x, const long *xids)
225 {
226  add_core (n, x, xids, nullptr);
227 }
228 
229 void IndexIVFFlat::add_core (idx_t n, const float * x, const long *xids,
230  const long *precomputed_idx)
231 
232 {
233  FAISS_ASSERT (is_trained);
234  const long * idx;
235 
236  if (precomputed_idx) {
237  idx = precomputed_idx;
238  } else {
239  long * idx0 = new long [n];
240  quantizer->assign (n, x, idx0);
241  idx = idx0;
242  }
243  long n_add = 0;
244  for (size_t i = 0; i < n; i++) {
245  long id = xids ? xids[i] : ntotal + i;
246  long list_no = idx [i];
247  if (list_no < 0)
248  continue;
249  FAISS_ASSERT (list_no < nlist);
250 
251  ids[list_no].push_back (id);
252  const float *xi = x + i * d;
253  /* store the vectors */
254  for (size_t j = 0 ; j < d ; j++)
255  vecs[list_no].push_back (xi [j]);
256 
258  direct_map.push_back (list_no << 32 | (ids[list_no].size() - 1));
259  n_add++;
260  }
261  if (verbose) {
262  printf("IndexIVFFlat::add_core: added %ld / %ld vectors\n",
263  n_add, n);
264  }
265  if (!precomputed_idx)
266  delete [] idx;
267  ntotal += n_add;
268 }
269 
270 
271 
272 
274  size_t nx,
275  const float * x,
276  const long * __restrict keys,
277  float_minheap_array_t * res) const
278 {
279 
280  const size_t k = res->k;
281 
282 #pragma omp parallel for
283  for (size_t i = 0; i < nx; i++) {
284  const float * xi = x + i * d;
285  const long * keysi = keys + i * nprobe;
286  float * __restrict simi = res->get_val (i);
287  long * __restrict idxi = res->get_ids (i);
288  minheap_heapify (k, simi, idxi);
289 
290  for (size_t ik = 0; ik < nprobe; ik++) {
291  long key = keysi[ik]; /* select the list */
292  if (key < 0) {
293  // not enough centroids for multiprobe
294  continue;
295  }
296  if (key >= (long) nlist) {
297  fprintf (stderr, "Invalid key=%ld at ik=%ld nlist=%ld\n",
298  key, ik, nlist);
299  throw;
300  }
301 
302  const size_t list_size = ids[key].size();
303  const float * list_vecs = vecs[key].data();
304 
305  for (size_t j = 0; j < list_size; j++) {
306  const float * yj = list_vecs + d * j;
307  float ip = fvec_inner_product (xi, yj, d);
308  if (ip > simi[0]) {
309  minheap_pop (k, simi, idxi);
310  minheap_push (k, simi, idxi, ip, ids[key][j]);
311  }
312  }
313  }
314  minheap_reorder (k, simi, idxi);
315  }
316 }
317 
318 
320  size_t nx,
321  const float * x,
322  const long * __restrict keys,
323  float_maxheap_array_t * res) const
324 {
325  const size_t k = res->k;
326 
327 #pragma omp parallel for
328  for (size_t i = 0; i < nx; i++) {
329  const float * xi = x + i * d;
330  const long * keysi = keys + i * nprobe;
331  float * __restrict disi = res->get_val (i);
332  long * __restrict idxi = res->get_ids (i);
333  maxheap_heapify (k, disi, idxi);
334 
335  for (size_t ik = 0; ik < nprobe; ik++) {
336  long key = keysi[ik]; /* select the list */
337  if (key < 0) {
338  // not enough centroids for multiprobe
339  continue;
340  }
341  if (key >= (long) nlist) {
342  fprintf (stderr, "Invalid key=%ld at ik=%ld nlist=%ld\n",
343  key, ik, nlist);
344  throw;
345  }
346 
347  const size_t list_size = ids[key].size();
348  const float * list_vecs = vecs[key].data();
349 
350  for (size_t j = 0; j < list_size; j++) {
351  const float * yj = list_vecs + d * j;
352  float disij = fvec_L2sqr (xi, yj, d);
353  if (disij < disi[0]) {
354  maxheap_pop (k, disi, idxi);
355  maxheap_push (k, disi, idxi, disij, ids[key][j]);
356  }
357  }
358  }
359  maxheap_reorder (k, disi, idxi);
360  }
361 }
362 
363 
364 void IndexIVFFlat::search (idx_t n, const float *x, idx_t k,
365  float *distances, idx_t *labels) const
366 {
367  idx_t * idx = new idx_t [n * nprobe];
368  quantizer->assign (n, x, idx, nprobe);
369 
370  if (metric_type == METRIC_INNER_PRODUCT) {
371  float_minheap_array_t res = {
372  size_t(n), size_t(k), labels, distances};
373  search_knn_inner_product (n, x, idx, &res);
374 
375  } else if (metric_type == METRIC_L2) {
376  float_maxheap_array_t res = {
377  size_t(n), size_t(k), labels, distances};
378  search_knn_L2sqr (n, x, idx, &res);
379  }
380 
381  delete [] idx;
382 }
383 
384 
385 void IndexIVFFlat::range_search (idx_t nx, const float *x, float radius,
386  RangeSearchResult *result) const
387 {
388  idx_t * keys = new idx_t [nx * nprobe];
389  quantizer->assign (nx, x, keys, nprobe);
390 
391  assert (metric_type == METRIC_L2 || !"Only L2 implemented");
392 #pragma omp parallel
393  {
394  RangeSearchPartialResult pres(result);
395 
396  for (size_t i = 0; i < nx; i++) {
397  const float * xi = x + i * d;
398  const long * keysi = keys + i * nprobe;
399 
401  pres.new_result (i);
402 
403  for (size_t ik = 0; ik < nprobe; ik++) {
404  long key = keysi[ik]; /* select the list */
405  if (key < 0 || key >= (long) nlist) {
406  fprintf (stderr, "Invalid key=%ld at ik=%ld nlist=%ld\n",
407  key, ik, nlist);
408  throw;
409  }
410 
411  const size_t list_size = ids[key].size();
412  const float * list_vecs = vecs[key].data();
413 
414  for (size_t j = 0; j < list_size; j++) {
415  const float * yj = list_vecs + d * j;
416  float disij = fvec_L2sqr (xi, yj, d);
417  if (disij < radius) {
418  qres.add (disij, ids[key][j]);
419  }
420  }
421  }
422  }
423 
424  pres.finalize ();
425  }
426  delete[] keys;
427 }
428 
430 {
431  IndexIVFFlat &other = dynamic_cast<IndexIVFFlat &> (other_in);
432  for (int i = 0; i < nlist; i++) {
433  std::vector<float> & src = other.vecs[i];
434  std::vector<float> & dest = vecs[i];
435  for (int j = 0; j < src.size(); j++)
436  dest.push_back (src[j]);
437  src.clear();
438  }
439 }
440 
441 void IndexIVFFlat::copy_subset_to (IndexIVFFlat & other, int subset_type,
442  long a1, long a2) const
443 {
444  FAISS_ASSERT (nlist == other.nlist);
445  FAISS_ASSERT (!other.maintain_direct_map);
446 
447  for (long list_no = 0; list_no < nlist; list_no++) {
448  const std::vector<idx_t> & ids_in = ids[list_no];
449  std::vector<idx_t> & ids_out = other.ids[list_no];
450  const std::vector<float> & vecs_in = vecs[list_no];
451  std::vector<float> & vecs_out = other.vecs[list_no];
452 
453  for (long i = 0; i < ids_in.size(); i++) {
454  idx_t id = ids_in[i];
455  if (subset_type == 0 && a1 <= id && id < a2) {
456  ids_out.push_back (id);
457  vecs_out.insert (vecs_out.end(),
458  vecs_in.begin() + i * d,
459  vecs_in.begin() + (i + 1) * d);
460  other.ntotal++;
461  }
462  }
463  }
464 }
465 
466 
467 
469 {
470  IndexIVF::reset();
471  for (size_t key = 0; key < nlist; key++) {
472  vecs[key].clear();
473  }
474 }
475 
477 {
478  FAISS_ASSERT (!maintain_direct_map ||
479  !"direct map remove not implemented");
480  long nremove = 0;
481 #pragma omp parallel for reduction(+: nremove)
482  for (long i = 0; i < nlist; i++) {
483  std::vector<idx_t> & idsi = ids[i];
484  float *vecsi = vecs[i].data();
485 
486  long l = idsi.size(), j = 0;
487  while (j < l) {
488  if (sel.is_member (idsi[j])) {
489  l--;
490  idsi [j] = idsi [l];
491  memmove (vecsi + j * d,
492  vecsi + l * d, d * sizeof (float));
493  } else {
494  j++;
495  }
496  }
497  if (l < idsi.size()) {
498  nremove += idsi.size() - l;
499  idsi.resize (l);
500  vecs[i].resize (l * d);
501  }
502  }
503  ntotal -= nremove;
504  return nremove;
505 }
506 
507 
508 void IndexIVFFlat::reconstruct (idx_t key, float * recons) const
509 {
510  assert (direct_map.size() == ntotal);
511  int list_no = direct_map[key] >> 32;
512  int ofs = direct_map[key] & 0xffffffff;
513  memcpy (recons, &vecs[list_no][ofs * d], d * sizeof(recons[0]));
514 }
515 
516 
517 } // namespace faiss
int niter
clustering iterations
Definition: Clustering.h:26
result structure for a single query
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils.cpp:431
void search_knn_L2sqr(size_t nx, const float *x, const long *keys, float_maxheap_array_t *res) const
Implementation of the search for the L2 metric.
Definition: IndexIVF.cpp:319
T * get_val(size_t key)
Return the list of values for a heap.
Definition: Heap.h:361
double imbalance_factor() const
1 = perfectly balanced, >1 = imbalanced
Definition: IndexIVF.cpp:134
virtual void reset()=0
removes all elements from the database.
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:48
virtual void reconstruct(idx_t key, float *recons) const override
Definition: IndexIVF.cpp:508
bool quantizer_trains_alone
just pass over the trainset to quantizer
Definition: IndexIVF.h:51
virtual void set_typename() override
Definition: IndexIVF.cpp:207
virtual void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
Definition: IndexIVF.cpp:385
void copy_subset_to(IndexIVFFlat &other, int subset_type, long a1, long a2) const
Definition: IndexIVF.cpp:441
virtual void merge_from_residuals(IndexIVF &other) override
Definition: IndexIVF.cpp:429
virtual void add_with_ids(idx_t n, const float *x, const long *xids)
Definition: Index.cpp:32
virtual void train_residual(idx_t n, const float *x)
Definition: IndexIVF.cpp:125
size_t k
allocated size per heap
Definition: Heap.h:356
double imbalance_factor(int n, int k, const long *assign)
a balanced assignment has a IF of 1
Definition: utils.cpp:1444
virtual long remove_ids(const IDSelector &sel) override
Definition: IndexIVF.cpp:476
std::vector< std::vector< long > > ids
Inverted lists for indexes.
Definition: IndexIVF.h:56
int d
vector dimension
Definition: Index.h:66
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:50
virtual void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:96
ClusteringParameters cp
to override default clustering params
Definition: IndexIVF.h:54
virtual void add_with_ids(idx_t n, const float *x, const long *xids) override
implemented for all IndexIVF* classes
Definition: IndexIVF.cpp:224
bool own_fields
whether object owns the quantizer
Definition: IndexIVF.h:52
long idx_t
all indices are this type
Definition: Index.h:64
virtual void reset() override
removes all elements from the database.
Definition: IndexIVF.cpp:468
void make_direct_map()
initialize a direct map
Definition: IndexIVF.cpp:69
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:67
bool verbose
verbosity level
Definition: Index.h:68
virtual void reset() override
removes all elements from the database.
Definition: IndexIVF.cpp:87
QueryResult & new_result(idx_t qno)
begin a new result
the entries in the buffers are split per query
virtual void merge_from_residuals(IndexIVF &other)=0
TI * get_ids(size_t key)
Corresponding identifiers.
Definition: Heap.h:364
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:74
void print_stats() const
display some stats about the inverted lists
Definition: IndexIVF.cpp:143
size_t nlist
number of possible key values
Definition: IndexIVF.h:47
virtual void add(idx_t n, const float *x) override
Adds the vectors without explicit ids by calling add_with_ids with a null id array.
Definition: IndexIVF.cpp:64
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
Definition: Clustering.cpp:67
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:71
void search_knn_inner_product(size_t nx, const float *x, const long *keys, float_minheap_array_t *res) const
Implementation of the search for the inner product metric.
Definition: IndexIVF.cpp:273
virtual void train(idx_t n, const float *x)
Definition: Index.h:92
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
Definition: IndexIVF.h:59
bool spherical
do we want normalized centroids?
Definition: Clustering.h:31
virtual void merge_from(IndexIVF &other, idx_t add_id)
Definition: IndexIVF.cpp:163
MetricType
Some algorithms support both an inner product version and an L2 search version.
Definition: Index.h:44
std::vector< std::vector< float > > vecs
Definition: IndexIVF.h:116
void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx)
same as add_with_ids, with precomputed coarse quantizer
Definition: IndexIVF.cpp:229
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
Definition: IndexIVF.cpp:364