Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/tmp/faiss/IndexIVFFlat.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // -*- c++ -*-
10 
11 #include "IndexIVFFlat.h"
12 
13 #include <cstdio>
14 
15 #include "utils.h"
16 
17 #include "FaissAssert.h"
18 #include "IndexFlat.h"
19 #include "AuxIndexStructures.h"
20 
21 
22 namespace faiss {
23 
24 
25 /*****************************************
26  * IndexIVFFlat implementation
27  ******************************************/
28 
29 IndexIVFFlat::IndexIVFFlat (Index * quantizer,
30  size_t d, size_t nlist, MetricType metric):
31  IndexIVF (quantizer, d, nlist, sizeof(float) * d, metric)
32 {
33  code_size = sizeof(float) * d;
34 }
35 
36 
37 void IndexIVFFlat::add_with_ids (idx_t n, const float * x, const long *xids)
38 {
39  add_core (n, x, xids, nullptr);
40 }
41 
// Add n vectors to the inverted lists.
//
//   n               number of vectors in x
//   x               vectors to add; vector i is read starting at x + i * d
//   xids            optional ids; when null, vector i gets id ntotal + i
//   precomputed_idx optional list number per vector; when null, the
//                   quantizer's assign() fills a temporary array instead
//
// The FAISS_THROW_IF_NOT checks below reject an untrained index and the
// combination of maintain_direct_map with caller-supplied ids.
42 void IndexIVFFlat::add_core (idx_t n, const float * x, const long *xids,
43  const long *precomputed_idx)
44 
45 {
46  FAISS_THROW_IF_NOT (is_trained);
47  assert (invlists);
48  FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids),
49  "cannot have direct map and add with ids");
    // idx points at the list number of each vector, whichever source it has
50  const long * idx;
    // NOTE(review): `del` is used below but has no visible declaration in this
    // function as shown; its declaration appears to be missing from this
    // listing. Presumably it is a scope guard that releases idx0 -- confirm
    // against the original file.
52 
53  if (precomputed_idx) {
54  idx = precomputed_idx;
55  } else {
    // no assignment supplied: compute one list number per vector
56  long * idx0 = new long [n];
57  del.set (idx0);
58  quantizer->assign (n, x, idx0);
59  idx = idx0;
60  }
    // n_add counts the vectors actually stored (used for the verbose message)
61  long n_add = 0;
62  for (size_t i = 0; i < n; i++) {
63  long id = xids ? xids[i] : ntotal + i;
64  long list_no = idx [i];
65 
    // a negative list number means the vector is not stored
66  if (list_no < 0)
67  continue;
68  const float *xi = x + i * d;
    // the raw bytes of the float vector are stored as the code
69  size_t offset = invlists->add_entry (
70  list_no, id, (const uint8_t*) xi);
71 
    // direct_map entry: list number shifted above bit 32, OR-ed with the
    // offset returned by add_entry.
    // NOTE(review): as shown this push_back is unconditional; a guarding line
    // may be missing from this listing -- confirm against the original file.
73  direct_map.push_back (list_no << 32 | offset);
74  n_add++;
75  }
76  if (verbose) {
77  printf("IndexIVFFlat::add_core: added %ld / %ld vectors\n",
78  n_add, n);
79  }
    // NOTE(review): ntotal grows by n, not n_add, so vectors skipped above are
    // counted as well -- confirm this is intended.
80  ntotal += n;
81 }
82 
83 void IndexIVFFlat::encode_vectors(idx_t n, const float* x,
84  const idx_t * /* list_nos */,
85  uint8_t * codes) const
86 {
87  memcpy (codes, x, code_size * n);
88 }
89 
90 
91 
92 namespace {
93 
94 
95 template<MetricType metric, bool store_pairs, class C>
96 struct IVFFlatScanner: InvertedListScanner {
97 
98  size_t d;
99  IVFFlatScanner(size_t d): d(d) {}
100 
101  const float *xi;
102  void set_query (const float *query) override {
103  this->xi = query;
104  }
105 
106  idx_t list_no;
107  void set_list (idx_t list_no, float /* coarse_dis */) override {
108  this->list_no = list_no;
109  }
110 
111  float distance_to_code (const uint8_t *code) const override {
112  const float *yj = (float*)code;
113  float dis = metric == METRIC_INNER_PRODUCT ?
114  fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
115  return dis;
116  }
117 
118  size_t scan_codes (size_t list_size,
119  const uint8_t *codes,
120  const idx_t *ids,
121  float *simi, idx_t *idxi,
122  size_t k) const override
123  {
124  const float *list_vecs = (const float*)codes;
125  size_t nup = 0;
126  for (size_t j = 0; j < list_size; j++) {
127  const float * yj = list_vecs + d * j;
128  float dis = metric == METRIC_INNER_PRODUCT ?
129  fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
130  if (C::cmp (simi[0], dis)) {
131  heap_pop<C> (k, simi, idxi);
132  long id = store_pairs ? (list_no << 32 | j) : ids[j];
133  heap_push<C> (k, simi, idxi, dis, id);
134  nup++;
135  }
136  }
137  return nup;
138  }
139 
140 };
141 
142 
143 } // anonymous namespace
144 
145 
146 
147 InvertedListScanner* IndexIVFFlat::get_InvertedListScanner
148  (bool store_pairs) const
149 {
150  if (metric_type == METRIC_INNER_PRODUCT) {
151  if (store_pairs) {
152  return new IVFFlatScanner<
153  METRIC_INNER_PRODUCT, true, CMin<float, long> > (d);
154  } else {
155  return new IVFFlatScanner<
156  METRIC_INNER_PRODUCT, false, CMin<float, long> >(d);
157  }
158  } else if (metric_type == METRIC_L2) {
159  if (store_pairs) {
160  return new IVFFlatScanner<
161  METRIC_L2, true, CMax<float, long> > (d);
162  } else {
163  return new IVFFlatScanner<
164  METRIC_L2, false, CMax<float, long> >(d);
165  }
166  }
167  return nullptr;
168 }
169 
170 
// Range search: for each of the nx queries, scan the nprobe inverted lists
// chosen by the quantizer and report every stored vector whose distance
// passes the radius test (L2: distance < radius; inner product: > radius).
//
//   nx      number of query vectors in x
//   x       queries; query i is read starting at x + i * d
//   radius  threshold compared against each computed distance
//   result  output structure, filled through RangeSearchPartialResult
171 void IndexIVFFlat::range_search (idx_t nx, const float *x, float radius,
172  RangeSearchResult *result) const
173 {
    // nprobe list numbers per query, written by the quantizer
174  idx_t * keys = new idx_t [nx * nprobe];
175  ScopeDeleter<idx_t> del (keys);
176  quantizer->assign (nx, x, keys, nprobe);
177 
    // NOTE(review): no worksharing directive is visible on the query loop
    // inside this parallel region, so as written every thread appears to
    // iterate over all nx queries -- confirm whether that is intended.
178 #pragma omp parallel
179  {
180  RangeSearchPartialResult pres(result);
181 
182  for (size_t i = 0; i < nx; i++) {
183  const float * xi = x + i * d;
184  const long * keysi = keys + i * nprobe;
185 
    // NOTE(review): `qres` (used below) is not declared in the visible text;
    // the line that binds it to the value of this call appears to be missing
    // from this listing -- confirm against the original file.
187  pres.new_result (i);
188 
189  for (size_t ik = 0; ik < nprobe; ik++) {
190  long key = keysi[ik]; /* select the list */
191  if (key < 0 || key >= (long) nlist) {
192  fprintf (stderr, "Invalid key=%ld at ik=%ld nlist=%ld\n",
193  key, ik, nlist);
    // NOTE(review): a bare `throw;` rethrows the exception currently being
    // handled; no handler is visible in this function, and with no active
    // exception it calls std::terminate -- confirm the intended failure mode.
194  throw;
195  }
196 
197  const size_t list_size = invlists->list_size(key);
198  InvertedLists::ScopedCodes scodes (invlists, key);
    // the stored codes are read back as float vectors of dimension d
199  const float * list_vecs = (const float*)scodes.get();
    // NOTE(review): `ids` (indexed below) is not declared in the visible
    // text; its declaration appears to be missing from this listing.
201 
202  for (size_t j = 0; j < list_size; j++) {
203  const float * yj = list_vecs + d * j;
204  if (metric_type == METRIC_L2) {
205  float disij = fvec_L2sqr (xi, yj, d);
206  if (disij < radius) {
207  qres.add (disij, ids[j]);
208  }
209  } else if (metric_type == METRIC_INNER_PRODUCT) {
210  float disij = fvec_inner_product(xi, yj, d);
211  if (disij > radius) {
212  qres.add (disij, ids[j]);
213  }
214  }
215  }
216  }
217  }
218 
    // called once per thread, after that thread's query loop
219  pres.finalize ();
220  }
221 }
222 
223 void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
224 {
225 
226  FAISS_THROW_IF_NOT (maintain_direct_map);
227  FAISS_THROW_IF_NOT (is_trained);
228  std::vector<idx_t> assign (n);
229  quantizer->assign (n, x, assign.data());
230 
231  for (size_t i = 0; i < n; i++) {
232  idx_t id = new_ids[i];
233  FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal,
234  "id to update out of range");
235  { // remove old one
236  long dm = direct_map[id];
237  long ofs = dm & 0xffffffff;
238  long il = dm >> 32;
239  size_t l = invlists->list_size (il);
240  if (ofs != l - 1) { // move l - 1 to ofs
241  long id2 = invlists->get_single_id (il, l - 1);
242  direct_map[id2] = (il << 32) | ofs;
243  invlists->update_entry (il, ofs, id2,
244  invlists->get_single_code (il, l - 1));
245  }
246  invlists->resize (il, l - 1);
247  }
248  { // insert new one
249  long il = assign[i];
250  size_t l = invlists->list_size (il);
251  long dm = (il << 32) | l;
252  direct_map[id] = dm;
253  invlists->add_entry (il, id, (const uint8_t*)(x + i * d));
254  }
255  }
256 
257 }
258 
259 void IndexIVFFlat::reconstruct_from_offset (long list_no, long offset,
260  float* recons) const
261 {
262  memcpy (recons, invlists->get_single_code (list_no, offset), code_size);
263 }
264 
265 /*****************************************
266  * IndexIVFFlatDedup implementation
267  ******************************************/
268 
269 IndexIVFFlatDedup::IndexIVFFlatDedup (
270  Index * quantizer, size_t d, size_t nlist_,
271  MetricType metric_type):
272  IndexIVFFlat (quantizer, d, nlist_, metric_type)
273 {}
274 
275 // from Python's stringobject.c
/**
 * Hash a byte buffer (scheme taken from Python's stringobject.c).
 *
 * The hash is seeded with the first byte shifted left by 7, then for every
 * byte the state is multiplied by 1000003 and XOR-ed with that byte, and
 * finally XOR-ed with the length. Arithmetic wraps modulo 2^64.
 *
 * @param bytes  buffer to hash; not dereferenced when n <= 0
 * @param n      number of bytes in the buffer
 * @return       the 64-bit hash, or 0 for an empty buffer
 */
static uint64_t hash_bytes (const uint8_t *bytes, long n) {
    // The seed reads bytes[0], so an empty (or null) buffer must be handled
    // before touching the data.
    if (n <= 0) {
        return 0;
    }
    uint64_t x = (uint64_t)(bytes[0]) << 7;
    for (long i = 0; i < n; i++) {
        x = (1000003 * x) ^ bytes[i];
    }
    x ^= n;
    return x;
}
286 
287 
288 void IndexIVFFlatDedup::train(idx_t n, const float* x)
289 {
290  std::unordered_map<uint64_t, idx_t> map;
291  float * x2 = new float [n * d];
292  ScopeDeleter<float> del (x2);
293 
294  long n2 = 0;
295  for (long i = 0; i < n; i++) {
296  uint64_t hash = hash_bytes((uint8_t *)(x + i * d), code_size);
297  if (map.count(hash) &&
298  !memcmp (x2 + map[hash] * d, x + i * d, code_size)) {
299  // is duplicate, skip
300  } else {
301  map [hash] = n2;
302  memcpy (x2 + n2 * d, x + i * d, code_size);
303  n2 ++;
304  }
305  }
306  if (verbose) {
307  printf ("IndexIVFFlatDedup::train: train on %ld points after dedup "
308  "(was %ld points)\n", n2, n);
309  }
310  IndexIVFFlat::train (n2, x2);
311 }
312 
313 
314 
316  idx_t na, const float* x, const long* xids)
317 {
    // Add na vectors, storing each distinct vector only once per inverted
    // list. A vector whose bytes match an entry already in its list is not
    // stored again; instead the pair (id of the stored entry, new id) is
    // recorded in `instances`.
    //
    //   na    number of vectors in x
    //   x     vectors to add; vector i is read starting at x + i * d
    //   xids  optional ids; when null, vector i gets id ntotal + i
318 
319  FAISS_THROW_IF_NOT (is_trained);
320  assert (invlists);
    // NOTE(review): the condition argument of this check is not visible; a
    // line appears to be missing from this listing between the macro name and
    // its message -- confirm against the original file.
321  FAISS_THROW_IF_NOT_MSG (
323  "IVFFlatDedup not implemented with direct_map");
    // one list number per vector, written by the quantizer
324  long * idx = new long [na];
325  ScopeDeleter<long> del (idx);
326  quantizer->assign (na, x, idx);
327 
328  long n_add = 0, n_dup = 0;
329  // TODO make an omp loop with this
330  for (size_t i = 0; i < na; i++) {
331  idx_t id = xids ? xids[i] : ntotal + i;
332  long list_no = idx [i];
333 
    // a negative list number means the vector is skipped entirely
334  if (list_no < 0) {
335  continue;
336  }
337  const float *xi = x + i * d;
338 
339  // search the list for an entry whose code bytes equal this vector
340  InvertedLists::ScopedCodes codes (invlists, list_no);
341 
342  long n = invlists->list_size (list_no);
    // offset of the first matching entry, or -1 when none matches
343  long offset = -1;
344  for (long o = 0; o < n; o++) {
345  if (!memcmp (codes.get() + o * code_size,
346  xi, code_size)) {
347  offset = o;
348  break;
349  }
350  }
351 
352  if (offset == -1) { // not found
353  invlists->add_entry (list_no, id, (const uint8_t*) xi);
354  } else {
355  // mark equivalence
356  idx_t id2 = invlists->get_single_id (list_no, offset);
357  std::pair<idx_t, idx_t> pair (id2, id);
358  instances.insert (pair);
359  n_dup ++;
360  }
    // n_add counts stored vectors and duplicates alike
361  n_add++;
362  }
363  if (verbose) {
364  printf("IndexIVFFlat::add_with_ids: added %ld / %ld vectors"
365  " (out of which %ld are duplicates)\n",
366  n_add, na, n_dup);
367  }
368  ntotal += n_add;
369 }
370 
372  idx_t n, const float *x, idx_t k,
373  const idx_t *assign,
374  const float *centroid_dis,
375  float *distances, idx_t *labels,
376  bool store_pairs,
377  const IVFSearchParameters *params) const
378 {
    // Search with a precomputed coarse assignment, then expand the results so
    // that ids recorded as duplicates in `instances` are reported next to the
    // entry they duplicate. store_pairs must be false.
379  FAISS_THROW_IF_NOT_MSG (
380  !store_pairs, "store_pairs not supported in IVFDedup");
381 
    // plain search first; it fills k results per query
382  IndexIVFFlat::search_preassigned (n, x, k, assign, centroid_dis,
383  distances, labels, false,
384  params);
385 
    // scratch buffers used to rebuild one query's result row
386  std::vector <idx_t> labels2 (k);
387  std::vector <float> dis2 (k);
388 
389  for (long i = 0; i < n; i++) {
    // result row of query i
390  idx_t *labels1 = labels + i * k;
391  float *dis1 = distances + i * k;
    // find the first result that has duplicates recorded in `instances`
392  long j = 0;
393  for (; j < k; j++) {
394  if (instances.find (labels1[j]) != instances.end ()) {
395  // a duplicate: special handling
396  break;
397  }
398  }
399  if (j < k) {
400  // there are duplicates, special handling
    // j0: first position that changes; rp: read position in the original
    // row; j: write position in the rebuilt row
401  long j0 = j;
402  long rp = j;
403  while (j < k) {
404  auto range = instances.equal_range (labels1[rp]);
405  float dis = dis1[rp];
    // copy the original result, then append its duplicates with the same
    // distance until the row holds k entries
406  labels2[j] = labels1[rp];
407  dis2[j] = dis;
408  j ++;
409  for (auto it = range.first; j < k && it != range.second; ++it) {
410  labels2[j] = it->second;
411  dis2[j] = dis;
412  j++;
413  }
414  rp++;
415  }
    // only positions from j0 onward were rebuilt; write them back
416  memcpy (labels1 + j0, labels2.data() + j0,
417  sizeof(labels1[0]) * (k - j0));
418  memcpy (dis1 + j0, dis2.data() + j0,
419  sizeof(dis2[0]) * (k - j0));
420  }
421  }
422 
423 }
424 
425 
427 {
    // Remove the ids selected by `sel`, keeping the duplicate bookkeeping in
    // `instances` consistent. Returns the number of inverted-list entries
    // removed, which is also subtracted from ntotal.
    //
    // replace: removed stored id -> surviving duplicate that takes its place
    // toadd:   (replacement, other surviving duplicate) pairs to re-insert
428  std::unordered_map<idx_t, idx_t> replace;
429  std::vector<std::pair<idx_t, idx_t> > toadd;
    // Pass 1: walk the (stored id, duplicate id) pairs
430  for (auto it = instances.begin(); it != instances.end(); ) {
431  if (sel.is_member(it->first)) {
432  // then we erase this entry
433  if (!sel.is_member(it->second)) {
434  // if the second is not erased
    // the first surviving duplicate seen becomes the replacement; later
    // surviving duplicates are re-attached to that replacement
435  if (replace.count(it->first) == 0) {
436  replace[it->first] = it->second;
437  } else { // remember we should add an element
438  std::pair<idx_t, idx_t> new_entry (
439  replace[it->first], it->second);
440  toadd.push_back(new_entry);
441  }
442  }
443  it = instances.erase(it);
444  } else {
    // stored id survives: drop the pair only if the duplicate is removed
445  if (sel.is_member(it->second)) {
446  it = instances.erase(it);
447  } else {
448  ++it;
449  }
450  }
451  }
452 
453  instances.insert (toadd.begin(), toadd.end());
454 
455  // mostly copied from IndexIVF.cpp
456 
457  FAISS_THROW_IF_NOT_MSG (!maintain_direct_map,
458  "direct map remove not implemented");
459 
    // number of entries to drop from the end of each list
460  std::vector<long> toremove(nlist);
461 
    // Pass 2: rewrite the inverted lists.
    // NOTE(review): `replace[...]` below is only reached when count() is
    // non-zero, so it should not insert, but operator[] is a non-const access
    // on a map shared across threads -- confirm this is acceptable.
462 #pragma omp parallel for
463  for (long i = 0; i < nlist; i++) {
    // l0: original size; l: current logical size; j: scan position
464  long l0 = invlists->list_size (i), l = l0, j = 0;
    // NOTE(review): `idsi` (indexed below) is not declared in the visible
    // text; its declaration appears to be missing from this listing.
466  while (j < l) {
467  if (sel.is_member (idsi[j])) {
468  if (replace.count(idsi[j]) == 0) {
    // no replacement: overwrite slot j with the last live entry and shrink;
    // j is not advanced, so the moved entry is examined next
469  l--;
470  invlists->update_entry (
471  i, j,
472  invlists->get_single_id (i, l),
473  InvertedLists::ScopedCodes (invlists, i, l).get());
474  } else {
    // a duplicate survives: keep the code, only swap in the replacement id
475  invlists->update_entry (
476  i, j,
477  replace[idsi[j]],
478  InvertedLists::ScopedCodes (invlists, i, j).get());
479  j++;
480  }
481  } else {
482  j++;
483  }
484  }
485  toremove[i] = l0 - l;
486  }
487  // this will not run well in parallel on ondisk because of possible shrinks
488  long nremove = 0;
489  for (long i = 0; i < nlist; i++) {
490  if (toremove[i] > 0) {
491  nremove += toremove[i];
492  invlists->resize(
493  i, invlists->list_size(i) - toremove[i]);
494  }
495  }
496  ntotal -= nremove;
497  return nremove;
498 }
499 
500 
502  idx_t ,
503  const float* ,
504  float ,
505  RangeSearchResult* ) const
506 {
    // Not supported by this class: all arguments are ignored and the call
    // always raises "not implemented".
507  FAISS_THROW_MSG ("not implemented");
508 }
509 
510 void IndexIVFFlatDedup::update_vectors (int , idx_t *, const float *)
511 {
512  FAISS_THROW_MSG ("not implemented");
513 }
514 
515 
517  long , long ,
518  float* ) const
519 {
    // Not supported by this class: all arguments are ignored and the call
    // always raises "not implemented".
520  FAISS_THROW_MSG ("not implemented");
521 }
522 
523 
524 
525 
526 } // namespace faiss
void train(idx_t n, const float *x) override
also dedups the training set
virtual void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const
Definition: IndexIVF.cpp:189
result structure for a single query
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils_simd.cpp:502
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
not implemented
void encode_vectors(idx_t n, const float *x, const idx_t *list_nos, uint8_t *codes) const override
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:98
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:35
virtual size_t list_size(size_t list_no) const =0
get the size of a list
void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
virtual idx_t get_single_id(size_t list_no, size_t offset) const
int d
vector dimension
Definition: Index.h:66
void add_with_ids(idx_t n, const float *x, const long *xids) override
implemented for all IndexIVF* classes
void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:408
virtual const uint8_t * get_single_code(size_t list_no, size_t offset) const
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const override
void add_with_ids(idx_t n, const float *x, const long *xids) override
implemented for all IndexIVF* classes
virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t *code)
add one entry to an inverted list
long idx_t
all indices are this type
Definition: Index.h:64
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:67
bool verbose
verbosity level
Definition: Index.h:68
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
QueryResult & new_result(idx_t qno)
begin a new result
virtual void update_vectors(int nv, idx_t *idx, const float *v)
void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
not implemented
void update_vectors(int nv, idx_t *idx, const float *v) override
not implemented
the entries in the buffers are split per query
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:74
InvertedLists * invlists
Access to the actual data.
Definition: IndexIVF.h:93
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:33
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:71
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
Definition: IndexIVF.h:102
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
size_t nlist
number of possible key values
Definition: IndexIVF.h:34
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:96
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:45
InvertedListScanner * get_InvertedListScanner(bool store_pairs) const override
get a scanner for this index (store_pairs means ignore labels)
virtual void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx)
same as add_with_ids, with precomputed coarse quantizer
std::unordered_multimap< idx_t, idx_t > instances
Definition: IndexIVFFlat.h:82