Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/hoss/faiss/IndexIVFFlat.cpp
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 // -*- c++ -*-
9 
10 #include "IndexIVFFlat.h"
11 
12 #include <cstdio>
13 
14 #include "utils.h"
15 
16 #include "FaissAssert.h"
17 #include "IndexFlat.h"
18 #include "AuxIndexStructures.h"
19 
20 
21 namespace faiss {
22 
23 
24 /*****************************************
25  * IndexIVFFlat implementation
26  ******************************************/
27 
28 IndexIVFFlat::IndexIVFFlat (Index * quantizer,
29  size_t d, size_t nlist, MetricType metric):
30  IndexIVF (quantizer, d, nlist, sizeof(float) * d, metric)
31 {
32  code_size = sizeof(float) * d;
33 }
34 
35 
36 void IndexIVFFlat::add_with_ids (idx_t n, const float * x, const long *xids)
37 {
38  add_core (n, x, xids, nullptr);
39 }
40 
41 void IndexIVFFlat::add_core (idx_t n, const float * x, const long *xids,
42  const long *precomputed_idx)
43 
44 {
45  FAISS_THROW_IF_NOT (is_trained);
46  assert (invlists);
47  FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids),
48  "cannot have direct map and add with ids");
49  const long * idx;
51 
52  if (precomputed_idx) {
53  idx = precomputed_idx;
54  } else {
55  long * idx0 = new long [n];
56  del.set (idx0);
57  quantizer->assign (n, x, idx0);
58  idx = idx0;
59  }
60  long n_add = 0;
61  for (size_t i = 0; i < n; i++) {
62  long id = xids ? xids[i] : ntotal + i;
63  long list_no = idx [i];
64 
65  if (list_no < 0)
66  continue;
67  const float *xi = x + i * d;
68  size_t offset = invlists->add_entry (
69  list_no, id, (const uint8_t*) xi);
70 
72  direct_map.push_back (list_no << 32 | offset);
73  n_add++;
74  }
75  if (verbose) {
76  printf("IndexIVFFlat::add_core: added %ld / %ld vectors\n",
77  n_add, n);
78  }
79  ntotal += n;
80 }
81 
82 void IndexIVFFlat::encode_vectors(idx_t n, const float* x,
83  const idx_t * /* list_nos */,
84  uint8_t * codes) const
85 {
86  memcpy (codes, x, code_size * n);
87 }
88 
89 
90 
91 namespace {
92 
93 
94 template<MetricType metric, bool store_pairs, class C>
95 struct IVFFlatScanner: InvertedListScanner {
96 
97  size_t d;
98  IVFFlatScanner(size_t d): d(d) {}
99 
100  const float *xi;
101  void set_query (const float *query) override {
102  this->xi = query;
103  }
104 
105  idx_t list_no;
106  void set_list (idx_t list_no, float /* coarse_dis */) override {
107  this->list_no = list_no;
108  }
109 
110  float distance_to_code (const uint8_t *code) const override {
111  const float *yj = (float*)code;
112  float dis = metric == METRIC_INNER_PRODUCT ?
113  fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
114  return dis;
115  }
116 
117  size_t scan_codes (size_t list_size,
118  const uint8_t *codes,
119  const idx_t *ids,
120  float *simi, idx_t *idxi,
121  size_t k) const override
122  {
123  const float *list_vecs = (const float*)codes;
124  size_t nup = 0;
125  for (size_t j = 0; j < list_size; j++) {
126  const float * yj = list_vecs + d * j;
127  float dis = metric == METRIC_INNER_PRODUCT ?
128  fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
129  if (C::cmp (simi[0], dis)) {
130  heap_pop<C> (k, simi, idxi);
131  long id = store_pairs ? (list_no << 32 | j) : ids[j];
132  heap_push<C> (k, simi, idxi, dis, id);
133  nup++;
134  }
135  }
136  return nup;
137  }
138 
139  void scan_codes_range (size_t list_size,
140  const uint8_t *codes,
141  const idx_t *ids,
142  float radius,
143  RangeQueryResult & res) const override
144  {
145  const float *list_vecs = (const float*)codes;
146  for (size_t j = 0; j < list_size; j++) {
147  const float * yj = list_vecs + d * j;
148  float dis = metric == METRIC_INNER_PRODUCT ?
149  fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
150  if (C::cmp (radius, dis)) {
151  long id = store_pairs ? (list_no << 32 | j) : ids[j];
152  res.add (dis, id);
153  }
154  }
155  }
156 
157 
158 };
159 
160 
161 } // anonymous namespace
162 
163 
164 
165 InvertedListScanner* IndexIVFFlat::get_InvertedListScanner
166  (bool store_pairs) const
167 {
168  if (metric_type == METRIC_INNER_PRODUCT) {
169  if (store_pairs) {
170  return new IVFFlatScanner<
171  METRIC_INNER_PRODUCT, true, CMin<float, long> > (d);
172  } else {
173  return new IVFFlatScanner<
174  METRIC_INNER_PRODUCT, false, CMin<float, long> >(d);
175  }
176  } else if (metric_type == METRIC_L2) {
177  if (store_pairs) {
178  return new IVFFlatScanner<
179  METRIC_L2, true, CMax<float, long> > (d);
180  } else {
181  return new IVFFlatScanner<
182  METRIC_L2, false, CMax<float, long> >(d);
183  }
184  }
185  return nullptr;
186 }
187 
188 
189 
190 void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
191 {
192 
193  FAISS_THROW_IF_NOT (maintain_direct_map);
194  FAISS_THROW_IF_NOT (is_trained);
195  std::vector<idx_t> assign (n);
196  quantizer->assign (n, x, assign.data());
197 
198  for (size_t i = 0; i < n; i++) {
199  idx_t id = new_ids[i];
200  FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal,
201  "id to update out of range");
202  { // remove old one
203  long dm = direct_map[id];
204  long ofs = dm & 0xffffffff;
205  long il = dm >> 32;
206  size_t l = invlists->list_size (il);
207  if (ofs != l - 1) { // move l - 1 to ofs
208  long id2 = invlists->get_single_id (il, l - 1);
209  direct_map[id2] = (il << 32) | ofs;
210  invlists->update_entry (il, ofs, id2,
211  invlists->get_single_code (il, l - 1));
212  }
213  invlists->resize (il, l - 1);
214  }
215  { // insert new one
216  long il = assign[i];
217  size_t l = invlists->list_size (il);
218  long dm = (il << 32) | l;
219  direct_map[id] = dm;
220  invlists->add_entry (il, id, (const uint8_t*)(x + i * d));
221  }
222  }
223 
224 }
225 
226 void IndexIVFFlat::reconstruct_from_offset (long list_no, long offset,
227  float* recons) const
228 {
229  memcpy (recons, invlists->get_single_code (list_no, offset), code_size);
230 }
231 
232 /*****************************************
233  * IndexIVFFlatDedup implementation
234  ******************************************/
235 
236 IndexIVFFlatDedup::IndexIVFFlatDedup (
237  Index * quantizer, size_t d, size_t nlist_,
238  MetricType metric_type):
239  IndexIVFFlat (quantizer, d, nlist_, metric_type)
240 {}
241 
242 
243 void IndexIVFFlatDedup::train(idx_t n, const float* x)
244 {
245  std::unordered_map<uint64_t, idx_t> map;
246  float * x2 = new float [n * d];
247  ScopeDeleter<float> del (x2);
248 
249  long n2 = 0;
250  for (long i = 0; i < n; i++) {
251  uint64_t hash = hash_bytes((uint8_t *)(x + i * d), code_size);
252  if (map.count(hash) &&
253  !memcmp (x2 + map[hash] * d, x + i * d, code_size)) {
254  // is duplicate, skip
255  } else {
256  map [hash] = n2;
257  memcpy (x2 + n2 * d, x + i * d, code_size);
258  n2 ++;
259  }
260  }
261  if (verbose) {
262  printf ("IndexIVFFlatDedup::train: train on %ld points after dedup "
263  "(was %ld points)\n", n2, n);
264  }
265  IndexIVFFlat::train (n2, x2);
266 }
267 
268 
269 
271  idx_t na, const float* x, const long* xids)
272 {
273 
274  FAISS_THROW_IF_NOT (is_trained);
275  assert (invlists);
276  FAISS_THROW_IF_NOT_MSG (
278  "IVFFlatDedup not implemented with direct_map");
279  long * idx = new long [na];
280  ScopeDeleter<long> del (idx);
281  quantizer->assign (na, x, idx);
282 
283  long n_add = 0, n_dup = 0;
284  // TODO make a omp loop with this
285  for (size_t i = 0; i < na; i++) {
286  idx_t id = xids ? xids[i] : ntotal + i;
287  long list_no = idx [i];
288 
289  if (list_no < 0) {
290  continue;
291  }
292  const float *xi = x + i * d;
293 
294  // search if there is already an entry with that id
295  InvertedLists::ScopedCodes codes (invlists, list_no);
296 
297  long n = invlists->list_size (list_no);
298  long offset = -1;
299  for (long o = 0; o < n; o++) {
300  if (!memcmp (codes.get() + o * code_size,
301  xi, code_size)) {
302  offset = o;
303  break;
304  }
305  }
306 
307  if (offset == -1) { // not found
308  invlists->add_entry (list_no, id, (const uint8_t*) xi);
309  } else {
310  // mark equivalence
311  idx_t id2 = invlists->get_single_id (list_no, offset);
312  std::pair<idx_t, idx_t> pair (id2, id);
313  instances.insert (pair);
314  n_dup ++;
315  }
316  n_add++;
317  }
318  if (verbose) {
319  printf("IndexIVFFlat::add_with_ids: added %ld / %ld vectors"
320  " (out of which %ld are duplicates)\n",
321  n_add, na, n_dup);
322  }
323  ntotal += n_add;
324 }
325 
327  idx_t n, const float *x, idx_t k,
328  const idx_t *assign,
329  const float *centroid_dis,
330  float *distances, idx_t *labels,
331  bool store_pairs,
332  const IVFSearchParameters *params) const
333 {
334  FAISS_THROW_IF_NOT_MSG (
335  !store_pairs, "store_pairs not supported in IVFDedup");
336 
337  IndexIVFFlat::search_preassigned (n, x, k, assign, centroid_dis,
338  distances, labels, false,
339  params);
340 
341  std::vector <idx_t> labels2 (k);
342  std::vector <float> dis2 (k);
343 
344  for (long i = 0; i < n; i++) {
345  idx_t *labels1 = labels + i * k;
346  float *dis1 = distances + i * k;
347  long j = 0;
348  for (; j < k; j++) {
349  if (instances.find (labels1[j]) != instances.end ()) {
350  // a duplicate: special handling
351  break;
352  }
353  }
354  if (j < k) {
355  // there are duplicates, special handling
356  long j0 = j;
357  long rp = j;
358  while (j < k) {
359  auto range = instances.equal_range (labels1[rp]);
360  float dis = dis1[rp];
361  labels2[j] = labels1[rp];
362  dis2[j] = dis;
363  j ++;
364  for (auto it = range.first; j < k && it != range.second; ++it) {
365  labels2[j] = it->second;
366  dis2[j] = dis;
367  j++;
368  }
369  rp++;
370  }
371  memcpy (labels1 + j0, labels2.data() + j0,
372  sizeof(labels1[0]) * (k - j0));
373  memcpy (dis1 + j0, dis2.data() + j0,
374  sizeof(dis2[0]) * (k - j0));
375  }
376  }
377 
378 }
379 
380 
382 {
383  std::unordered_map<idx_t, idx_t> replace;
384  std::vector<std::pair<idx_t, idx_t> > toadd;
385  for (auto it = instances.begin(); it != instances.end(); ) {
386  if (sel.is_member(it->first)) {
387  // then we erase this entry
388  if (!sel.is_member(it->second)) {
389  // if the second is not erased
390  if (replace.count(it->first) == 0) {
391  replace[it->first] = it->second;
392  } else { // remember we should add an element
393  std::pair<idx_t, idx_t> new_entry (
394  replace[it->first], it->second);
395  toadd.push_back(new_entry);
396  }
397  }
398  it = instances.erase(it);
399  } else {
400  if (sel.is_member(it->second)) {
401  it = instances.erase(it);
402  } else {
403  ++it;
404  }
405  }
406  }
407 
408  instances.insert (toadd.begin(), toadd.end());
409 
410  // mostly copied from IndexIVF.cpp
411 
412  FAISS_THROW_IF_NOT_MSG (!maintain_direct_map,
413  "direct map remove not implemented");
414 
415  std::vector<long> toremove(nlist);
416 
417 #pragma omp parallel for
418  for (long i = 0; i < nlist; i++) {
419  long l0 = invlists->list_size (i), l = l0, j = 0;
421  while (j < l) {
422  if (sel.is_member (idsi[j])) {
423  if (replace.count(idsi[j]) == 0) {
424  l--;
425  invlists->update_entry (
426  i, j,
427  invlists->get_single_id (i, l),
428  InvertedLists::ScopedCodes (invlists, i, l).get());
429  } else {
430  invlists->update_entry (
431  i, j,
432  replace[idsi[j]],
433  InvertedLists::ScopedCodes (invlists, i, j).get());
434  j++;
435  }
436  } else {
437  j++;
438  }
439  }
440  toremove[i] = l0 - l;
441  }
442  // this will not run well in parallel on ondisk because of possible shrinks
443  long nremove = 0;
444  for (long i = 0; i < nlist; i++) {
445  if (toremove[i] > 0) {
446  nremove += toremove[i];
447  invlists->resize(
448  i, invlists->list_size(i) - toremove[i]);
449  }
450  }
451  ntotal -= nremove;
452  return nremove;
453 }
454 
455 
457  idx_t ,
458  const float* ,
459  float ,
460  RangeSearchResult* ) const
461 {
462  FAISS_THROW_MSG ("not implemented");
463 }
464 
465 void IndexIVFFlatDedup::update_vectors (int , idx_t *, const float *)
466 {
467  FAISS_THROW_MSG ("not implemented");
468 }
469 
470 
472  long , long ,
473  float* ) const
474 {
475  FAISS_THROW_MSG ("not implemented");
476 }
477 
478 
479 
480 
481 } // namespace faiss
void train(idx_t n, const float *x) override
also dedups the training set
virtual void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const
Definition: IndexIVF.cpp:250
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils_simd.cpp:501
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
not implemented
void encode_vectors(idx_t n, const float *x, const idx_t *list_nos, uint8_t *codes) const override
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:34
virtual size_t list_size(size_t list_no) const =0
get the size of a list
virtual idx_t get_single_id(size_t list_no, size_t offset) const
int d
vector dimension
Definition: Index.h:66
long idx_t
all indices are this type
Definition: Index.h:62
void add_with_ids(idx_t n, const float *x, const long *xids) override
implemented for all IndexIVF* classes
void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:688
virtual const uint8_t * get_single_code(size_t list_no, size_t offset) const
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const override
void add_with_ids(idx_t n, const float *x, const long *xids) override
implemented for all IndexIVF* classes
uint64_t hash_bytes(const uint8_t *bytes, long n)
Definition: utils.cpp:1584
virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t *code)
add one entry to an inverted list
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:67
bool verbose
verbosity level
Definition: Index.h:68
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
virtual void update_vectors(int nv, idx_t *idx, const float *v)
void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
not implemented
void update_vectors(int nv, idx_t *idx, const float *v) override
not implemented
InvertedLists * invlists
Acess to the actual data.
Definition: IndexIVF.h:92
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:32
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:71
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
Definition: IndexIVF.h:109
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
size_t nlist
number of possible key values
Definition: IndexIVF.h:33
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:95
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:44
InvertedListScanner * get_InvertedListScanner(bool store_pairs) const override
get a scanner for this index (store_pairs means ignore labels)
virtual void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx)
same as add_with_ids, with precomputed coarse quantizer
std::unordered_multimap< idx_t, idx_t > instances
Definition: IndexIVFFlat.h:67