Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/MetaIndexes.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved
10 // -*- c++ -*-
11 
12 #include "MetaIndexes.h"
13 
14 #include <pthread.h>
15 
16 #include <cstdio>
17 
18 #include "FaissAssert.h"
19 #include "Heap.h"
20 #include "AuxIndexStructures.h"
21 
22 
23 namespace faiss {
24 
25 /*****************************************************
26  * IndexIDMap implementation
27  *******************************************************/
28 
29 IndexIDMap::IndexIDMap (Index *index):
30  index (index),
31  own_fields (false)
32 {
33  FAISS_THROW_IF_NOT_MSG (index->ntotal == 0, "index must be empty on input");
34  is_trained = index->is_trained;
35  metric_type = index->metric_type;
36  verbose = index->verbose;
37  d = index->d;
38 }
39 
40 void IndexIDMap::add (idx_t, const float *)
41 {
42  FAISS_THROW_MSG ("add does not make sense with IndexIDMap, "
43  "use add_with_ids");
44 }
45 
46 
47 void IndexIDMap::train (idx_t n, const float *x)
48 {
49  index->train (n, x);
50  is_trained = index->is_trained;
51 }
52 
54 {
55  index->reset ();
56  ntotal = 0;
57 }
58 
59 
60 void IndexIDMap::add_with_ids (idx_t n, const float * x, const long *xids)
61 {
62  index->add (n, x);
63  for (idx_t i = 0; i < n; i++)
64  id_map.push_back (xids[i]);
65  ntotal = index->ntotal;
66 }
67 
68 
69 void IndexIDMap::search (idx_t n, const float *x, idx_t k,
70  float *distances, idx_t *labels) const
71 {
72  index->search (n, x, k, distances, labels);
73  idx_t *li = labels;
74  for (idx_t i = 0; i < n * k; i++) {
75  li[i] = li[i] < 0 ? li[i] : id_map[li[i]];
76  }
77 }
78 
79 
80 void IndexIDMap::range_search (idx_t n, const float *x, float radius,
81  RangeSearchResult *result) const
82 {
83  index->range_search(n, x, radius, result);
84  for (idx_t i = 0; i < result->lims[result->nq]; i++) {
85  result->labels[i] = result->labels[i] < 0 ?
86  result->labels[i] : id_map[result->labels[i]];
87  }
88 }
89 
90 namespace {
91 
92 struct IDTranslatedSelector: IDSelector {
93  const std::vector <long> & id_map;
94  const IDSelector & sel;
95  IDTranslatedSelector (const std::vector <long> & id_map,
96  const IDSelector & sel):
97  id_map (id_map), sel (sel)
98  {}
99  bool is_member(idx_t id) const override {
100  return sel.is_member(id_map[id]);
101  }
102 };
103 
104 }
105 
107 {
108  // remove in sub-index first
109  IDTranslatedSelector sel2 (id_map, sel);
110  long nremove = index->remove_ids (sel2);
111 
112  long j = 0;
113  for (idx_t i = 0; i < ntotal; i++) {
114  if (sel.is_member (id_map[i])) {
115  // remove
116  } else {
117  id_map[j] = id_map[i];
118  j++;
119  }
120  }
121  FAISS_ASSERT (j == index->ntotal);
122  ntotal = j;
123  id_map.resize(ntotal);
124  return nremove;
125 }
126 
127 
128 
129 
130 IndexIDMap::~IndexIDMap ()
131 {
132  if (own_fields) delete index;
133 }
134 
135 /*****************************************************
136  * IndexIDMap2 implementation
137  *******************************************************/
138 
139 IndexIDMap2::IndexIDMap2 (Index *index): IndexIDMap (index)
140 {}
141 
142 void IndexIDMap2::add_with_ids(idx_t n, const float* x, const long* xids)
143 {
144  size_t prev_ntotal = ntotal;
145  IndexIDMap::add_with_ids (n, x, xids);
146  for (size_t i = prev_ntotal; i < ntotal; i++) {
147  rev_map [id_map [i]] = i;
148  }
149 }
150 
152 {
153  rev_map.clear ();
154  for (size_t i = 0; i < ntotal; i++) {
155  rev_map [id_map [i]] = i;
156  }
157 }
158 
159 
161 {
162  // This is quite inefficient
163  long nremove = IndexIDMap::remove_ids (sel);
165  return nremove;
166 }
167 
168 void IndexIDMap2::reconstruct (idx_t key, float * recons) const
169 {
170  try {
171  index->reconstruct (rev_map.at (key), recons);
172  } catch (const std::out_of_range& e) {
173  FAISS_THROW_FMT ("key %ld not found", key);
174  }
175 }
176 
177 
178 
179 /*****************************************************
180  * IndexShards implementation
181  *******************************************************/
182 
183 // subroutines
184 namespace {
185 
186 
187 typedef Index::idx_t idx_t;
188 
189 
/// Pairs a job object with a pthread that runs it.
/// start() launches a thread that calls job.run(); wait() joins it.
/// The return codes of pthread_create / pthread_join are not checked.
template<class Job>
struct Thread {
    Job job;
    pthread_t thread;

    Thread () {}

    explicit Thread (const Job & job): job(job) {}

    /// thread entry point: `arg` is the Thread object passed by start()
    static void * run (void *arg) {
        Thread *self = static_cast<Thread*> (arg);
        self->job.run();
        return nullptr;
    }

    void start () {
        pthread_create (&thread, nullptr, run, this);
    }

    void wait () {
        pthread_join (thread, nullptr);
    }

};
213 
214 
215 /// callback + thread management to train 1 shard
216 struct TrainJob {
217  IndexShards *index; // the relevant index
218  int no; // shard number
219  idx_t n; // train points
220  const float *x;
221 
222  void run ()
223  {
224  if (index->verbose)
225  printf ("begin train shard %d on %ld points\n", no, n);
226  index->shard_indexes [no]->train(n, x);
227  if (index->verbose)
228  printf ("end train shard %d\n", no);
229  }
230 
231 };
232 
233 struct AddJob {
234  IndexShards *index; // the relevant index
235  int no; // shard number
236 
237  idx_t n;
238  const float *x;
239  const idx_t *ids;
240 
241  void run ()
242  {
243  if (index->verbose)
244  printf ("begin add shard %d on %ld points\n", no, n);
245  if (ids)
246  index->shard_indexes[no]->add_with_ids (n, x, ids);
247  else
248  index->shard_indexes[no]->add (n, x);
249  if (index->verbose)
250  printf ("end add shard %d on %ld points\n", no, n);
251  }
252 };
253 
254 
255 
256 /// callback + thread management to query in 1 shard
257 struct QueryJob {
258  const IndexShards *index; // the relevant index
259  int no; // shard number
260 
261  // query params
262  idx_t n;
263  const float *x;
264  idx_t k;
265  float *distances;
266  idx_t *labels;
267 
268 
269  void run ()
270  {
271  if (index->verbose)
272  printf ("begin query shard %d on %ld points\n", no, n);
273  index->shard_indexes [no]->search (n, x, k,
274  distances, labels);
275  if (index->verbose)
276  printf ("end query shard %d\n", no);
277  }
278 
279 
280 };
281 
282 
283 
284 
285 // add translation to all valid labels
286 void translate_labels (long n, idx_t *labels, long translation)
287 {
288  if (translation == 0) return;
289  for (long i = 0; i < n; i++) {
290  if(labels[i] < 0) return;
291  labels[i] += translation;
292  }
293 }
294 
295 
296 /** merge result tables from several shards.
297  * @param all_distances size nshard * n * k
298  * @param all_labels idem
299  * @param translartions label translations to apply, size nshard
300  */
301 
302 template <class C>
303 void merge_tables (long n, long k, long nshard,
304  float *distances, idx_t *labels,
305  const float *all_distances,
306  idx_t *all_labels,
307  const long *translations)
308 {
309  if(k == 0) {
310  return;
311  }
312 
313  long stride = n * k;
314 #pragma omp parallel
315  {
316  std::vector<int> buf (2 * nshard);
317  int * pointer = buf.data();
318  int * shard_ids = pointer + nshard;
319  std::vector<float> buf2 (nshard);
320  float * heap_vals = buf2.data();
321 #pragma omp for
322  for (long i = 0; i < n; i++) {
323  // the heap maps values to the shard where they are
324  // produced.
325  const float *D_in = all_distances + i * k;
326  const idx_t *I_in = all_labels + i * k;
327  int heap_size = 0;
328 
329  for (long s = 0; s < nshard; s++) {
330  pointer[s] = 0;
331  if (I_in[stride * s] >= 0)
332  heap_push<C> (++heap_size, heap_vals, shard_ids,
333  D_in[stride * s], s);
334  }
335 
336  float *D = distances + i * k;
337  idx_t *I = labels + i * k;
338 
339  for (int j = 0; j < k; j++) {
340  if (heap_size == 0) {
341  I[j] = -1;
342  D[j] = C::neutral();
343  } else {
344  // pop best element
345  int s = shard_ids[0];
346  int & p = pointer[s];
347  D[j] = heap_vals[0];
348  I[j] = I_in[stride * s + p] + translations[s];
349 
350  heap_pop<C> (heap_size--, heap_vals, shard_ids);
351  p++;
352  if (p < k && I_in[stride * s + p] >= 0)
353  heap_push<C> (++heap_size, heap_vals, shard_ids,
354  D_in[stride * s + p], s);
355  }
356  }
357  }
358  }
359 }
360 
361 
362 };
363 
364 
365 
366 
367 IndexShards::IndexShards (idx_t d, bool threaded, bool successive_ids):
368  Index (d), own_fields (false),
369  threaded (threaded), successive_ids (successive_ids)
370 {
371 
372 }
373 
374 void IndexShards::add_shard (Index *idx)
375 {
376  shard_indexes.push_back (idx);
377  sync_with_shard_indexes ();
378 }
379 
380 void IndexShards::sync_with_shard_indexes ()
381 {
382  if (shard_indexes.empty()) return;
383  Index * index0 = shard_indexes[0];
384  d = index0->d;
385  metric_type = index0->metric_type;
386  is_trained = index0->is_trained;
387  ntotal = index0->ntotal;
388  for (int i = 1; i < shard_indexes.size(); i++) {
389  Index * index = shard_indexes[i];
390  FAISS_THROW_IF_NOT (metric_type == index->metric_type);
391  FAISS_THROW_IF_NOT (d == index->d);
392  ntotal += index->ntotal;
393  }
394 }
395 
396 
397 void IndexShards::train (idx_t n, const float *x)
398 {
399 
400  // pre-alloc because we don't want reallocs
401  std::vector<Thread<TrainJob > > tss (shard_indexes.size());
402  int nt = 0;
403  for (int i = 0; i < shard_indexes.size(); i++) {
404  if(!shard_indexes[i]->is_trained) {
405  TrainJob ts = {this, i, n, x};
406  if (threaded) {
407  tss[nt] = Thread<TrainJob> (ts);
408  tss[nt++].start();
409  } else {
410  ts.run();
411  }
412  }
413  }
414  for (int i = 0; i < nt; i++) {
415  tss[i].wait();
416  }
417  sync_with_shard_indexes ();
418 }
419 
420 void IndexShards::add (idx_t n, const float *x)
421 {
422  add_with_ids (n, x, nullptr);
423 }
424 
425  /**
426  * Cases (successive_ids, xids):
427  * - true, non-NULL ERROR: it makes no sense to pass in ids and
428  * request them to be shifted
429  * - true, NULL OK, but should be called only once (calls add()
430  * on sub-indexes).
431  * - false, non-NULL OK: will call add_with_ids with passed in xids
432  * distributed evenly over shards
433  * - false, NULL OK: will call add_with_ids on each sub-index,
434  * starting at ntotal
435  */
436 
437 void IndexShards::add_with_ids (idx_t n, const float * x, const long *xids)
438 {
439 
440  FAISS_THROW_IF_NOT_MSG(!(successive_ids && xids),
441  "It makes no sense to pass in ids and "
442  "request them to be shifted");
443 
444  if (successive_ids) {
445  FAISS_THROW_IF_NOT_MSG(!xids,
446  "It makes no sense to pass in ids and "
447  "request them to be shifted");
448  FAISS_THROW_IF_NOT_MSG(ntotal == 0,
449  "when adding to IndexShards with sucessive_ids, "
450  "only add() in a single pass is supported");
451  }
452 
453  long nshard = shard_indexes.size();
454  const long *ids = xids;
455  ScopeDeleter<long> del;
456  if (!ids && !successive_ids) {
457  long *aids = new long[n];
458  for (long i = 0; i < n; i++)
459  aids[i] = ntotal + i;
460  ids = aids;
461  del.set (ids);
462  }
463 
464  std::vector<Thread<AddJob > > asa (shard_indexes.size());
465  int nt = 0;
466  for (int i = 0; i < nshard; i++) {
467  long i0 = i * n / nshard;
468  long i1 = (i + 1) * n / nshard;
469 
470  AddJob as = {this, i,
471  i1 - i0, x + i0 * d,
472  ids ? ids + i0 : nullptr};
473  if (threaded) {
474  asa[nt] = Thread<AddJob>(as);
475  asa[nt++].start();
476  } else {
477  as.run();
478  }
479  }
480  for (int i = 0; i < nt; i++) {
481  asa[i].wait();
482  }
483  ntotal += n;
484 }
485 
486 
487 
488 
489 
491 {
492  for (int i = 0; i < shard_indexes.size(); i++) {
493  shard_indexes[i]->reset ();
494  }
495  sync_with_shard_indexes ();
496 }
497 
499  idx_t n, const float *x, idx_t k,
500  float *distances, idx_t *labels) const
501 {
502  long nshard = shard_indexes.size();
503  float *all_distances = new float [nshard * k * n];
504  idx_t *all_labels = new idx_t [nshard * k * n];
505  ScopeDeleter<float> del (all_distances);
506  ScopeDeleter<idx_t> del2 (all_labels);
507 
508 #if 1
509 
510  // pre-alloc because we don't want reallocs
511  std::vector<Thread<QueryJob> > qss (nshard);
512  for (int i = 0; i < nshard; i++) {
513  QueryJob qs = {
514  this, i, n, x, k,
515  all_distances + i * k * n,
516  all_labels + i * k * n
517  };
518  if (threaded) {
519  qss[i] = Thread<QueryJob> (qs);
520  qss[i].start();
521  } else {
522  qs.run();
523  }
524  }
525 
526  if (threaded) {
527  for (int i = 0; i < qss.size(); i++) {
528  qss[i].wait();
529  }
530  }
531 #else
532 
533  // pre-alloc because we don't want reallocs
534  std::vector<QueryJob> qss (nshard);
535  for (int i = 0; i < nshard; i++) {
536  QueryJob qs = {
537  this, i, n, x, k,
538  all_distances + i * k * n,
539  all_labels + i * k * n
540  };
541  if (threaded) {
542  qss[i] = qs;
543  } else {
544  qs.run();
545  }
546  }
547 
548  if (threaded) {
549 #pragma omp parallel for
550  for (int i = 0; i < qss.size(); i++) {
551  qss[i].run();
552  }
553  }
554 
555 #endif
556  std::vector<long> translations (nshard, 0);
557  if (successive_ids) {
558  translations[0] = 0;
559  for (int s = 0; s + 1 < nshard; s++)
560  translations [s + 1] = translations [s] +
561  shard_indexes [s]->ntotal;
562  }
563 
564  if (metric_type == METRIC_L2) {
565  merge_tables< CMin<float, int> > (
566  n, k, nshard, distances, labels,
567  all_distances, all_labels, translations.data ());
568  } else {
569  merge_tables< CMax<float, int> > (
570  n, k, nshard, distances, labels,
571  all_distances, all_labels, translations.data ());
572  }
573 
574 }
575 
576 
577 
578 IndexShards::~IndexShards ()
579 {
580  if (own_fields) {
581  for (int s = 0; s < shard_indexes.size(); s++)
582  delete shard_indexes [s];
583  }
584 }
585 
586 
587 /*****************************************************
588  * IndexSplitVectors implementation
589  *******************************************************/
590 
591 
592 IndexSplitVectors::IndexSplitVectors (idx_t d, bool threaded):
593  Index (d), own_fields (false),
594  threaded (threaded), sum_d (0)
595 {
596 
597 }
598 
599 void IndexSplitVectors::add_sub_index (Index *index)
600 {
601  sub_indexes.push_back (index);
602  sync_with_sub_indexes ();
603 }
604 
605 void IndexSplitVectors::sync_with_sub_indexes ()
606 {
607  if (sub_indexes.empty()) return;
608  Index * index0 = sub_indexes[0];
609  sum_d = index0->d;
610  metric_type = index0->metric_type;
611  is_trained = index0->is_trained;
612  ntotal = index0->ntotal;
613  for (int i = 1; i < sub_indexes.size(); i++) {
614  Index * index = sub_indexes[i];
615  FAISS_THROW_IF_NOT (metric_type == index->metric_type);
616  FAISS_THROW_IF_NOT (ntotal == index->ntotal);
617  sum_d += index->d;
618  }
619 
620 }
621 
622 void IndexSplitVectors::add(idx_t /*n*/, const float* /*x*/) {
623  FAISS_THROW_MSG("not implemented");
624 }
625 
626 namespace {
627 
628 /// callback + thread management to query in 1 shard
629 struct SplitQueryJob {
630  const IndexSplitVectors *index; // the relevant index
631  int no; // shard number
632 
633  // query params
634  idx_t n;
635  const float *x;
636  idx_t k;
637  float *distances;
638  idx_t *labels;
639 
640 
641  void run ()
642  {
643  if (index->verbose)
644  printf ("begin query shard %d on %ld points\n", no, n);
645  const Index * sub_index = index->sub_indexes[no];
646  long sub_d = sub_index->d, d = index->d;
647  idx_t ofs = 0;
648  for (int i = 0; i < no; i++) ofs += index->sub_indexes[i]->d;
649  float *sub_x = new float [sub_d * n];
650  ScopeDeleter<float> del (sub_x);
651  for (idx_t i = 0; i < n; i++)
652  memcpy (sub_x + i * sub_d, x + ofs + i * d, sub_d * sizeof (sub_x));
653  sub_index->search (n, sub_x, k, distances, labels);
654  if (index->verbose)
655  printf ("end query shard %d\n", no);
656  }
657 
658 };
659 
660 
661 
662 }
663 
664 
665 
667  idx_t n, const float *x, idx_t k,
668  float *distances, idx_t *labels) const
669 {
670  FAISS_THROW_IF_NOT_MSG (k == 1,
671  "search implemented only for k=1");
672  FAISS_THROW_IF_NOT_MSG (sum_d == d,
673  "not enough indexes compared to # dimensions");
674 
675  long nshard = sub_indexes.size();
676  float *all_distances = new float [nshard * k * n];
677  idx_t *all_labels = new idx_t [nshard * k * n];
678  ScopeDeleter<float> del (all_distances);
679  ScopeDeleter<idx_t> del2 (all_labels);
680 
681  // pre-alloc because we don't want reallocs
682  std::vector<Thread<SplitQueryJob> > qss (nshard);
683  for (int i = 0; i < nshard; i++) {
684  SplitQueryJob qs = {
685  this, i, n, x, k,
686  i == 0 ? distances : all_distances + i * k * n,
687  i == 0 ? labels : all_labels + i * k * n
688  };
689  if (threaded) {
690  qss[i] = Thread<SplitQueryJob> (qs);
691  qss[i].start();
692  } else {
693  qs.run();
694  }
695  }
696 
697  if (threaded) {
698  for (int i = 0; i < qss.size(); i++) {
699  qss[i].wait();
700  }
701  }
702 
703  long factor = 1;
704  for (int i = 0; i < nshard; i++) {
705  if (i > 0) { // results of 0 are already in the table
706  const float *distances_i = all_distances + i * k * n;
707  const idx_t *labels_i = all_labels + i * k * n;
708  for (long j = 0; j < n; j++) {
709  if (labels[j] >= 0 && labels_i[j] >= 0) {
710  labels[j] += labels_i[j] * factor;
711  distances[j] += distances_i[j];
712  } else {
713  labels[j] = -1;
714  distances[j] = 0.0 / 0.0;
715  }
716  }
717  }
718  factor *= sub_indexes[i]->ntotal;
719  }
720 
721 }
722 
723 void IndexSplitVectors::train(idx_t /*n*/, const float* /*x*/) {
724  FAISS_THROW_MSG("not implemented");
725 }
726 
728 {
729  FAISS_THROW_MSG ("not implemented");
730 }
731 
732 
733 IndexSplitVectors::~IndexSplitVectors ()
734 {
735  if (own_fields) {
736  for (int s = 0; s < sub_indexes.size(); s++)
737  delete sub_indexes [s];
738  }
739 }
740 
741 
742 
743 
744 
745 
746 }; // namespace faiss
void train(idx_t n, const float *x) override
size_t nq
nb of queries
IndexShards(idx_t d, bool threaded=false, bool successive_ids=true)
virtual void reset()=0
removes all elements from the database.
virtual void train(idx_t, const float *)
Definition: Index.h:89
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
Definition: MetaIndexes.cpp:69
void add_with_ids(idx_t n, const float *x, const long *xids) override
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
void add_with_ids(idx_t n, const float *x, const long *xids) override
void reset() override
removes all elements from the database.
void add(idx_t n, const float *x) override
this will fail. Use add_with_ids
Definition: MetaIndexes.cpp:40
int d
vector dimension
Definition: Index.h:64
std::vector< long > id_map
! whether pointers are deleted in destructor
Definition: MetaIndexes.h:29
void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
Definition: MetaIndexes.cpp:80
virtual void add(idx_t n, const float *x)=0
void train(idx_t n, const float *x) override
void add(idx_t n, const float *x) override
supported only for sub-indices that implement add_with_ids
long idx_t
all indices are this type
Definition: Index.h:62
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:65
void construct_rev_map()
make the rev_map from scratch
bool verbose
verbosity level
Definition: Index.h:66
void add(idx_t n, const float *x) override
virtual long remove_ids(const IDSelector &sel)
Definition: Index.cpp:37
long remove_ids(const IDSelector &sel) override
remove ids adapted to IndexFlat
bool threaded
should the sub-indexes be deleted along with this?
Definition: MetaIndexes.h:90
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
void reset() override
removes all elements from the database.
void reset() override
removes all elements from the database.
Definition: MetaIndexes.cpp:53
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
long remove_ids(const IDSelector &sel) override
remove ids adapted to IndexFlat
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:72
void reconstruct(idx_t key, float *recons) const override
void train(idx_t n, const float *x) override
Definition: MetaIndexes.cpp:47
size_t * lims
size (nq + 1)
virtual void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const
Definition: Index.cpp:17
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:69
virtual void reconstruct(idx_t key, float *recons) const
Definition: Index.cpp:43
void add_with_ids(idx_t n, const float *x, const long *xids) override
Definition: MetaIndexes.cpp:60
idx_t * labels
result for query i is labels[lims[i]:lims[i+1]]
IndexSplitVectors(idx_t d, bool threaded=false)
sum of dimensions seen so far
bool own_fields
! the sub-index
Definition: MetaIndexes.h:28