Faiss
/tmp/faiss/MetaIndexes.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // -*- c++ -*-
10 
11 #include "MetaIndexes.h"
12 
13 #include <pthread.h>
14 
15 #include <cstdio>
16 
17 #include "FaissAssert.h"
18 #include "Heap.h"
19 #include "AuxIndexStructures.h"
20 
21 
22 namespace faiss {
23 
24 /*****************************************************
25  * IndexIDMap implementation
26  *******************************************************/
27 
28 IndexIDMap::IndexIDMap (Index *index):
29  index (index),
30  own_fields (false)
31 {
32  FAISS_THROW_IF_NOT_MSG (index->ntotal == 0, "index must be empty on input");
33  is_trained = index->is_trained;
34  metric_type = index->metric_type;
35  verbose = index->verbose;
36  d = index->d;
37 }
38 
39 void IndexIDMap::add (idx_t, const float *)
40 {
41  FAISS_THROW_MSG ("add does not make sense with IndexIDMap, "
42  "use add_with_ids");
43 }
44 
45 
46 void IndexIDMap::train (idx_t n, const float *x)
47 {
48  index->train (n, x);
49  is_trained = index->is_trained;
50 }
51 
52 void IndexIDMap::reset ()
53 {
54  index->reset ();
55  id_map.clear();
56  ntotal = 0;
57 }
58 
59 
60 void IndexIDMap::add_with_ids (idx_t n, const float * x, const long *xids)
61 {
62  index->add (n, x);
63  for (idx_t i = 0; i < n; i++)
64  id_map.push_back (xids[i]);
65  ntotal = index->ntotal;
66 }
67 
68 
69 void IndexIDMap::search (idx_t n, const float *x, idx_t k,
70  float *distances, idx_t *labels) const
71 {
72  index->search (n, x, k, distances, labels);
73  idx_t *li = labels;
74 #pragma omp parallel for
75  for (idx_t i = 0; i < n * k; i++) {
76  li[i] = li[i] < 0 ? li[i] : id_map[li[i]];
77  }
78 }
79 
80 
81 void IndexIDMap::range_search (idx_t n, const float *x, float radius,
82  RangeSearchResult *result) const
83 {
84  index->range_search(n, x, radius, result);
85 #pragma omp parallel for
86  for (idx_t i = 0; i < result->lims[result->nq]; i++) {
87  result->labels[i] = result->labels[i] < 0 ?
88  result->labels[i] : id_map[result->labels[i]];
89  }
90 }
91 
92 namespace {
93 
94 struct IDTranslatedSelector: IDSelector {
95  const std::vector <long> & id_map;
96  const IDSelector & sel;
97  IDTranslatedSelector (const std::vector <long> & id_map,
98  const IDSelector & sel):
99  id_map (id_map), sel (sel)
100  {}
101  bool is_member(idx_t id) const override {
102  return sel.is_member(id_map[id]);
103  }
104 };
105 
106 }
107 
108 long IndexIDMap::remove_ids (const IDSelector & sel)
109 {
110  // remove in sub-index first
111  IDTranslatedSelector sel2 (id_map, sel);
112  long nremove = index->remove_ids (sel2);
113 
114  long j = 0;
115  for (idx_t i = 0; i < ntotal; i++) {
116  if (sel.is_member (id_map[i])) {
117  // remove
118  } else {
119  id_map[j] = id_map[i];
120  j++;
121  }
122  }
123  FAISS_ASSERT (j == index->ntotal);
124  ntotal = j;
125  id_map.resize(ntotal);
126  return nremove;
127 }
128 
129 
130 
131 
132 IndexIDMap::~IndexIDMap ()
133 {
134  if (own_fields) delete index;
135 }
136 
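// Usage sketch (illustrative, not part of the original file; the helper name is
// hypothetical): IndexIDMap wraps any empty, trained Index so that searches
// return caller-chosen ids rather than sequential positions.
static void example_idmap_usage (Index *base, Index::idx_t n,
                                 const float *xb, const long *xids,
                                 const float *xq, Index::idx_t k,
                                 float *distances, Index::idx_t *labels)
{
    IndexIDMap index (base);            // base must be empty on input
    index.add_with_ids (n, xb, xids);   // vectors go to base, ids to id_map
    index.search (1, xq, k, distances, labels);  // labels are drawn from xids
}
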
137 /*****************************************************
138  * IndexIDMap2 implementation
139  *******************************************************/
140 
141 IndexIDMap2::IndexIDMap2 (Index *index): IndexIDMap (index)
142 {}
143 
144 void IndexIDMap2::add_with_ids(idx_t n, const float* x, const long* xids)
145 {
146  size_t prev_ntotal = ntotal;
147  IndexIDMap::add_with_ids (n, x, xids);
148  for (size_t i = prev_ntotal; i < ntotal; i++) {
149  rev_map [id_map [i]] = i;
150  }
151 }
152 
153 void IndexIDMap2::construct_rev_map ()
154 {
155  rev_map.clear ();
156  for (size_t i = 0; i < ntotal; i++) {
157  rev_map [id_map [i]] = i;
158  }
159 }
160 
161 
162 long IndexIDMap2::remove_ids (const IDSelector & sel)
163 {
164  // This is quite inefficient
165  long nremove = IndexIDMap::remove_ids (sel);
166  construct_rev_map ();
167  return nremove;
168 }
169 
170 void IndexIDMap2::reconstruct (idx_t key, float * recons) const
171 {
172  try {
173  index->reconstruct (rev_map.at (key), recons);
174  } catch (const std::out_of_range& e) {
175  FAISS_THROW_FMT ("key %ld not found", key);
176  }
177 }
178 
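// Usage sketch (illustrative, not part of the original file; the helper name is
// hypothetical): IndexIDMap2 additionally maintains rev_map, so a stored vector
// can be reconstructed by its user id, provided the wrapped index implements
// reconstruct() (e.g. a flat index).
static void example_idmap2_reconstruct (Index *base, Index::idx_t n,
                                        const float *xb, const long *xids,
                                        float *recons)
{
    IndexIDMap2 index (base);
    index.add_with_ids (n, xb, xids);    // fills both id_map and rev_map
    index.reconstruct (xids[0], recons); // throws if the id is not in rev_map
}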
179 
180 
181 /*****************************************************
182  * IndexShards implementation
183  *******************************************************/
184 
185 // subroutines
186 namespace {
187 
188 
189 typedef Index::idx_t idx_t;
190 
191 
192 template<class Job>
193 struct Thread {
194  Job job;
195  pthread_t thread;
196 
197  Thread () {}
198 
199  explicit Thread (const Job & job): job(job) {}
200 
201  void start () {
202  pthread_create (&thread, nullptr, run, this);
203  }
204 
205  void wait () {
206  pthread_join (thread, nullptr);
207  }
208 
209  static void * run (void *arg) {
210  static_cast<Thread*> (arg)->job.run();
211  return nullptr;
212  }
213 
214 };
215 
216 
217 /// callback + thread management to train 1 shard
218 struct TrainJob {
219  IndexShards *index; // the relevant index
220  int no; // shard number
221  idx_t n; // train points
222  const float *x;
223 
224  void run ()
225  {
226  if (index->verbose)
227  printf ("begin train shard %d on %ld points\n", no, n);
228  index->shard_indexes [no]->train(n, x);
229  if (index->verbose)
230  printf ("end train shard %d\n", no);
231  }
232 
233 };
234 
235 struct AddJob {
236  IndexShards *index; // the relevant index
237  int no; // shard number
238 
239  idx_t n;
240  const float *x;
241  const idx_t *ids;
242 
243  void run ()
244  {
245  if (index->verbose)
246  printf ("begin add shard %d on %ld points\n", no, n);
247  if (ids)
248  index->shard_indexes[no]->add_with_ids (n, x, ids);
249  else
250  index->shard_indexes[no]->add (n, x);
251  if (index->verbose)
252  printf ("end add shard %d on %ld points\n", no, n);
253  }
254 };
255 
256 
257 
258 /// callback + thread management to query in 1 shard
259 struct QueryJob {
260  const IndexShards *index; // the relevant index
261  int no; // shard number
262 
263  // query params
264  idx_t n;
265  const float *x;
266  idx_t k;
267  float *distances;
268  idx_t *labels;
269 
270 
271  void run ()
272  {
273  if (index->verbose)
274  printf ("begin query shard %d on %ld points\n", no, n);
275  index->shard_indexes [no]->search (n, x, k,
276  distances, labels);
277  if (index->verbose)
278  printf ("end query shard %d\n", no);
279  }
280 
281 
282 };
283 
284 
285 
286 
287 // add translation to all valid labels
288 void translate_labels (long n, idx_t *labels, long translation)
289 {
290  if (translation == 0) return;
291  for (long i = 0; i < n; i++) {
292  if(labels[i] < 0) continue;
293  labels[i] += translation;
294  }
295 }
296 
297 
298 /** merge result tables from several shards.
299  * @param all_distances size nshard * n * k
300  * @param all_labels idem
301  * @param translations label translations to apply, size nshard
302  */
303 
304 template <class C>
305 void merge_tables (long n, long k, long nshard,
306  float *distances, idx_t *labels,
307  const float *all_distances,
308  idx_t *all_labels,
309  const long *translations)
310 {
311  if(k == 0) {
312  return;
313  }
314 
315  long stride = n * k;
316 #pragma omp parallel
317  {
318  std::vector<int> buf (2 * nshard);
319  int * pointer = buf.data();
320  int * shard_ids = pointer + nshard;
321  std::vector<float> buf2 (nshard);
322  float * heap_vals = buf2.data();
323 #pragma omp for
324  for (long i = 0; i < n; i++) {
325  // the heap maps values to the shard where they are
326  // produced.
327  const float *D_in = all_distances + i * k;
328  const idx_t *I_in = all_labels + i * k;
329  int heap_size = 0;
330 
331  for (long s = 0; s < nshard; s++) {
332  pointer[s] = 0;
333  if (I_in[stride * s] >= 0)
334  heap_push<C> (++heap_size, heap_vals, shard_ids,
335  D_in[stride * s], s);
336  }
337 
338  float *D = distances + i * k;
339  idx_t *I = labels + i * k;
340 
341  for (int j = 0; j < k; j++) {
342  if (heap_size == 0) {
343  I[j] = -1;
344  D[j] = C::neutral();
345  } else {
346  // pop best element
347  int s = shard_ids[0];
348  int & p = pointer[s];
349  D[j] = heap_vals[0];
350  I[j] = I_in[stride * s + p] + translations[s];
351 
352  heap_pop<C> (heap_size--, heap_vals, shard_ids);
353  p++;
354  if (p < k && I_in[stride * s + p] >= 0)
355  heap_push<C> (++heap_size, heap_vals, shard_ids,
356  D_in[stride * s + p], s);
357  }
358  }
359  }
360  }
361 }
362 
363 
364 } // anonymous namespace
365 
366 
367 
368 
369 IndexShards::IndexShards (idx_t d, bool threaded, bool successive_ids):
370  Index (d), own_fields (false),
371  threaded (threaded), successive_ids (successive_ids)
372 {
373 
374 }
375 
376 void IndexShards::add_shard (Index *idx)
377 {
378  shard_indexes.push_back (idx);
379  sync_with_shard_indexes ();
380 }
381 
382 void IndexShards::sync_with_shard_indexes ()
383 {
384  if (shard_indexes.empty()) return;
385  Index * index0 = shard_indexes[0];
386  d = index0->d;
387  metric_type = index0->metric_type;
388  is_trained = index0->is_trained;
389  ntotal = index0->ntotal;
390  for (int i = 1; i < shard_indexes.size(); i++) {
391  Index * index = shard_indexes[i];
392  FAISS_THROW_IF_NOT (metric_type == index->metric_type);
393  FAISS_THROW_IF_NOT (d == index->d);
394  ntotal += index->ntotal;
395  }
396 }
397 
398 
399 void IndexShards::train (idx_t n, const float *x)
400 {
401 
402  // pre-alloc because we don't want reallocs
403  std::vector<Thread<TrainJob > > tss (shard_indexes.size());
404  int nt = 0;
405  for (int i = 0; i < shard_indexes.size(); i++) {
406  if(!shard_indexes[i]->is_trained) {
407  TrainJob ts = {this, i, n, x};
408  if (threaded) {
409  tss[nt] = Thread<TrainJob> (ts);
410  tss[nt++].start();
411  } else {
412  ts.run();
413  }
414  }
415  }
416  for (int i = 0; i < nt; i++) {
417  tss[i].wait();
418  }
419  sync_with_shard_indexes ();
420 }
421 
422 void IndexShards::add (idx_t n, const float *x)
423 {
424  add_with_ids (n, x, nullptr);
425 }
426 
427 
428 void IndexShards::add_with_ids (idx_t n, const float * x, const long *xids)
429 {
430 
431  FAISS_THROW_IF_NOT_MSG(!(successive_ids && xids),
432  "It makes no sense to pass in ids and "
433  "request them to be shifted");
434 
435  if (successive_ids) {
436  FAISS_THROW_IF_NOT_MSG(!xids,
437  "It makes no sense to pass in ids and "
438  "request them to be shifted");
439  FAISS_THROW_IF_NOT_MSG(ntotal == 0,
440  "when adding to IndexShards with successive_ids, "
441  "only add() in a single pass is supported");
442  }
443 
444  long nshard = shard_indexes.size();
445  const long *ids = xids;
446  ScopeDeleter<long> del;
447  if (!ids && !successive_ids) {
448  long *aids = new long[n];
449  for (long i = 0; i < n; i++)
450  aids[i] = ntotal + i;
451  ids = aids;
452  del.set (ids);
453  }
454 
455  std::vector<Thread<AddJob > > asa (shard_indexes.size());
456  int nt = 0;
457  for (int i = 0; i < nshard; i++) {
458  long i0 = i * n / nshard;
459  long i1 = (i + 1) * n / nshard;
460 
461  AddJob as = {this, i,
462  i1 - i0, x + i0 * d,
463  ids ? ids + i0 : nullptr};
464  if (threaded) {
465  asa[nt] = Thread<AddJob>(as);
466  asa[nt++].start();
467  } else {
468  as.run();
469  }
470  }
471  for (int i = 0; i < nt; i++) {
472  asa[i].wait();
473  }
474  ntotal += n;
475 }
476 
477 
478 
479 
480 
481 void IndexShards::reset ()
482 {
483  for (int i = 0; i < shard_indexes.size(); i++) {
484  shard_indexes[i]->reset ();
485  }
486  sync_with_shard_indexes ();
487 }
488 
489 void IndexShards::search (
490  idx_t n, const float *x, idx_t k,
491  float *distances, idx_t *labels) const
492 {
493  long nshard = shard_indexes.size();
494  float *all_distances = new float [nshard * k * n];
495  idx_t *all_labels = new idx_t [nshard * k * n];
496  ScopeDeleter<float> del (all_distances);
497  ScopeDeleter<idx_t> del2 (all_labels);
498 
499 #if 1
500 
501  // pre-alloc because we don't want reallocs
502  std::vector<Thread<QueryJob> > qss (nshard);
503  for (int i = 0; i < nshard; i++) {
504  QueryJob qs = {
505  this, i, n, x, k,
506  all_distances + i * k * n,
507  all_labels + i * k * n
508  };
509  if (threaded) {
510  qss[i] = Thread<QueryJob> (qs);
511  qss[i].start();
512  } else {
513  qs.run();
514  }
515  }
516 
517  if (threaded) {
518  for (int i = 0; i < qss.size(); i++) {
519  qss[i].wait();
520  }
521  }
522 #else
523 
524  // pre-alloc because we don't want reallocs
525  std::vector<QueryJob> qss (nshard);
526  for (int i = 0; i < nshard; i++) {
527  QueryJob qs = {
528  this, i, n, x, k,
529  all_distances + i * k * n,
530  all_labels + i * k * n
531  };
532  if (threaded) {
533  qss[i] = qs;
534  } else {
535  qs.run();
536  }
537  }
538 
539  if (threaded) {
540 #pragma omp parallel for
541  for (int i = 0; i < qss.size(); i++) {
542  qss[i].run();
543  }
544  }
545 
546 #endif
547  std::vector<long> translations (nshard, 0);
548  if (successive_ids) {
549  translations[0] = 0;
550  for (int s = 0; s + 1 < nshard; s++)
551  translations [s + 1] = translations [s] +
552  shard_indexes [s]->ntotal;
553  }
554 
555  if (metric_type == METRIC_L2) {
556  merge_tables< CMin<float, int> > (
557  n, k, nshard, distances, labels,
558  all_distances, all_labels, translations.data ());
559  } else {
560  merge_tables< CMax<float, int> > (
561  n, k, nshard, distances, labels,
562  all_distances, all_labels, translations.data ());
563  }
564 
565 }
566 
567 
568 
569 IndexShards::~IndexShards ()
570 {
571  if (own_fields) {
572  for (int s = 0; s < shard_indexes.size(); s++)
573  delete shard_indexes [s];
574  }
575 }
576 
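// Usage sketch (illustrative, not part of the original file; the helper name is
// hypothetical): IndexShards spreads added vectors over caller-provided
// sub-indexes that share d and metric_type, then merges their k-NN results.
static void example_shards_usage (Index *shard0, Index *shard1,
                                  Index::idx_t n, const float *xb,
                                  const float *xq, Index::idx_t k,
                                  float *distances, Index::idx_t *labels)
{
    IndexShards shards (shard0->d, false /* threaded */,
                        true /* successive_ids */);
    shards.add_shard (shard0);
    shards.add_shard (shard1);
    shards.add (n, xb);                    // split evenly over the two shards
    shards.search (1, xq, k, distances, labels);
}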
577 
578 /*****************************************************
579  * IndexSplitVectors implementation
580  *******************************************************/
581 
582 
583 IndexSplitVectors::IndexSplitVectors (idx_t d, bool threaded):
584  Index (d), own_fields (false),
585  threaded (threaded), sum_d (0)
586 {
587 
588 }
589 
590 void IndexSplitVectors::add_sub_index (Index *index)
591 {
592  sub_indexes.push_back (index);
593  sync_with_sub_indexes ();
594 }
595 
596 void IndexSplitVectors::sync_with_sub_indexes ()
597 {
598  if (sub_indexes.empty()) return;
599  Index * index0 = sub_indexes[0];
600  sum_d = index0->d;
601  metric_type = index0->metric_type;
602  is_trained = index0->is_trained;
603  ntotal = index0->ntotal;
604  for (int i = 1; i < sub_indexes.size(); i++) {
605  Index * index = sub_indexes[i];
606  FAISS_THROW_IF_NOT (metric_type == index->metric_type);
607  FAISS_THROW_IF_NOT (ntotal == index->ntotal);
608  sum_d += index->d;
609  }
610 
611 }
612 
613 void IndexSplitVectors::add(idx_t /*n*/, const float* /*x*/) {
614  FAISS_THROW_MSG("not implemented");
615 }
616 
617 namespace {
618 
619 /// callback + thread management to query in 1 shard
620 struct SplitQueryJob {
621  const IndexSplitVectors *index; // the relevant index
622  int no; // shard number
623 
624  // query params
625  idx_t n;
626  const float *x;
627  idx_t k;
628  float *distances;
629  idx_t *labels;
630 
631 
632  void run ()
633  {
634  if (index->verbose)
635  printf ("begin query shard %d on %ld points\n", no, n);
636  const Index * sub_index = index->sub_indexes[no];
637  long sub_d = sub_index->d, d = index->d;
638  idx_t ofs = 0;
639  for (int i = 0; i < no; i++) ofs += index->sub_indexes[i]->d;
640  float *sub_x = new float [sub_d * n];
641  ScopeDeleter<float> del (sub_x);
642  for (idx_t i = 0; i < n; i++)
643  memcpy (sub_x + i * sub_d, x + ofs + i * d, sub_d * sizeof (float));
644  sub_index->search (n, sub_x, k, distances, labels);
645  if (index->verbose)
646  printf ("end query shard %d\n", no);
647  }
648 
649 };
650 
651 
652 
653 }
654 
655 
656 
657 void IndexSplitVectors::search (
658  idx_t n, const float *x, idx_t k,
659  float *distances, idx_t *labels) const
660 {
661  FAISS_THROW_IF_NOT_MSG (k == 1,
662  "search implemented only for k=1");
663  FAISS_THROW_IF_NOT_MSG (sum_d == d,
664  "not enough indexes compared to # dimensions");
665 
666  long nshard = sub_indexes.size();
667  float *all_distances = new float [nshard * k * n];
668  idx_t *all_labels = new idx_t [nshard * k * n];
669  ScopeDeleter<float> del (all_distances);
670  ScopeDeleter<idx_t> del2 (all_labels);
671 
672  // pre-alloc because we don't want reallocs
673  std::vector<Thread<SplitQueryJob> > qss (nshard);
674  for (int i = 0; i < nshard; i++) {
675  SplitQueryJob qs = {
676  this, i, n, x, k,
677  i == 0 ? distances : all_distances + i * k * n,
678  i == 0 ? labels : all_labels + i * k * n
679  };
680  if (threaded) {
681  qss[i] = Thread<SplitQueryJob> (qs);
682  qss[i].start();
683  } else {
684  qs.run();
685  }
686  }
687 
688  if (threaded) {
689  for (int i = 0; i < qss.size(); i++) {
690  qss[i].wait();
691  }
692  }
693 
694  long factor = 1;
695  for (int i = 0; i < nshard; i++) {
696  if (i > 0) { // results of 0 are already in the table
697  const float *distances_i = all_distances + i * k * n;
698  const idx_t *labels_i = all_labels + i * k * n;
699  for (long j = 0; j < n; j++) {
700  if (labels[j] >= 0 && labels_i[j] >= 0) {
701  labels[j] += labels_i[j] * factor;
702  distances[j] += distances_i[j];
703  } else {
704  labels[j] = -1;
705  distances[j] = 0.0 / 0.0;
706  }
707  }
708  }
709  factor *= sub_indexes[i]->ntotal;
710  }
711 
712 }
713 
714 void IndexSplitVectors::train(idx_t /*n*/, const float* /*x*/) {
715  FAISS_THROW_MSG("not implemented");
716 }
717 
718 void IndexSplitVectors::reset ()
719 {
720  FAISS_THROW_MSG ("not implemented");
721 }
722 
723 
724 IndexSplitVectors::~IndexSplitVectors ()
725 {
726  if (own_fields) {
727  for (int s = 0; s < sub_indexes.size(); s++)
728  delete sub_indexes [s];
729  }
730 }
731 
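// Usage sketch (illustrative, not part of the original file; the helper name is
// hypothetical): IndexSplitVectors concatenates sub-indexes along the dimension
// axis. The sub-indexes must be populated beforehand with the same number of
// vectors (add() above is not implemented), and only k == 1 is supported.
static void example_split_vectors_usage (Index *first_dims, Index *last_dims,
                                         const float *xq,
                                         float *distance, Index::idx_t *label)
{
    IndexSplitVectors split (first_dims->d + last_dims->d, false /* threaded */);
    split.add_sub_index (first_dims);  // covers dimensions [0, first_dims->d)
    split.add_sub_index (last_dims);   // covers the remaining dimensions
    split.search (1, xq, 1, distance, label);
}
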
732 
733 } // namespace faiss