Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/MetaIndexes.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved
10 // -*- c++ -*-
11 
#include "MetaIndexes.h"

#include <pthread.h>

#include <cstdio>
#include <cstring>
#include <limits>

#include "FaissAssert.h"
#include "Heap.h"
#include "AuxIndexStructures.h"
21 
22 
23 namespace faiss {
24 
25 /*****************************************************
26  * IndexIDMap implementation
27  *******************************************************/
28 
29 IndexIDMap::IndexIDMap (Index *index):
30  index (index),
31  own_fields (false)
32 {
33  FAISS_THROW_IF_NOT_MSG (index->ntotal == 0, "index must be empty on input");
34  is_trained = index->is_trained;
35  metric_type = index->metric_type;
36  verbose = index->verbose;
37  d = index->d;
38 }
39 
40 void IndexIDMap::add (idx_t, const float *)
41 {
42  FAISS_THROW_MSG ("add does not make sense with IndexIDMap, "
43  "use add_with_ids");
44 }
45 
46 
47 void IndexIDMap::train (idx_t n, const float *x)
48 {
49  index->train (n, x);
50  is_trained = index->is_trained;
51 }
52 
54 {
55  index->reset ();
56  ntotal = 0;
57 }
58 
59 
60 void IndexIDMap::add_with_ids (idx_t n, const float * x, const long *xids)
61 {
62  index->add (n, x);
63  for (idx_t i = 0; i < n; i++)
64  id_map.push_back (xids[i]);
65  ntotal = index->ntotal;
66 }
67 
68 
69 void IndexIDMap::search (idx_t n, const float *x, idx_t k,
70  float *distances, idx_t *labels) const
71 {
72  index->search (n, x, k, distances, labels);
73  idx_t *li = labels;
74  for (idx_t i = 0; i < n * k; i++) {
75  li[i] = li[i] < 0 ? li[i] : id_map[li[i]];
76  }
77 }
78 
79 
80 void IndexIDMap::range_search (idx_t n, const float *x, float radius,
81  RangeSearchResult *result) const
82 {
83  index->range_search(n, x, radius, result);
84  for (idx_t i = 0; i < result->lims[result->nq]; i++) {
85  result->labels[i] = result->labels[i] < 0 ?
86  result->labels[i] : id_map[result->labels[i]];
87  }
88 }
89 
90 namespace {
91 
92 struct IDTranslatedSelector: IDSelector {
93  const std::vector <long> & id_map;
94  const IDSelector & sel;
95  IDTranslatedSelector (const std::vector <long> & id_map,
96  const IDSelector & sel):
97  id_map (id_map), sel (sel)
98  {}
99  bool is_member(idx_t id) const override {
100  return sel.is_member(id_map[id]);
101  }
102 };
103 
104 }
105 
107 {
108  // remove in sub-index first
109  IDTranslatedSelector sel2 (id_map, sel);
110  long nremove = index->remove_ids (sel2);
111 
112  long j = 0;
113  for (idx_t i = 0; i < ntotal; i++) {
114  if (sel.is_member (id_map[i])) {
115  // remove
116  } else {
117  id_map[j] = id_map[i];
118  j++;
119  }
120  }
121  FAISS_ASSERT (j == index->ntotal);
122  ntotal = j;
123  id_map.resize(ntotal);
124  return nremove;
125 }
126 
127 
128 
129 
130 IndexIDMap::~IndexIDMap ()
131 {
132  if (own_fields) delete index;
133 }
134 
135 /*****************************************************
136  * IndexIDMap2 implementation
137  *******************************************************/
138 
139 IndexIDMap2::IndexIDMap2 (Index *index): IndexIDMap (index)
140 {}
141 
142 void IndexIDMap2::add_with_ids(idx_t n, const float* x, const long* xids)
143 {
144  size_t prev_ntotal = ntotal;
145  IndexIDMap::add_with_ids (n, x, xids);
146  for (size_t i = prev_ntotal; i < ntotal; i++) {
147  rev_map [id_map [i]] = i;
148  }
149 }
150 
152 {
153  rev_map.clear ();
154  for (size_t i = 0; i < ntotal; i++) {
155  rev_map [id_map [i]] = i;
156  }
157 }
158 
159 
161 {
162  // This is quite inefficient
163  long nremove = IndexIDMap::remove_ids (sel);
165  return nremove;
166 }
167 
168 void IndexIDMap2::reconstruct (idx_t key, float * recons) const
169 {
170  try {
171  index->reconstruct (rev_map.at (key), recons);
172  } catch (const std::out_of_range& e) {
173  FAISS_THROW_FMT ("key %ld not found", key);
174  }
175 }
176 
177 
178 
179 /*****************************************************
180  * IndexShards implementation
181  *******************************************************/
182 
183 // subroutines
184 namespace {
185 
186 
187 typedef Index::idx_t idx_t;
188 
189 
/// Minimal pthread wrapper that runs job.run() on its own thread.
/// A started Thread passes a pointer to itself to the new thread, so the
/// object must not be moved or destroyed between start() and wait().
template<class Job>
struct Thread {
    Job job;
    pthread_t thread;
    bool joinable = false; ///< true only while a created thread awaits join

    Thread () {}

    explicit Thread (const Job & job): job(job) {}

    /// Launch job.run() on a new thread. If the thread cannot be created,
    /// the job is run synchronously instead, so that its results are still
    /// complete once wait() returns.
    void start () {
        joinable = pthread_create (&thread, nullptr, run, this) == 0;
        if (!joinable) {
            job.run();
        }
    }

    /// Block until the job has finished; no-op when no thread is pending.
    void wait () {
        if (joinable) {
            pthread_join (thread, nullptr);
            joinable = false;
        }
    }

    /// pthread entry point: arg is the Thread object that was started.
    static void * run (void *arg) {
        static_cast<Thread*> (arg)->job.run();
        return nullptr;
    }

};
213 
214 
215 /// callback + thread management to train 1 shard
216 struct TrainJob {
217  IndexShards *index; // the relevant index
218  int no; // shard number
219  idx_t n; // train points
220  const float *x;
221 
222  void run ()
223  {
224  if (index->verbose)
225  printf ("begin train shard %d on %ld points\n", no, n);
226  index->shard_indexes [no]->train(n, x);
227  if (index->verbose)
228  printf ("end train shard %d\n", no);
229  }
230 
231 };
232 
233 struct AddJob {
234  IndexShards *index; // the relevant index
235  int no; // shard number
236 
237  idx_t n;
238  const float *x;
239  const idx_t *ids;
240 
241  void run ()
242  {
243  if (index->verbose)
244  printf ("begin add shard %d on %ld points\n", no, n);
245  if (ids)
246  index->shard_indexes[no]->add_with_ids (n, x, ids);
247  else
248  index->shard_indexes[no]->add (n, x);
249  if (index->verbose)
250  printf ("end add shard %d on %ld points\n", no, n);
251  }
252 };
253 
254 
255 
256 /// callback + thread management to query in 1 shard
257 struct QueryJob {
258  const IndexShards *index; // the relevant index
259  int no; // shard number
260 
261  // query params
262  idx_t n;
263  const float *x;
264  idx_t k;
265  float *distances;
266  idx_t *labels;
267 
268 
269  void run ()
270  {
271  if (index->verbose)
272  printf ("begin query shard %d on %ld points\n", no, n);
273  index->shard_indexes [no]->search (n, x, k,
274  distances, labels);
275  if (index->verbose)
276  printf ("end query shard %d\n", no);
277  }
278 
279 
280 };
281 
282 
283 
284 
285 // add translation to all valid labels
286 void translate_labels (long n, idx_t *labels, long translation)
287 {
288  if (translation == 0) return;
289  for (long i = 0; i < n; i++) {
290  if(labels[i] < 0) return;
291  labels[i] += translation;
292  }
293 }
294 
295 
296 /** merge result tables from several shards.
297  * @param all_distances size nshard * n * k
298  * @param all_labels idem
299  * @param translartions label translations to apply, size nshard
300  */
301 
302 template <class C>
303 void merge_tables (long n, long k, long nshard,
304  float *distances, idx_t *labels,
305  const float *all_distances,
306  idx_t *all_labels,
307  const long *translations)
308 {
309  if(k == 0) {
310  return;
311  }
312 
313  long stride = n * k;
314 #pragma omp parallel
315  {
316  std::vector<int> buf (2 * nshard);
317  int * pointer = buf.data();
318  int * shard_ids = pointer + nshard;
319  std::vector<float> buf2 (nshard);
320  float * heap_vals = buf2.data();
321 #pragma omp for
322  for (long i = 0; i < n; i++) {
323  // the heap maps values to the shard where they are
324  // produced.
325  const float *D_in = all_distances + i * k;
326  const idx_t *I_in = all_labels + i * k;
327  int heap_size = 0;
328 
329  for (long s = 0; s < nshard; s++) {
330  pointer[s] = 0;
331  if (I_in[stride * s] >= 0)
332  heap_push<C> (++heap_size, heap_vals, shard_ids,
333  D_in[stride * s], s);
334  }
335 
336  float *D = distances + i * k;
337  idx_t *I = labels + i * k;
338 
339  for (int j = 0; j < k; j++) {
340  if (heap_size == 0) {
341  I[j] = -1;
342  D[j] = C::neutral();
343  } else {
344  // pop best element
345  int s = shard_ids[0];
346  int & p = pointer[s];
347  D[j] = heap_vals[0];
348  I[j] = I_in[stride * s + p] + translations[s];
349 
350  heap_pop<C> (heap_size--, heap_vals, shard_ids);
351  p++;
352  if (p < k && I_in[stride * s + p] >= 0)
353  heap_push<C> (++heap_size, heap_vals, shard_ids,
354  D_in[stride * s + p], s);
355  }
356  }
357  }
358  }
359 }
360 
361 
362 };
363 
364 
365 
366 
367 IndexShards::IndexShards (idx_t d, bool threaded, bool successive_ids):
368  Index (d), own_fields (false),
369  threaded (threaded), successive_ids (successive_ids)
370 {
371 
372 }
373 
374 void IndexShards::add_shard (Index *idx)
375 {
376  shard_indexes.push_back (idx);
377  sync_with_shard_indexes ();
378 }
379 
380 void IndexShards::sync_with_shard_indexes ()
381 {
382  if (shard_indexes.empty()) return;
383  Index * index0 = shard_indexes[0];
384  d = index0->d;
385  metric_type = index0->metric_type;
386  is_trained = index0->is_trained;
387  ntotal = index0->ntotal;
388  for (int i = 1; i < shard_indexes.size(); i++) {
389  Index * index = shard_indexes[i];
390  FAISS_THROW_IF_NOT (metric_type == index->metric_type);
391  FAISS_THROW_IF_NOT (d == index->d);
392  ntotal += index->ntotal;
393  }
394 }
395 
396 
397 void IndexShards::train (idx_t n, const float *x)
398 {
399 
400  // pre-alloc because we don't want reallocs
401  std::vector<Thread<TrainJob > > tss (shard_indexes.size());
402  int nt = 0;
403  for (int i = 0; i < shard_indexes.size(); i++) {
404  if(!shard_indexes[i]->is_trained) {
405  TrainJob ts = {this, i, n, x};
406  if (threaded) {
407  tss[nt] = Thread<TrainJob> (ts);
408  tss[nt++].start();
409  } else {
410  ts.run();
411  }
412  }
413  }
414  for (int i = 0; i < nt; i++) {
415  tss[i].wait();
416  }
417  sync_with_shard_indexes ();
418 }
419 
420 void IndexShards::add (idx_t n, const float *x)
421 {
422  add_with_ids (n, x, nullptr);
423 }
424 
425 
426 void IndexShards::add_with_ids (idx_t n, const float * x, const long *xids)
427 {
428 
429  FAISS_THROW_IF_NOT_MSG(!(successive_ids && xids),
430  "It makes no sense to pass in ids and "
431  "request them to be shifted");
432 
433  if (successive_ids) {
434  FAISS_THROW_IF_NOT_MSG(!xids,
435  "It makes no sense to pass in ids and "
436  "request them to be shifted");
437  FAISS_THROW_IF_NOT_MSG(ntotal == 0,
438  "when adding to IndexShards with sucessive_ids, "
439  "only add() in a single pass is supported");
440  }
441 
442  long nshard = shard_indexes.size();
443  const long *ids = xids;
444  ScopeDeleter<long> del;
445  if (!ids && !successive_ids) {
446  long *aids = new long[n];
447  for (long i = 0; i < n; i++)
448  aids[i] = ntotal + i;
449  ids = aids;
450  del.set (ids);
451  }
452 
453  std::vector<Thread<AddJob > > asa (shard_indexes.size());
454  int nt = 0;
455  for (int i = 0; i < nshard; i++) {
456  long i0 = i * n / nshard;
457  long i1 = (i + 1) * n / nshard;
458 
459  AddJob as = {this, i,
460  i1 - i0, x + i0 * d,
461  ids ? ids + i0 : nullptr};
462  if (threaded) {
463  asa[nt] = Thread<AddJob>(as);
464  asa[nt++].start();
465  } else {
466  as.run();
467  }
468  }
469  for (int i = 0; i < nt; i++) {
470  asa[i].wait();
471  }
472  ntotal += n;
473 }
474 
475 
476 
477 
478 
480 {
481  for (int i = 0; i < shard_indexes.size(); i++) {
482  shard_indexes[i]->reset ();
483  }
484  sync_with_shard_indexes ();
485 }
486 
488  idx_t n, const float *x, idx_t k,
489  float *distances, idx_t *labels) const
490 {
491  long nshard = shard_indexes.size();
492  float *all_distances = new float [nshard * k * n];
493  idx_t *all_labels = new idx_t [nshard * k * n];
494  ScopeDeleter<float> del (all_distances);
495  ScopeDeleter<idx_t> del2 (all_labels);
496 
497 #if 1
498 
499  // pre-alloc because we don't want reallocs
500  std::vector<Thread<QueryJob> > qss (nshard);
501  for (int i = 0; i < nshard; i++) {
502  QueryJob qs = {
503  this, i, n, x, k,
504  all_distances + i * k * n,
505  all_labels + i * k * n
506  };
507  if (threaded) {
508  qss[i] = Thread<QueryJob> (qs);
509  qss[i].start();
510  } else {
511  qs.run();
512  }
513  }
514 
515  if (threaded) {
516  for (int i = 0; i < qss.size(); i++) {
517  qss[i].wait();
518  }
519  }
520 #else
521 
522  // pre-alloc because we don't want reallocs
523  std::vector<QueryJob> qss (nshard);
524  for (int i = 0; i < nshard; i++) {
525  QueryJob qs = {
526  this, i, n, x, k,
527  all_distances + i * k * n,
528  all_labels + i * k * n
529  };
530  if (threaded) {
531  qss[i] = qs;
532  } else {
533  qs.run();
534  }
535  }
536 
537  if (threaded) {
538 #pragma omp parallel for
539  for (int i = 0; i < qss.size(); i++) {
540  qss[i].run();
541  }
542  }
543 
544 #endif
545  std::vector<long> translations (nshard, 0);
546  if (successive_ids) {
547  translations[0] = 0;
548  for (int s = 0; s + 1 < nshard; s++)
549  translations [s + 1] = translations [s] +
550  shard_indexes [s]->ntotal;
551  }
552 
553  if (metric_type == METRIC_L2) {
554  merge_tables< CMin<float, int> > (
555  n, k, nshard, distances, labels,
556  all_distances, all_labels, translations.data ());
557  } else {
558  merge_tables< CMax<float, int> > (
559  n, k, nshard, distances, labels,
560  all_distances, all_labels, translations.data ());
561  }
562 
563 }
564 
565 
566 
567 IndexShards::~IndexShards ()
568 {
569  if (own_fields) {
570  for (int s = 0; s < shard_indexes.size(); s++)
571  delete shard_indexes [s];
572  }
573 }
574 
575 
576 /*****************************************************
577  * IndexSplitVectors implementation
578  *******************************************************/
579 
580 
581 IndexSplitVectors::IndexSplitVectors (idx_t d, bool threaded):
582  Index (d), own_fields (false),
583  threaded (threaded), sum_d (0)
584 {
585 
586 }
587 
588 void IndexSplitVectors::add_sub_index (Index *index)
589 {
590  sub_indexes.push_back (index);
591  sync_with_sub_indexes ();
592 }
593 
594 void IndexSplitVectors::sync_with_sub_indexes ()
595 {
596  if (sub_indexes.empty()) return;
597  Index * index0 = sub_indexes[0];
598  sum_d = index0->d;
599  metric_type = index0->metric_type;
600  is_trained = index0->is_trained;
601  ntotal = index0->ntotal;
602  for (int i = 1; i < sub_indexes.size(); i++) {
603  Index * index = sub_indexes[i];
604  FAISS_THROW_IF_NOT (metric_type == index->metric_type);
605  FAISS_THROW_IF_NOT (ntotal == index->ntotal);
606  sum_d += index->d;
607  }
608 
609 }
610 
611 void IndexSplitVectors::add(idx_t /*n*/, const float* /*x*/) {
612  FAISS_THROW_MSG("not implemented");
613 }
614 
615 namespace {
616 
617 /// callback + thread management to query in 1 shard
618 struct SplitQueryJob {
619  const IndexSplitVectors *index; // the relevant index
620  int no; // shard number
621 
622  // query params
623  idx_t n;
624  const float *x;
625  idx_t k;
626  float *distances;
627  idx_t *labels;
628 
629 
630  void run ()
631  {
632  if (index->verbose)
633  printf ("begin query shard %d on %ld points\n", no, n);
634  const Index * sub_index = index->sub_indexes[no];
635  long sub_d = sub_index->d, d = index->d;
636  idx_t ofs = 0;
637  for (int i = 0; i < no; i++) ofs += index->sub_indexes[i]->d;
638  float *sub_x = new float [sub_d * n];
639  ScopeDeleter<float> del (sub_x);
640  for (idx_t i = 0; i < n; i++)
641  memcpy (sub_x + i * sub_d, x + ofs + i * d, sub_d * sizeof (sub_x));
642  sub_index->search (n, sub_x, k, distances, labels);
643  if (index->verbose)
644  printf ("end query shard %d\n", no);
645  }
646 
647 };
648 
649 
650 
651 }
652 
653 
654 
656  idx_t n, const float *x, idx_t k,
657  float *distances, idx_t *labels) const
658 {
659  FAISS_THROW_IF_NOT_MSG (k == 1,
660  "search implemented only for k=1");
661  FAISS_THROW_IF_NOT_MSG (sum_d == d,
662  "not enough indexes compared to # dimensions");
663 
664  long nshard = sub_indexes.size();
665  float *all_distances = new float [nshard * k * n];
666  idx_t *all_labels = new idx_t [nshard * k * n];
667  ScopeDeleter<float> del (all_distances);
668  ScopeDeleter<idx_t> del2 (all_labels);
669 
670  // pre-alloc because we don't want reallocs
671  std::vector<Thread<SplitQueryJob> > qss (nshard);
672  for (int i = 0; i < nshard; i++) {
673  SplitQueryJob qs = {
674  this, i, n, x, k,
675  i == 0 ? distances : all_distances + i * k * n,
676  i == 0 ? labels : all_labels + i * k * n
677  };
678  if (threaded) {
679  qss[i] = Thread<SplitQueryJob> (qs);
680  qss[i].start();
681  } else {
682  qs.run();
683  }
684  }
685 
686  if (threaded) {
687  for (int i = 0; i < qss.size(); i++) {
688  qss[i].wait();
689  }
690  }
691 
692  long factor = 1;
693  for (int i = 0; i < nshard; i++) {
694  if (i > 0) { // results of 0 are already in the table
695  const float *distances_i = all_distances + i * k * n;
696  const idx_t *labels_i = all_labels + i * k * n;
697  for (long j = 0; j < n; j++) {
698  if (labels[j] >= 0 && labels_i[j] >= 0) {
699  labels[j] += labels_i[j] * factor;
700  distances[j] += distances_i[j];
701  } else {
702  labels[j] = -1;
703  distances[j] = 0.0 / 0.0;
704  }
705  }
706  }
707  factor *= sub_indexes[i]->ntotal;
708  }
709 
710 }
711 
712 void IndexSplitVectors::train(idx_t /*n*/, const float* /*x*/) {
713  FAISS_THROW_MSG("not implemented");
714 }
715 
717 {
718  FAISS_THROW_MSG ("not implemented");
719 }
720 
721 
722 IndexSplitVectors::~IndexSplitVectors ()
723 {
724  if (own_fields) {
725  for (int s = 0; s < sub_indexes.size(); s++)
726  delete sub_indexes [s];
727  }
728 }
729 
730 
731 
732 
733 
734 
735 }; // namespace faiss
void train(idx_t n, const float *x) override
size_t nq
nb of queries
IndexShards(idx_t d, bool threaded=false, bool successive_ids=true)
virtual void reset()=0
removes all elements from the database.
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
Definition: MetaIndexes.cpp:69
void add_with_ids(idx_t n, const float *x, const long *xids) override
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
void add_with_ids(idx_t n, const float *x, const long *xids) override
virtual void train(idx_t n, const float *x)
Definition: Index.cpp:23
void reset() override
removes all elements from the database.
void add(idx_t n, const float *x) override
this will fail. Use add_with_ids
Definition: MetaIndexes.cpp:40
int d
vector dimension
Definition: Index.h:64
std::vector< long > id_map
! whether pointers are deleted in destructo
Definition: MetaIndexes.h:29
void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
Definition: MetaIndexes.cpp:80
virtual void add(idx_t n, const float *x)=0
void train(idx_t n, const float *x) override
void add(idx_t n, const float *x) override
supported only for sub-indices that implement add_with_ids
long idx_t
all indices are this type
Definition: Index.h:62
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:65
void construct_rev_map()
make the rev_map from scratch
bool verbose
verbosity level
Definition: Index.h:66
void add(idx_t n, const float *x) override
virtual long remove_ids(const IDSelector &sel)
Definition: Index.cpp:48
long remove_ids(const IDSelector &sel) override
remove ids adapted to IndexFlat
bool threaded
should the sub-indexes be deleted along with this?
Definition: MetaIndexes.h:90
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
void reset() override
removes all elements from the database.
void reset() override
removes all elements from the database.
Definition: MetaIndexes.cpp:53
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
long remove_ids(const IDSelector &sel) override
remove ids adapted to IndexFlat
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:72
void reconstruct(idx_t key, float *recons) const override
void train(idx_t n, const float *x) override
Definition: MetaIndexes.cpp:47
size_t * lims
size (nq + 1)
virtual void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const
Definition: Index.cpp:28
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:69
virtual void reconstruct(idx_t key, float *recons) const
Definition: Index.cpp:54
void add_with_ids(idx_t n, const float *x, const long *xids) override
Definition: MetaIndexes.cpp:60
idx_t * labels
result for query i is labels[lims[i]:lims[i+1]]
IndexSplitVectors(idx_t d, bool threaded=false)
sum of dimensions seen so far
bool own_fields
! the sub-index
Definition: MetaIndexes.h:28