#include "MetaIndexes.h"

#include <cstdio>
#include <cstring>
#include <pthread.h>

#include "FaissAssert.h"
#include "Heap.h"

namespace faiss {

/*****************************************************
 * IndexIDMap implementation
 *******************************************************/
IndexIDMap::IndexIDMap (Index *index):
    index (index), own_fields (false)
{
    FAISS_ASSERT (index->ntotal == 0 ||
                  !"index must be empty on input");
    // mirror the wrapped index's parameters
    is_trained = index->is_trained;
    metric_type = index->metric_type;
    d = index->d;
    set_typename ();
}
void IndexIDMap::add (idx_t, const float *)
{
    FAISS_ASSERT (!"add does not make sense with IndexIDMap, "
                   "use add_with_ids");
}
void IndexIDMap::add_with_ids (idx_t n, const float * x, const long *xids)
{
    index->add (n, x);
    for (idx_t i = 0; i < n; i++)
        id_map.push_back (xids[i]);
    ntotal = index->ntotal;
}
void IndexIDMap::search (idx_t n, const float *x, idx_t k,
                         float *distances, idx_t *labels) const
{
    index->search (n, x, k, distances, labels);
    idx_t *li = labels;
    // remap the sub-index's sequential labels to the user-provided ids
    for (idx_t i = 0; i < n * k; i++) {
        li[i] = li[i] < 0 ? li[i] : id_map[li[i]];
    }
}
IndexIDMap::~IndexIDMap ()
{
    if (own_fields) delete index;
}
void IndexIDMap::set_typename ()
{
    index_typename = "IDMap[" + index->index_typename + "]";
}
/*****************************************************
 * IndexShards implementation
 *******************************************************/

// subroutines
namespace {

typedef Index::idx_t idx_t;

/// runs a Job (any struct with a run() method) in its own pthread
template <class Job>
struct Thread {
    Job job;
    pthread_t thread;

    Thread () {}
    explicit Thread (const Job & job): job(job) {}

    void start () { pthread_create (&thread, nullptr, run, this); }
    void wait () { pthread_join (thread, nullptr); }

    static void * run (void *arg) {
        static_cast<Thread*> (arg)->job.run();
        return nullptr;
    }
};
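// Each of the job structs below (TrainJob, AddJob, QueryJob) bundles the
// parameters for one shard plus a run() method, so the same struct can be
// executed either inline or on a Thread<Job> when `threaded` is set.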
/// callback + thread management to train 1 shard
struct TrainJob {
    IndexShards *index; // the relevant index
    int no;             // shard number
    idx_t n;            // nb of training vectors
    const float *x;

    void run () {
        if (index->verbose)
            printf ("begin train shard %d on %ld points\n", no, n);
        index->shard_indexes [no]->train (n, x);
        if (index->verbose)
            printf ("end train shard %d\n", no);
    }
};
/// callback + thread management to add to 1 shard
struct AddJob {
    IndexShards *index; // the relevant index
    int no;             // shard number
    idx_t n;
    const float *x;
    const long *ids;    // may be nullptr

    void run () {
        if (index->verbose)
            printf ("begin add shard %d on %ld points\n", no, n);
        if (ids)
            index->shard_indexes[no]->add_with_ids (n, x, ids);
        else
            index->shard_indexes[no]->add (n, x);
        if (index->verbose)
            printf ("end add shard %d on %ld points\n", no, n);
    }
};
/// callback + thread management to query 1 shard
struct QueryJob {
    const IndexShards *index; // the relevant index
    int no;                   // shard number

    // query params
    idx_t n;
    const float *x;
    idx_t k;
    float *distances;
    idx_t *labels;

    void run () {
        if (index->verbose)
            printf ("begin query shard %d on %ld points\n", no, n);
        index->shard_indexes [no]->search (n, x, k, distances, labels);
        if (index->verbose)
            printf ("end query shard %d\n", no);
    }
};
// add translation to all valid labels
void translate_labels (long n, idx_t *labels, long translation)
{
    if (translation == 0) return;
    for (long i = 0; i < n; i++) {
        if (labels[i] < 0) continue;   // skip invalid (-1) results
        labels[i] += translation;
    }
}
/** merge result tables from several shards. The per-shard result lists are
 * assumed to be sorted; a heap over the current head of each shard selects
 * the globally best entry at every step.
 *
 * @param all_distances  size nshard * n * k
 * @param all_labels     idem
 * @param translations   label translations to apply, size nshard
 */
template <class C>
void merge_tables (long n, long k, long nshard,
                   float *distances, idx_t *labels,
                   const float *all_distances,
                   const idx_t *all_labels,
                   const long *translations)
{
    if (k == 0) return;

    long stride = n * k;
    std::vector<int> buf (2 * nshard);
    int * pointer = buf.data();        // per-shard read position
    int * shard_ids = pointer + nshard;
    std::vector<float> buf2 (nshard);
    float * heap_vals = buf2.data();

    for (long i = 0; i < n; i++) {
        // the heap maps the best remaining distance of each shard to the
        // shard that produced it
        const float *D_in = all_distances + i * k;
        const idx_t *I_in = all_labels + i * k;
        int heap_size = 0;

        for (long s = 0; s < nshard; s++) {
            pointer[s] = 0;
            if (I_in[stride * s] >= 0)
                heap_push<C> (++heap_size, heap_vals, shard_ids,
                              D_in[stride * s], s);
        }

        float *D = distances + i * k;
        idx_t *I = labels + i * k;

        for (int j = 0; j < k; j++) {
            if (heap_size == 0) {
                I[j] = -1;
                D[j] = C::neutral();
            } else {
                // pop the best element and advance in the shard it came from
                int s = shard_ids[0];
                int & p = pointer[s];
                D[j] = heap_vals[0];
                I[j] = I_in[stride * s + p] + translations[s];

                heap_pop<C> (heap_size--, heap_vals, shard_ids);
                p++;
                if (p < k && I_in[stride * s + p] >= 0)
                    heap_push<C> (++heap_size, heap_vals, shard_ids,
                                  D_in[stride * s + p], s);
            }
        }
    }
}
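/* Worked example of the merge (illustration only): nshard = 2, k = 3,
 * L2 metric, i.e. C = CMin<float, int>.
 *   shard 0 distances: [0.1, 0.4, 0.9]   labels: [3, 7, 1]
 *   shard 1 distances: [0.2, 0.3, 0.8]   labels: [5, 2, 4]
 * The heap repeatedly pops the shard with the smallest head, producing
 * merged distances [0.1, 0.2, 0.3] and labels [3, 5, 2] (plus
 * translations[s] when successive_ids is in use).
 */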
} // anonymous namespace

IndexShards::IndexShards (idx_t d, bool threaded, bool successive_ids):
    Index (d), own_fields (false),
    threaded (threaded), successive_ids (successive_ids)
{
}

void IndexShards::add_shard (Index *idx)
{
    shard_indexes.push_back (idx);
    sync_with_shard_indexes ();
}
void IndexShards::sync_with_shard_indexes ()
{
    if (shard_indexes.empty()) return;
    // copy the parameters of the first shard, then accumulate ntotal
    Index * index0 = shard_indexes[0];
    d = index0->d;
    metric_type = index0->metric_type;
    is_trained = index0->is_trained;
    ntotal = index0->ntotal;
    for (int i = 1; i < shard_indexes.size(); i++) {
        Index * index = shard_indexes[i];
        FAISS_ASSERT (d == index->d);
        ntotal += index->ntotal;
    }
}
void IndexShards::train (idx_t n, const float *x)
{
    // pre-alloc because we don't want reallocs
    std::vector<Thread<TrainJob > > tss (shard_indexes.size());
    int nt = 0;
    for (int i = 0; i < shard_indexes.size(); i++) {
        if (!shard_indexes[i]->is_trained) {
            TrainJob ts = {this, i, n, x};
            if (threaded) {
                tss[nt] = Thread<TrainJob> (ts);
                tss[nt++].start();
            } else {
                ts.run();
            }
        }
    }
    for (int i = 0; i < nt; i++) {
        tss[i].wait();
    }
    sync_with_shard_indexes ();
}
void IndexShards::add (idx_t n, const float *x)
{
    add_with_ids (n, x, nullptr);
}

void IndexShards::add_with_ids (idx_t n, const float * x, const long *xids)
{
    FAISS_ASSERT(!(successive_ids && xids) ||
        !"It makes no sense to pass in ids and request them to be shifted");

    if (successive_ids) {
        FAISS_ASSERT(!xids ||
            !"It makes no sense to pass in ids and request them to be shifted");
        FAISS_ASSERT(ntotal == 0 ||
            !"when adding to IndexShards with successive_ids, only add() "
             "in a single pass is supported");
    }
    long nshard = shard_indexes.size();
    const long *ids = xids;
    if (!ids && !successive_ids) {
        // assign explicit ids ntotal .. ntotal + n - 1
        long *aids = new long[n];
        for (long i = 0; i < n; i++)
            aids[i] = ntotal + i;
        ids = aids;
    }
    std::vector<Thread<AddJob > > asa (shard_indexes.size());
    int nt = 0;
    for (int i = 0; i < nshard; i++) {
        long i0 = i * n / nshard;
        long i1 = (i + 1) * n / nshard;

        AddJob as = {this, i,
                     i1 - i0, x + i0 * d,
                     ids ? ids + i0 : nullptr};
        if (threaded) {
            asa[nt] = Thread<AddJob>(as);
            asa[nt++].start();
        } else {
            as.run();
        }
    }
    for (int i = 0; i < nt; i++) {
        asa[i].wait();
    }
    if (ids != xids) delete [] ids;
    ntotal += n;
}
void IndexShards::reset ()
{
    for (int i = 0; i < shard_indexes.size(); i++) {
        shard_indexes[i]->reset ();
    }
    sync_with_shard_indexes ();
}
void IndexShards::search (idx_t n, const float *x, idx_t k,
                          float *distances, idx_t *labels) const
{
    long nshard = shard_indexes.size();
    float *all_distances = new float [nshard * k * n];
    idx_t *all_labels = new idx_t [nshard * k * n];

#if 1
    // pthread-based version: one QueryJob per shard
    std::vector<Thread<QueryJob> > qss (nshard);
    for (int i = 0; i < nshard; i++) {
        QueryJob qs = {
            this, i, n, x, k,
            all_distances + i * k * n,
            all_labels + i * k * n
        };
        if (threaded) {
            qss[i] = Thread<QueryJob> (qs);
            qss[i].start();
        } else {
            qs.run();
        }
    }

    if (threaded) {
        for (int i = 0; i < qss.size(); i++) {
            qss[i].wait();
        }
    }
#else
    // alternative OpenMP version (disabled)
    std::vector<QueryJob> qss (nshard);
    for (int i = 0; i < nshard; i++) {
        QueryJob qs = {
            this, i, n, x, k,
            all_distances + i * k * n,
            all_labels + i * k * n
        };
        qss[i] = qs;
    }

#pragma omp parallel for
    for (int i = 0; i < qss.size(); i++) {
        qss[i].run();
    }
#endif

    // label translations map each shard's local numbering to the global one
    std::vector<long> translations (nshard, 0);
    if (successive_ids) {
        for (int s = 0; s + 1 < nshard; s++)
            translations [s + 1] = translations [s] +
                shard_indexes [s]->ntotal;
    }

    if (metric_type == METRIC_L2) {
        merge_tables< CMin<float, int> > (
            n, k, nshard, distances, labels,
            all_distances, all_labels, translations.data ());
    } else {
        merge_tables< CMax<float, int> > (
            n, k, nshard, distances, labels,
            all_distances, all_labels, translations.data ());
    }

    delete [] all_distances;
    delete [] all_labels;
}
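/* A minimal usage sketch (illustration only, not part of this file): spread
 * a dataset over two flat shards and query them as one index. IndexFlatL2
 * comes from IndexFlat.h; xb, xq, D and I are caller-provided buffers used
 * here as placeholders.
 *
 *   faiss::IndexShards shards (d, true);       // threaded = true
 *   faiss::IndexFlatL2 shard0 (d), shard1 (d);
 *   shards.add_shard (&shard0);
 *   shards.add_shard (&shard1);
 *   shards.add (n, xb);                         // split evenly over shards
 *   shards.search (nq, xq, k, D, I);            // merged top-k per query
 */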
void IndexShards::set_typename ()
{
}

IndexShards::~IndexShards ()
{
    if (own_fields) {
        for (int s = 0; s < shard_indexes.size(); s++)
            delete shard_indexes [s];
    }
}
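/*****************************************************
 * IndexSplitVectors implementation
 *******************************************************/

// IndexSplitVectors concatenates sub-indexes along the dimension axis: each
// sub-index is responsible for a contiguous slice of the input dimensions,
// and a query combines one result per sub-index.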
IndexSplitVectors::IndexSplitVectors (idx_t d, bool threaded):
    Index (d), own_fields (false),
    threaded (threaded), sum_d (0)
{
}

void IndexSplitVectors::add_sub_index (Index *index)
{
    sub_indexes.push_back (index);
    sync_with_sub_indexes ();
}
void IndexSplitVectors::sync_with_sub_indexes ()
{
    if (sub_indexes.empty()) return;
    // copy the parameters of the first sub-index, then accumulate dimensions
    Index * index0 = sub_indexes[0];
    sum_d = index0->d;
    metric_type = index0->metric_type;
    is_trained = index0->is_trained;
    ntotal = index0->ntotal;
    for (int i = 1; i < sub_indexes.size(); i++) {
        Index * index = sub_indexes[i];
        FAISS_ASSERT (ntotal == index->ntotal);
        sum_d += index->d;
    }
}
void IndexSplitVectors::add (idx_t, const float *)
{
    FAISS_ASSERT (!"not implemented");
}
/// callback + thread management to query one sub-index (one slice of dimensions)
struct SplitQueryJob {
    const IndexSplitVectors *index; // the relevant index
    int no;                         // sub-index number

    // query params
    idx_t n;
    const float *x;
    idx_t k;
    float *distances;
    idx_t *labels;

    void run () {
        if (index->verbose)
            printf ("begin query shard %d on %ld points\n", no, n);
        const Index * sub_index = index->sub_indexes[no];
        long sub_d = sub_index->d, d = index->d;
        long ofs = 0;   // offset of this sub-index's slice in the input vectors
        for (int i = 0; i < no; i++) ofs += index->sub_indexes[i]->d;
        // gather the relevant slice of each query vector
        float *sub_x = new float [sub_d * n];
        for (idx_t i = 0; i < n; i++)
            memcpy (sub_x + i * sub_d, x + ofs + i * d,
                    sub_d * sizeof (sub_x[0]));
        sub_index->search (n, sub_x, k, distances, labels);
        delete [] sub_x;
        if (index->verbose)
            printf ("end query shard %d\n", no);
    }
};
void IndexSplitVectors::search (idx_t n, const float *x, idx_t k,
                                float *distances, idx_t *labels) const
{
    FAISS_ASSERT (k == 1 ||
                  !"search implemented only for k=1");
    FAISS_ASSERT (sum_d == d ||
                  !"not enough indexes compared to # dimensions");
    long nshard = sub_indexes.size();
    float *all_distances = new float [nshard * k * n];
    idx_t *all_labels = new idx_t [nshard * k * n];

    // sub-index 0 writes directly into the output arrays, the others into
    // the temporary tables
    std::vector<Thread<SplitQueryJob> > qss (nshard);
    for (int i = 0; i < nshard; i++) {
        SplitQueryJob qs = {
            this, i, n, x, k,
            i == 0 ? distances : all_distances + i * k * n,
            i == 0 ? labels : all_labels + i * k * n
        };
        if (threaded) {
            qss[i] = Thread<SplitQueryJob> (qs);
            qss[i].start();
        } else {
            qs.run();
        }
    }

    if (threaded) {
        for (int i = 0; i < qss.size(); i++) {
            qss[i].wait();
        }
    }
    // combine the per-sub-index results: the global label encodes one label
    // per sub-index (mixed radix), distances are summed
    long factor = 1;
    for (int i = 0; i < nshard; i++) {
        if (i > 0) { // results of sub-index 0 are already in the output table
            const float *distances_i = all_distances + i * k * n;
            const idx_t *labels_i = all_labels + i * k * n;
            for (long j = 0; j < n; j++) {
                if (labels[j] >= 0 && labels_i[j] >= 0) {
                    labels[j] += labels_i[j] * factor;
                    distances[j] += distances_i[j];
                } else {
                    labels[j] = -1;
                    distances[j] = 0.0 / 0.0;  // NaN: no result for this query
                }
            }
        }
        factor *= sub_indexes[i]->ntotal;
    }
    delete [] all_labels;
    delete [] all_distances;
}
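/* Worked example of the label encoding (illustration only): with two
 * sub-indexes holding ntotal0 = 100 and ntotal1 = 50 vectors, a query that
 * hits label 7 in sub-index 0 and label 3 in sub-index 1 is reported as
 * 7 + 3 * 100 = 307, and its distance is the sum of the two sub-distances.
 */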
void IndexSplitVectors::train (idx_t, const float *)
{
    FAISS_ASSERT (!"not implemented");
}

void IndexSplitVectors::reset ()
{
    FAISS_ASSERT (!"not implemented");
}
void IndexSplitVectors::set_typename ()
{
}

IndexSplitVectors::~IndexSplitVectors ()
{
    if (own_fields) {
        for (int s = 0; s < sub_indexes.size(); s++)
            delete sub_indexes [s];
    }
}

} // namespace faiss
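/* A minimal usage sketch for IndexSplitVectors (illustration only, not part
 * of this file): two flat sub-indexes over disjoint dimension slices of the
 * same vectors, with d = d0 + d1 and k = 1. IndexFlatL2 comes from
 * IndexFlat.h; xb_first_slice, xb_second_slice, xq, D and I are placeholder
 * names for caller-provided buffers.
 *
 *   faiss::IndexSplitVectors split (d);
 *   faiss::IndexFlatL2 sub0 (d0), sub1 (d1);
 *   sub0.add (n, xb_first_slice);              // first d0 components
 *   sub1.add (n, xb_second_slice);             // remaining d1 components
 *   split.add_sub_index (&sub0);
 *   split.add_sub_index (&sub1);
 *   split.search (nq, xq, 1, D, I);            // I encodes one id per sub-index
 */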