docs/html/IndexIVF_8cpp_source.html

 /**

  * Copyright (c) 2015-present, Facebook, Inc.

  * All rights reserved.

  *

  * This source code is licensed under the CC-by-NC license found in the

  * LICENSE file in the root directory of this source tree.

  */


 /* Copyright 2004-present Facebook. All Rights Reserved.

    Inverted list structure.

 */


 #include "IndexIVF.h"


 #include <cstdio>


 #include "utils.h"

 #include "hamming.h"


 #include "FaissAssert.h"

 #include "IndexFlat.h"

 #include "AuxIndexStructures.h"


 namespace faiss {


 /*****************************************

  * IndexIVF implementation

  ******************************************/


 /**
  * Build an inverted-file index on top of an existing coarse quantizer.
  *
  * @param quantizer  index used to assign vectors to inverted lists; the
  *                   pointer is stored as-is and own_fields starts false,
  *                   so the destructor will not delete it
  * @param d          vector dimension, asserted equal to quantizer->d
  * @param nlist      number of inverted lists (ids is sized to nlist)
  * @param metric     metric forwarded to the Index base class
  */
 IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
                     MetricType metric):
     Index (d, metric),
     nlist (nlist),
     nprobe (1),
     quantizer (quantizer),
     quantizer_trains_alone (false),
     own_fields (false),
     ids (nlist),
     maintain_direct_map (false)
 {
     FAISS_ASSERT (d == quantizer->d);

     // The index counts as trained only when the quantizer is trained and
     // already holds exactly one entry per inverted list.
     const bool quantizer_ready =
         quantizer->is_trained && (quantizer->ntotal == nlist);
     is_trained = quantizer_ready;

     // Clustering parameters. The iteration count is kept low because this
     // is typically used for large clusterings (nb this is not used for the
     // MultiIndex, for which quantizer_trains_alone = true).
     cp.niter = 10;
     cp.verbose = verbose;

     // Spherical by default if the metric is inner_product
     if (metric_type == METRIC_INNER_PRODUCT) {
         cp.spherical = true;
     }
 }


 /// Default constructor: no lists, no quantizer, a single probe, and every
 /// flag cleared.
 IndexIVF::IndexIVF ():
     nlist (0),
     nprobe (1),
     quantizer (nullptr),
     quantizer_trains_alone (false),
     own_fields (false),
     maintain_direct_map (false)
 {
 }


 /// Add n vectors without explicit ids: forwards to add_with_ids() with a
 /// null id array.
 void IndexIVF::add (idx_t n, const float * x)
 {
     const long * no_explicit_ids = nullptr;
     add_with_ids (n, x, no_explicit_ids);
 }


 void IndexIVF::make_direct_map ()

 {

     if (maintain_direct_map) return;


     direct_map.resize (ntotal, -1);

     for (size_t key = 0; key < nlist; key++) {

         const std::vector<long> & idlist = ids[key];


         for (long ofs = 0; ofs < idlist.size(); ofs++) {

             direct_map [idlist [ofs]] =

                 key << 32 | ofs;

         }

     }


     maintain_direct_map = true;

 }


 /// Empty the index: zero ntotal, drop the direct map and clear every
 /// inverted id list (the number of lists is unchanged).
 void IndexIVF::reset ()
 {
     ntotal = 0;
     direct_map.clear();
     for (auto & list_ids : ids) {
         list_ids.clear();
     }
 }


 /**
  * Train the coarse quantizer when required, then call train_residual()
  * and mark the index as trained.
  *
  * Three cases for the quantizer:
  *  - already trained and holding nlist entries: left untouched;
  *  - quantizer_trains_alone: the training set is handed to
  *    quantizer->train(), which must leave it with nlist entries;
  *  - otherwise: the quantizer is reset and filled by a Clustering run
  *    configured with cp.
  *
  * @param n  number of training vectors
  * @param x  training vectors
  */
 void IndexIVF::train (idx_t n, const float *x)
 {
     const bool quantizer_ready =
         quantizer->is_trained && (quantizer->ntotal == nlist);

     if (quantizer_ready) {
         if (verbose) {
             printf ("IVF quantizer does not need training.\n");
         }
     } else if (quantizer_trains_alone) {
         if (verbose) {
             printf ("IVF quantizer trains alone...\n");
         }
         quantizer->train (n, x);
         FAISS_ASSERT (quantizer->ntotal == nlist ||
                       !"nlist not consistent with quantizer size");
     } else {
         if (verbose) {
             printf ("Training IVF quantizer on %ld vectors in %dD\n",
                     n, d);
         }

         Clustering clus (d, nlist, cp);

         quantizer->reset();
         clus.train (n, x, *quantizer);
         quantizer->is_trained = true;
     }

     if (verbose) {
         printf ("Training IVF residual\n");
     }

     train_residual (n, x);
     is_trained = true;
 }


 /// Default residual training: there is nothing to train, so this only
 /// prints a message when verbose is set. Both arguments are unused here.
 void IndexIVF::train_residual (idx_t n, const float *x)
 {
     if (!verbose) {
         return;
     }
     printf ("IndexIVF: no residual training\n");
 }


 /// Collect the size of each inverted list into a histogram and return
 /// faiss::imbalance_factor() computed on it.
 double IndexIVF::imbalance_factor () const
 {
     std::vector<int> list_sizes;
     list_sizes.reserve (nlist);
     for (size_t list_no = 0; list_no < nlist; list_no++) {
         list_sizes.push_back (ids[list_no].size());
     }
     return faiss::imbalance_factor (nlist, list_sizes.data());
 }


 /**
  * Print a histogram of inverted-list sizes on stdout.
  *
  * Bucket j counts the lists whose size is below 2^j and that did not fit
  * a smaller bucket; only non-empty buckets are printed. There are 40
  * buckets, so the printed bound is computed as a long: shifting an int by
  * 31 or more would be undefined.
  */
 void IndexIVF::print_stats () const
 {
     std::vector<int> sizes(40);
     for (size_t i = 0; i < nlist; i++) {
         for (size_t j = 0; j < sizes.size(); j++) {
             // first j such that the list size is < 2^j
             if ((ids[i].size() >> j) == 0) {
                 sizes[j]++;
                 break;
             }
         }
     }
     for (size_t i = 0; i < sizes.size(); i++) {
         if (sizes[i]) {
             printf ("list size in < %ld: %d instances\n",
                     1L << i, sizes[i]);
         }
     }
 }


 /**
  * Move all entries of `other` into this index, leaving `other` empty.
  *
  * Each id taken from `other` is shifted by add_id before being appended
  * to the matching inverted list. The payload attached to the ids is moved
  * by merge_from_residuals().
  *
  * Both indexes must have the same dimension, the same nlist and the same
  * dynamic type, and neither may maintain a direct map.
  *
  * @param other   index to drain
  * @param add_id  offset added to every id coming from `other`
  */
 void IndexIVF::merge_from (IndexIVF &other, idx_t add_id)
 {
     // minimal sanity checks
     FAISS_ASSERT (other.d == d);
     FAISS_ASSERT (other.nlist == nlist);
     FAISS_ASSERT ((!maintain_direct_map && !other.maintain_direct_map) ||
                   !"direct map copy not implemented");
     FAISS_ASSERT (typeid (*this) == typeid (other) ||
                   !"can only merge indexes of the same type");

     for (size_t list_no = 0; list_no < nlist; list_no++) {
         std::vector<idx_t> & from = other.ids[list_no];
         std::vector<idx_t> & to = ids[list_no];
         for (size_t pos = 0; pos < from.size(); pos++) {
             to.push_back (from[pos] + add_id);
         }
         from.clear();
     }

     merge_from_residuals (other);

     ntotal += other.ntotal;
     other.ntotal = 0;
 }


 /// The quantizer is deleted only when this object owns it (own_fields).
 IndexIVF::~IndexIVF()
 {
     if (!own_fields) {
         return;
     }
     delete quantizer;
 }


 /*****************************************

  * IndexIVFFlat implementation

  ******************************************/


 /**
  * Inverted file that stores the raw vectors next to their ids.
  *
  * All arguments are forwarded to the IndexIVF constructor; in addition
  * one vector storage per inverted list is created and the type name of
  * the index is set.
  */
 IndexIVFFlat::IndexIVFFlat (Index * quantizer,
                             size_t d,
                             size_t nlist,
                             MetricType metric):
     IndexIVF (quantizer, d, nlist, metric)
 {
     // one flat float array per inverted list
     vecs.resize (nlist);

     set_typename();
 }


 /// Set index_typename to "<prefix>[<nlist>:<quantizer type name>]", where
 /// the prefix is "IvfIP" for inner product, "IvfL2" for L2 and "??" for
 /// any other metric.
 void IndexIVFFlat::set_typename ()
 {
     const char * prefix = "??";
     if (metric_type == METRIC_INNER_PRODUCT) {
         prefix = "IvfIP";
     } else if (metric_type == METRIC_L2) {
         prefix = "IvfL2";
     }

     std::stringstream name;
     name << prefix;
     name << "[" << nlist << ":" << quantizer->index_typename << "]";
     index_typename = name.str();
 }


 /// Add n vectors with optional ids: forwards to add_core() without a
 /// precomputed list assignment.
 void IndexIVFFlat::add_with_ids (idx_t n, const float * x, const long *xids)
 {
     const long * no_precomputed_idx = nullptr;
     add_core (n, x, xids, no_precomputed_idx);
 }


 /**
  * Add n vectors to the inverted lists.
  *
  * @param n                number of vectors
  * @param x                vectors to add, d floats each (vector i is read
  *                         at x + i * d)
  * @param xids             ids to store, or nullptr to use ntotal + i
  * @param precomputed_idx  list number for each vector, or nullptr to
  *                         obtain them from quantizer->assign()
  *
  * Vectors whose list number is negative are skipped; ntotal only grows by
  * the number of vectors actually stored.
  *
  * The temporary assignment buffer is held in a std::vector so it is
  * released even if the quantizer or a container growth throws.
  *
  * NOTE(review): when maintain_direct_map is set, one entry is appended to
  * direct_map per stored vector, regardless of xids. The map therefore
  * only matches the stored ids when they are sequential and no vector is
  * skipped -- confirm callers respect this.
  */
 void IndexIVFFlat::add_core (idx_t n, const float * x, const long *xids,
                              const long *precomputed_idx)

 {
     FAISS_ASSERT (is_trained);

     std::vector<long> assigned;     // owns the list numbers we compute
     const long * idx = precomputed_idx;

     if (!idx) {
         assigned.resize (n);
         quantizer->assign (n, x, assigned.data());
         idx = assigned.data();
     }

     long n_add = 0;
     for (size_t i = 0; i < n; i++) {
         long id = xids ? xids[i] : ntotal + i;
         long list_no = idx [i];
         if (list_no < 0)
             continue;
         FAISS_ASSERT (list_no < nlist);

         ids[list_no].push_back (id);

         /* store the vector */
         const float *xi = x + i * d;
         vecs[list_no].insert (vecs[list_no].end(), xi, xi + d);

         if (maintain_direct_map)
             direct_map.push_back (list_no << 32 | (ids[list_no].size() - 1));
         n_add++;
     }
     if (verbose) {
         printf("IndexIVFFlat::add_core: added %ld / %ld vectors\n",
                n_add, n);
     }
     ntotal += n_add;
 }


 /**
  * k-nearest-neighbor search for the inner-product metric.
  *
  * @param nx    number of query vectors
  * @param x     queries, d floats each
  * @param keys  list numbers to visit, nprobe per query (row i is read at
  *              keys + i * nprobe); negative entries are skipped
  * @param res   min-heap array receiving, per query, the res->k largest
  *              inner products and the matching ids
  *
  * Queries are processed by an OpenMP parallel loop. A key >= nlist is
  * reported on stderr and followed by a bare `throw;`, which calls
  * std::terminate because no exception is being handled at that point.
  */
 void IndexIVFFlat::search_knn_inner_product (
     size_t nx,
     const float * x,
     const long * __restrict keys,
     float_minheap_array_t * res) const
 {

     const size_t k = res->k;

 #pragma omp parallel for
     for (size_t i = 0; i < nx; i++) {
         const float * xi = x + i * d;
         const long * keysi = keys + i * nprobe;
         float * __restrict simi = res->get_val (i);
         long * __restrict idxi = res->get_ids (i);
         minheap_heapify (k, simi, idxi);

         for (size_t ik = 0; ik < nprobe; ik++) {
             long key = keysi[ik];  /* select the list  */
             if (key < 0) {
                 // not enough centroids for multiprobe
                 continue;
             }
             if (key >= (long) nlist) {
                 // ik and nlist are size_t, hence %zu
                 fprintf (stderr, "Invalid key=%ld  at ik=%zu nlist=%zu\n",
                                   key, ik, nlist);
                 throw;
             }

             const size_t list_size = ids[key].size();
             const float * list_vecs = vecs[key].data();

             for (size_t j = 0; j < list_size; j++) {
                 const float * yj = list_vecs + d * j;
                 float ip = fvec_inner_product (xi, yj, d);
                 // replace the heap top when the candidate scores higher
                 if (ip > simi[0]) {
                     minheap_pop (k, simi, idxi);
                     minheap_push (k, simi, idxi, ip, ids[key][j]);
                 }
             }
         }
         minheap_reorder (k, simi, idxi);
     }
 }


 /**
  * k-nearest-neighbor search for the L2 metric (squared distances).
  *
  * @param nx    number of query vectors
  * @param x     queries, d floats each
  * @param keys  list numbers to visit, nprobe per query (row i is read at
  *              keys + i * nprobe); negative entries are skipped
  * @param res   max-heap array receiving, per query, the res->k smallest
  *              squared distances and the matching ids
  *
  * Queries are processed by an OpenMP parallel loop. A key >= nlist is
  * reported on stderr and followed by a bare `throw;`, which calls
  * std::terminate because no exception is being handled at that point.
  */
 void IndexIVFFlat::search_knn_L2sqr (
     size_t nx,
     const float * x,
     const long * __restrict keys,
     float_maxheap_array_t * res) const
 {
     const size_t k = res->k;

 #pragma omp parallel for
     for (size_t i = 0; i < nx; i++) {
         const float * xi = x + i * d;
         const long * keysi = keys + i * nprobe;
         float * __restrict disi = res->get_val (i);
         long * __restrict idxi = res->get_ids (i);
         maxheap_heapify (k, disi, idxi);

         for (size_t ik = 0; ik < nprobe; ik++) {
             long key = keysi[ik];  /* select the list  */
             if (key < 0) {
                 // not enough centroids for multiprobe
                 continue;
             }
             if (key >= (long) nlist) {
                 // ik and nlist are size_t, hence %zu
                 fprintf (stderr, "Invalid key=%ld  at ik=%zu nlist=%zu\n",
                                   key, ik, nlist);
                 throw;
             }

             const size_t list_size = ids[key].size();
             const float * list_vecs = vecs[key].data();

             for (size_t j = 0; j < list_size; j++) {
                 const float * yj = list_vecs + d * j;
                 float disij = fvec_L2sqr (xi, yj, d);
                 // replace the heap top when the candidate is closer
                 if (disij < disi[0]) {
                     maxheap_pop (k, disi, idxi);
                     maxheap_push (k, disi, idxi, disij, ids[key][j]);
                 }
             }
         }
         maxheap_reorder (k, disi, idxi);
     }
 }


 /**
  * Search the k nearest neighbors of n query vectors.
  *
  * The quantizer first selects nprobe lists per query; the lists are then
  * scanned by search_knn_inner_product() or search_knn_L2sqr() depending on
  * metric_type. For any other metric nothing is written to the outputs.
  *
  * @param n          number of queries
  * @param x          query vectors
  * @param k          number of results per query
  * @param distances  output values, wrapped as an n x k heap array
  * @param labels     output ids, wrapped as an n x k heap array
  *
  * The list-number buffer is a std::vector so that it is released even if
  * the quantizer throws.
  */
 void IndexIVFFlat::search (idx_t n, const float *x, idx_t k,
                                 float *distances, idx_t *labels) const
 {
     std::vector<idx_t> idx (n * nprobe);
     quantizer->assign (n, x, idx.data(), nprobe);

     if (metric_type == METRIC_INNER_PRODUCT) {
         float_minheap_array_t res = {
             size_t(n), size_t(k), labels, distances};
         search_knn_inner_product (n, x, idx.data(), &res);

     } else if (metric_type == METRIC_L2) {
         float_maxheap_array_t res = {
             size_t(n), size_t(k), labels, distances};
         search_knn_L2sqr (n, x, idx.data(), &res);
     }
 }


 /**
  * Range search: report every stored vector whose squared L2 distance to a
  * query is below `radius`, looking only into the nprobe lists selected by
  * the quantizer for that query.
  *
  * @param nx      number of queries
  * @param x       query vectors, d floats each
  * @param radius  threshold compared against the squared L2 distance
  * @param result  receives the matches through per-thread
  *                RangeSearchPartialResult objects
  *
  * Only METRIC_L2 is implemented (checked with assert).
  *
  * The query loop carries `#pragma omp for`: inside a bare parallel region
  * every thread would otherwise run the complete loop and register every
  * query in its own partial result.
  *
  * Negative keys are skipped, as in the k-NN search paths, where they mean
  * "not enough centroids for multiprobe". A key >= nlist is reported on
  * stderr and followed by a bare `throw;`, which calls std::terminate
  * because no exception is being handled at that point.
  */
 void IndexIVFFlat::range_search (idx_t nx, const float *x, float radius,
                                  RangeSearchResult *result) const
 {
     // owned by a vector so the buffer is released on every exit path
     std::vector<idx_t> keys (nx * nprobe);
     quantizer->assign (nx, x, keys.data(), nprobe);

     assert (metric_type == METRIC_L2 || !"Only L2 implemented");

     const size_t nq = nx;

 #pragma omp parallel
     {
         RangeSearchPartialResult pres(result);

 #pragma omp for
         for (size_t i = 0; i < nq; i++) {
             const float * xi = x + i * d;
             const long * keysi = keys.data() + i * nprobe;

             RangeSearchPartialResult::QueryResult & qres =
                 pres.new_result (i);

             for (size_t ik = 0; ik < nprobe; ik++) {
                 long key = keysi[ik];  /* select the list  */
                 if (key < 0) {
                     // not enough centroids for multiprobe
                     continue;
                 }
                 if (key >= (long) nlist) {
                     // ik and nlist are size_t, hence %zu
                     fprintf (stderr, "Invalid key=%ld  at ik=%zu nlist=%zu\n",
                              key, ik, nlist);
                     throw;
                 }

                 const size_t list_size = ids[key].size();
                 const float * list_vecs = vecs[key].data();

                 for (size_t j = 0; j < list_size; j++) {
                     const float * yj = list_vecs + d * j;
                     float disij = fvec_L2sqr (xi, yj, d);
                     if (disij < radius) {
                         qres.add (disij, ids[key][j]);
                     }
                 }
             }
         }

         pres.finalize ();
     }
 }


 /**
  * Move the stored vectors of `other_in` into this index, list by list,
  * leaving the source lists empty.
  *
  * @param other_in  must actually be an IndexIVFFlat; the dynamic_cast to a
  *                  reference throws std::bad_cast otherwise
  *
  * Each list is appended with a single range insert: the previous
  * element-by-element copy used an int index, which overflows once a list
  * holds more floats than an int can represent.
  */
 void IndexIVFFlat::merge_from_residuals (IndexIVF &other_in)
 {
     IndexIVFFlat &other = dynamic_cast<IndexIVFFlat &> (other_in);
     for (size_t i = 0; i < nlist; i++) {
         std::vector<float> & src = other.vecs[i];
         std::vector<float> & dest = vecs[i];
         dest.insert (dest.end(), src.begin(), src.end());
         src.clear();
     }
 }


 /**
  * Copy a subset of the entries of this index into `other`.
  *
  * Only subset_type == 0 selects anything: entries whose id satisfies
  * a1 <= id < a2 are appended (id and vector) to the same list number of
  * `other`, and other.ntotal is incremented for each of them. With any
  * other subset_type nothing is copied.
  *
  * `other` must have the same nlist and must not maintain a direct map.
  */
 void IndexIVFFlat::copy_subset_to (IndexIVFFlat & other, int subset_type,
                      long a1, long a2) const
 {
     FAISS_ASSERT (nlist == other.nlist);
     FAISS_ASSERT (!other.maintain_direct_map);

     for (long list_no = 0; list_no < nlist; list_no++) {
         const std::vector<idx_t> & src_ids = ids[list_no];
         const std::vector<float> & src_vecs = vecs[list_no];
         std::vector<idx_t> & dst_ids = other.ids[list_no];
         std::vector<float> & dst_vecs = other.vecs[list_no];

         for (long pos = 0; pos < src_ids.size(); pos++) {
             const idx_t id = src_ids[pos];
             const bool selected =
                 subset_type == 0 && a1 <= id && id < a2;
             if (!selected) {
                 continue;
             }

             dst_ids.push_back (id);
             // the vector of entry `pos` is the d floats starting at pos * d
             dst_vecs.insert (dst_vecs.end(),
                              src_vecs.begin() + pos * d,
                              src_vecs.begin() + (pos + 1) * d);
             other.ntotal++;
         }
     }
 }


 /// Empty the index: run the base-class reset, then clear the stored
 /// vectors of each of the nlist lists.
 void IndexIVFFlat::reset()
 {
     IndexIVF::reset();

     size_t list_no = 0;
     while (list_no < nlist) {
         vecs[list_no].clear();
         list_no++;
     }
 }


 long IndexIVFFlat::remove_ids (const IDSelector & sel)

 {

     FAISS_ASSERT (!maintain_direct_map ||

                   !"direct map remove not implemented");

     long nremove = 0;

 #pragma omp parallel for reduction(+: nremove)

     for (long i = 0; i < nlist; i++) {

         std::vector<idx_t> & idsi = ids[i];

         float *vecsi = vecs[i].data();


         long l = idsi.size(), j = 0;

         while (j < l) {

             if (sel.is_member (idsi[j])) {

                 l--;

                 idsi [j] = idsi [l];

                 memmove (vecsi + j * d,

                          vecsi + l * d, d * sizeof (float));

             } else {

                 j++;

             }

         }

         if (l < idsi.size()) {

             nremove += idsi.size() - l;

             idsi.resize (l);

             vecs[i].resize (l * d);

         }

     }

     ntotal -= nremove;

     return nremove;

 }


 /**
  * Copy the stored vector with id `key` into `recons` (d floats).
  *
  * The location is read from direct_map[key]: list number in the bits
  * above bit 32, offset inside the list in the low 32 bits.
  *
  * The checks use FAISS_ASSERT, like the other checks in this file, so
  * they are not subject to NDEBUG the way a plain assert is. The list
  * number and offset are unsigned and wide enough that a large offset
  * cannot turn negative and `ofs * d` cannot overflow an int.
  *
  * @param key     id of the vector; must be in [0, ntotal) and have a
  *                direct-map entry (entries left at -1 are rejected)
  * @param recons  output buffer, receives d floats
  */
 void IndexIVFFlat::reconstruct (idx_t key, float * recons) const
 {
     FAISS_ASSERT (direct_map.size() == (size_t) ntotal ||
                   !"direct map is not initialized");
     FAISS_ASSERT ((0 <= key && key < ntotal) ||
                   !"key out of range");

     const long entry = direct_map[key];
     FAISS_ASSERT (entry >= 0 || !"no direct map entry for this key");

     const size_t list_no = entry >> 32;
     const size_t ofs = entry & 0xffffffff;
     memcpy (recons, &vecs[list_no][ofs * d], d * sizeof(recons[0]));
 }


 } // namespace faiss

faiss::IndexIVF
Definition: IndexIVF.h:46

faiss::ClusteringParameters::niter
int niter
clustering iterations
Definition: Clustering.h:26

faiss::RangeSearchPartialResult::QueryResult
result structure for a single query
Definition: AuxIndexStructures.h:157

faiss::fvec_L2sqr
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils.cpp:431

faiss::IndexIVFFlat::search_knn_L2sqr
void search_knn_L2sqr(size_t nx, const float *x, const long *keys, float_maxheap_array_t *res) const
Implementation of the search for the L2 metric.
Definition: IndexIVF.cpp:319

faiss::HeapArray::get_val
T * get_val(size_t key)
Return the list of values for a heap.
Definition: Heap.h:361

faiss::IndexIVF::imbalance_factor
double imbalance_factor() const
1 = perfectly balanced, >1: imbalanced
Definition: IndexIVF.cpp:134

faiss::Index::reset
virtual void reset()=0
removes all elements from the database.

faiss::IndexIVF::nprobe
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:48

faiss::IndexIVFFlat::reconstruct
virtual void reconstruct(idx_t key, float *recons) const override
Definition: IndexIVF.cpp:508

faiss::IDSelector
Definition: AuxIndexStructures.h:58

faiss::IndexIVF::quantizer_trains_alone
bool quantizer_trains_alone
just pass over the trainset to quantizer
Definition: IndexIVF.h:51

faiss::IndexIVFFlat::set_typename
virtual void set_typename() override
Definition: IndexIVF.cpp:207

faiss::IndexIVFFlat::range_search
virtual void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
Definition: IndexIVF.cpp:385

faiss::IndexIVFFlat::copy_subset_to
void copy_subset_to(IndexIVFFlat &other, int subset_type, long a1, long a2) const
Definition: IndexIVF.cpp:441

faiss::IndexIVFFlat::merge_from_residuals
virtual void merge_from_residuals(IndexIVF &other) override
Definition: IndexIVF.cpp:429

faiss::Index::add_with_ids
virtual void add_with_ids(idx_t n, const float *x, const long *xids)
Definition: Index.cpp:32

faiss::IndexIVF::train_residual
virtual void train_residual(idx_t n, const float *x)
Definition: IndexIVF.cpp:125

faiss::HeapArray::k
size_t k
allocated size per heap
Definition: Heap.h:356

faiss::imbalance_factor
double imbalance_factor(int n, int k, const long *assign)
a balanced assignment has a IF of 1
Definition: utils.cpp:1444

faiss::IndexIVFFlat::remove_ids
virtual long remove_ids(const IDSelector &sel) override
Definition: IndexIVF.cpp:476

faiss::IndexIVFFlat
Definition: IndexIVF.h:113

faiss::IndexIVF::ids
std::vector< std::vector< long > > ids
Inverted lists for indexes.
Definition: IndexIVF.h:56

faiss::Index::d
int d
vector dimension
Definition: Index.h:66

faiss::IndexIVF::quantizer
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:50

faiss::IndexIVF::train
virtual void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:96

faiss::IndexIVF::cp
ClusteringParameters cp
to override default clustering params
Definition: IndexIVF.h:54

faiss::IndexIVFFlat::add_with_ids
virtual void add_with_ids(idx_t n, const float *x, const long *xids) override
implemented for all IndexIVF* classes
Definition: IndexIVF.cpp:224

faiss::Clustering
Definition: Clustering.h:58

faiss::IndexIVF::own_fields
bool own_fields
whether object owns the quantizer
Definition: IndexIVF.h:52

faiss::HeapArray
Definition: Heap.h:351

faiss::Index::idx_t
long idx_t
all indices are this type
Definition: Index.h:64

faiss::IndexIVFFlat::reset
virtual void reset() override
removes all elements from the database.
Definition: IndexIVF.cpp:468

faiss::IndexIVF::make_direct_map
void make_direct_map()
initialize a direct map
Definition: IndexIVF.cpp:69

faiss::Index::ntotal
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:67

faiss::Index::verbose
bool verbose
verbosity level
Definition: Index.h:68

faiss::IndexIVF::reset
virtual void reset() override
removes all elements from the database.
Definition: IndexIVF.cpp:87

faiss::RangeSearchPartialResult::new_result
QueryResult & new_result(idx_t qno)
begin a new result
Definition: AuxIndexStructures.cpp:114

faiss::RangeSearchPartialResult
the entries in the buffers are split per query
Definition: AuxIndexStructures.h:151

faiss::RangeSearchResult
Definition: AuxIndexStructures.h:35

faiss::IndexIVF::merge_from_residuals
virtual void merge_from_residuals(IndexIVF &other)=0

faiss::HeapArray::get_ids
TI * get_ids(size_t key)
Corresponding identifiers.
Definition: Heap.h:364

faiss::Index::metric_type
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:74

faiss::IndexIVF::print_stats
void print_stats() const
display some stats about the inverted lists
Definition: IndexIVF.cpp:143

faiss::IndexIVF::nlist
size_t nlist
number of possible key values
Definition: IndexIVF.h:47

faiss::Index
Definition: Index.h:61

faiss::IndexIVF::add
virtual void add(idx_t n, const float *x) override
Quantizes x and calls add_with_key.
Definition: IndexIVF.cpp:64

faiss::Clustering::train
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
Definition: Clustering.cpp:67

faiss::Index::is_trained
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:71

faiss::IndexIVFFlat::search_knn_inner_product
void search_knn_inner_product(size_t nx, const float *x, const long *keys, float_minheap_array_t *res) const
Implementation of the search for the inner product metric.
Definition: IndexIVF.cpp:273

faiss::Index::train
virtual void train(idx_t n, const float *x)
Definition: Index.h:92

faiss::IndexIVF::maintain_direct_map
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
Definition: IndexIVF.h:59

faiss::ClusteringParameters::spherical
bool spherical
do we want normalized centroids?
Definition: Clustering.h:31

faiss::IndexIVF::merge_from
virtual void merge_from(IndexIVF &other, idx_t add_id)
Definition: IndexIVF.cpp:163

faiss::MetricType
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:44

faiss::IndexIVFFlat::vecs
std::vector< std::vector< float > > vecs
Definition: IndexIVF.h:116

faiss::IndexIVFFlat::add_core
void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx)
same as add_with_ids, with precomputed coarse quantizer
Definition: IndexIVF.cpp:229

faiss::IndexIVFFlat::search
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
Definition: IndexIVF.cpp:364