#include "IndexIVF.h"

#include <cstdio>
#include <cstring>
#include <cassert>
#include <typeinfo>

#include "utils.h"
#include "FaissAssert.h"
#include "IndexFlat.h"
#include "AuxIndexStructures.h"
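// Level1Quantizer manages the coarse (level-1) quantizer that maps vectors
// to inverted lists.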
Level1Quantizer::Level1Quantizer (Index * quantizer, size_t nlist):
    quantizer (quantizer),
    nlist (nlist),
    quantizer_trains_alone (0),
    own_fields (false),
    clustering_index (nullptr)
{}
Level1Quantizer::Level1Quantizer ():
    quantizer (nullptr),
    nlist (0),
    quantizer_trains_alone (0), own_fields (false),
    clustering_index (nullptr)
{}
Level1Quantizer::~Level1Quantizer ()
{
    if (own_fields) delete quantizer;
}
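// Train the coarse quantizer on the given vectors. Depending on
// quantizer_trains_alone, the quantizer is either already trained,
// trained with its own train() method, or filled with k-means centroids.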
void Level1Quantizer::train_q1 (size_t n, const float *x,
                                bool verbose, MetricType metric_type)
{
    size_t d = quantizer->d;
    if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
        if (verbose)
            printf ("IVF quantizer does not need training.\n");
    } else if (quantizer_trains_alone == 1) {
        if (verbose)
            printf ("IVF quantizer trains alone...\n");
        quantizer->train (n, x);
        FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
                                "nlist not consistent with quantizer size");
    } else if (quantizer_trains_alone == 0) {
        if (verbose)
            printf ("Training level-1 quantizer on %ld vectors in %ldD\n",
                    n, d);
        // ... run k-means (Clustering) on x and add the resulting
        // centroids to the quantizer ...
    } else if (quantizer_trains_alone == 2) {
        if (verbose)
            printf ("Training L2 quantizer on %ld vectors in %ldD%s\n",
                    n, d, clustering_index ? "(user provided index)" : "");
        FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
        Clustering clus (d, nlist, cp);
        if (!clustering_index) {
            IndexFlatL2 assigner (d);
            clus.train (n, x, assigner);
        } else {
            clus.train (n, x, *clustering_index);
        }
        if (verbose)
            printf ("Adding centroids to quantizer\n");
        quantizer->add (nlist, clus.centroids.data());
    }
}
IndexIVF::IndexIVF (Index * quantizer, size_t d, size_t nlist,
                    MetricType metric):
    Index (d, metric),
    Level1Quantizer (quantizer, nlist),
    nprobe (1),
    max_codes (0),
    maintain_direct_map (false)
{
    FAISS_THROW_IF_NOT (d == quantizer->d);
    is_trained = quantizer->is_trained && (quantizer->ntotal == nlist);
    // ...
    ids.resize (nlist);
    codes.resize (nlist);
}
IndexIVF::IndexIVF ():
    nprobe (1), max_codes (0),
    maintain_direct_map (false)
{}
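// The direct map stores, for each sequential id, the location of the vector
// in the inverted lists, packed as: list number in the high 32 bits,
// offset within the list in the low 32 bits.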
void IndexIVF::make_direct_map (bool new_maintain_direct_map)
{
    // nothing to do if the flag does not change
    if (new_maintain_direct_map == maintain_direct_map)
        return;

    if (new_maintain_direct_map) {
        direct_map.resize (ntotal, -1);
        for (size_t key = 0; key < nlist; key++) {
            const std::vector<long> & idlist = ids[key];

            for (long ofs = 0; ofs < idlist.size(); ofs++) {
                FAISS_THROW_IF_NOT_MSG (
                        0 <= idlist[ofs] && idlist[ofs] < ntotal,
                        "direct map supported only for sequential ids");
                direct_map [idlist [ofs]] = key << 32 | ofs;
            }
        }
    } else {
        direct_map.clear ();
    }
    maintain_direct_map = new_maintain_direct_map;
}
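// Search is two-stage: quantize the queries to find the nprobe nearest
// inverted lists, then scan those lists with search_preassigned().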
void IndexIVF::search (idx_t n, const float *x, idx_t k,
                       float *distances, idx_t *labels) const
{
    long * idx = new long [n * nprobe];
    ScopeDeleter<long> del (idx);
    float * coarse_dis = new float [n * nprobe];
    ScopeDeleter<float> del2 (coarse_dis);

    quantizer->search (n, x, nprobe, coarse_dis, idx);

    search_preassigned (n, x, k, idx, coarse_dis,
                        distances, labels, false);
}
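// reconstruct() needs the direct map: it decodes the packed
// (list_no, offset) entry and delegates to reconstruct_from_offset().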
void IndexIVF::reconstruct (idx_t key, float * recons) const
{
    FAISS_THROW_IF_NOT_MSG (direct_map.size() == ntotal,
                            "direct map is not initialized");
    long list_no = direct_map[key] >> 32;
    long offset = direct_map[key] & 0xffffffff;
    reconstruct_from_offset (list_no, offset, recons);
}
void IndexIVF::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
{
    FAISS_THROW_IF_NOT (ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));

    for (long list_no = 0; list_no < nlist; list_no++) {
        const std::vector<long> & idlist = ids[list_no];

        for (long offset = 0; offset < idlist.size(); offset++) {
            long id = idlist[offset];
            if (!(id >= i0 && id < i0 + ni)) {
                continue;
            }

            float* reconstructed = recons + (id - i0) * d;
            reconstruct_from_offset (list_no, offset, reconstructed);
        }
    }
}
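// search_and_reconstruct() searches with store_pairs enabled so that the
// labels encode (list_no, offset); each result is then reconstructed and
// the label is replaced by the actual stored id.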
void IndexIVF::search_and_reconstruct (idx_t n, const float *x, idx_t k,
                                       float *distances, idx_t *labels,
                                       float *recons) const
{
    long * idx = new long [n * nprobe];
    ScopeDeleter<long> del (idx);
    float * coarse_dis = new float [n * nprobe];
    ScopeDeleter<float> del2 (coarse_dis);

    quantizer->search (n, x, nprobe, coarse_dis, idx);

    search_preassigned (n, x, k, idx, coarse_dis,
                        distances, labels, true);
    for (idx_t i = 0; i < n; ++i) {
        for (idx_t j = 0; j < k; ++j) {
            idx_t ij = i * k + j;
            idx_t key = labels[ij];
            float* reconstructed = recons + ij * d;
            if (key < 0) {
                // fill with NaNs
                memset (reconstructed, -1, sizeof(*reconstructed) * d);
            } else {
                int list_no = key >> 32;
                int offset = key & 0xffffffff;

                // update the label to the actual stored id
                labels[ij] = ids[list_no][offset];

                reconstruct_from_offset (list_no, offset, reconstructed);
            }
        }
    }
}
void IndexIVF::reconstruct_from_offset (long list_no, long offset,
                                        float* recons) const
{
    FAISS_THROW_MSG ("reconstruct_from_offset not implemented");
}
void IndexIVF::reset ()
{
    ntotal = 0;
    direct_map.clear ();
    for (size_t i = 0; i < ids.size(); i++) {
        ids[i].clear ();
        codes[i].clear ();
    }
}
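// remove_ids() compacts each inverted list in place; entries selected by
// `sel` are overwritten with the last entry of their list.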
long IndexIVF::remove_ids (const IDSelector & sel)
{
    FAISS_THROW_IF_NOT_MSG (!maintain_direct_map,
                            "direct map remove not implemented");
    long nremove = 0;
#pragma omp parallel for reduction(+: nremove)
    for (long i = 0; i < nlist; i++) {
        std::vector<idx_t> & idsi = ids[i];
        uint8_t * codesi = codes[i].data();

        long l = idsi.size(), j = 0;
        while (j < l) {
            if (sel.is_member (idsi[j])) {
                l--;
                idsi[j] = idsi[l];
                memmove (codesi + j * code_size,
                         codesi + l * code_size, code_size);
            } else {
                j++;
            }
        }
        if (l < idsi.size()) {
            nremove += idsi.size() - l;
            idsi.resize (l);
            codes[i].resize (l * code_size);
        }
    }
    ntotal -= nremove;
    return nremove;
}
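// Training is two-stage: train the level-1 (coarse) quantizer, then let
// subclasses train their encoders on the data (train_residual).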
void IndexIVF::train (idx_t n, const float *x)
{
    if (verbose)
        printf ("Training level-1 quantizer\n");
    train_q1 (n, x, verbose, metric_type);

    if (verbose)
        printf ("Training IVF residual\n");
    train_residual (n, x);
    is_trained = true;
}

void IndexIVF::train_residual (idx_t n, const float *x)
{
    if (verbose)
        printf ("IndexIVF: no residual training\n");
    // does nothing by default
}
double IndexIVF::imbalance_factor () const
{
    std::vector<int> hist (nlist);
    for (int i = 0; i < nlist; i++) {
        hist[i] = ids[i].size();
    }
    return faiss::imbalance_factor (nlist, hist.data());
}

void IndexIVF::print_stats () const
{
    std::vector<int> sizes (40);
    for (int i = 0; i < nlist; i++) {
        for (int j = 0; j < sizes.size(); j++) {
            if ((ids[i].size() >> j) == 0) {
                sizes[j]++;
                break;
            }
        }
    }
    for (int i = 0; i < sizes.size(); i++) {
        if (sizes[i]) {
            printf ("list size in < %d: %d instances\n",
                    1 << i, sizes[i]);
        }
    }
}
void IndexIVF::merge_from (IndexIVF &other, idx_t add_id)
{
    FAISS_THROW_IF_NOT (other.d == d);
    FAISS_THROW_IF_NOT_MSG (!maintain_direct_map &&
                            !other.maintain_direct_map,
                            "direct map copy not implemented");
    FAISS_THROW_IF_NOT_MSG (typeid (*this) == typeid (other),
                            "can only merge indexes of the same type");
    for (long i = 0; i < nlist; i++) {
        std::vector<idx_t> & src = other.ids[i];
        std::vector<idx_t> & dest = ids[i];
        for (long j = 0; j < src.size(); j++)
            dest.push_back (src[j] + add_id);
        src.clear ();
        codes[i].insert (codes[i].end(),
                         other.codes[i].begin(),
                         other.codes[i].end());
        other.codes[i].clear();
    }

    ntotal += other.ntotal;
    other.ntotal = 0;
}
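// copy_subset_to() copies part of the entries to `other`:
//   subset_type 0: ids in [a1, a2)
//   subset_type 1: ids such that id % a1 == a2
//   subset_type 2: a proportional slice of each inverted list, so that
//                  about a2 - a1 elements out of ntotal are copied in total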
void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
                               long a1, long a2) const
{
    FAISS_THROW_IF_NOT (nlist == other.nlist);
    FAISS_THROW_IF_NOT (!other.maintain_direct_map);
    FAISS_THROW_IF_NOT_FMT (
        subset_type == 0 || subset_type == 1 || subset_type == 2,
        "subset type %d not implemented", subset_type);

    size_t accu_n = 0;
    size_t accu_a1 = 0;
    size_t accu_a2 = 0;

    for (long list_no = 0; list_no < nlist; list_no++) {
        const std::vector<idx_t> & ids_in = ids[list_no];
        std::vector<idx_t> & ids_out = other.ids[list_no];
        const std::vector<uint8_t> & codes_in = codes[list_no];
        std::vector<uint8_t> & codes_out = other.codes[list_no];
        size_t n = ids_in.size();

        if (subset_type == 0) {
            for (long i = 0; i < n; i++) {
                idx_t id = ids_in[i];
                if (a1 <= id && id < a2) {
                    ids_out.push_back (id);
                    codes_out.insert (codes_out.end(),
                                      codes_in.begin() + i * code_size,
                                      codes_in.begin() + (i + 1) * code_size);
                    other.ntotal++;
                }
            }
        } else if (subset_type == 1) {
            for (long i = 0; i < n; i++) {
                idx_t id = ids_in[i];
                if (id % a1 == a2) {
                    ids_out.push_back (id);
                    codes_out.insert (codes_out.end(),
                                      codes_in.begin() + i * code_size,
                                      codes_in.begin() + (i + 1) * code_size);
                    other.ntotal++;
                }
            }
        } else if (subset_type == 2) {
            size_t next_accu_n = accu_n + n;
            size_t next_accu_a1 = next_accu_n * a1 / ntotal;
            size_t i1 = next_accu_a1 - accu_a1;
            size_t next_accu_a2 = next_accu_n * a2 / ntotal;
            size_t i2 = next_accu_a2 - accu_a2;
            ids_out.insert (ids_out.end(),
                            ids_in.begin() + i1,
                            ids_in.begin() + i2);
            codes_out.insert (codes_out.end(),
                              codes_in.begin() + i1 * code_size,
                              codes_in.begin() + i2 * code_size);
            other.ntotal += i2 - i1;
            accu_a1 = next_accu_a1;
            accu_a2 = next_accu_a2;
        }
        accu_n += n;
    }
    FAISS_ASSERT (accu_n == ntotal);
}
IndexIVF::~IndexIVF ()
{}
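// IndexIVFFlat stores the vectors verbatim (as raw floats) in the codes of
// the inverted lists.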
IndexIVFFlat::IndexIVFFlat (Index * quantizer,
                            size_t d, size_t nlist, MetricType metric):
    IndexIVF (quantizer, d, nlist, metric)
{
    code_size = sizeof(float) * d;
}
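// add_core() appends vectors to the inverted lists. List numbers are either
// given (precomputed_idx) or computed by the quantizer; the raw float
// vectors are stored as codes of code_size = d * sizeof(float) bytes.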
void IndexIVFFlat::add_core (idx_t n, const float * x, const long *xids,
                             const long *precomputed_idx)
{
    FAISS_THROW_IF_NOT (is_trained);
    FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids),
                            "cannot have direct map and add with ids");
    const long * idx;
    ScopeDeleter<long> del;

    if (precomputed_idx) {
        idx = precomputed_idx;
    } else {
        long * idx0 = new long [n];
        quantizer->assign (n, x, idx0);
        idx = idx0;
        del.set (idx);
    }

    long n_add = 0;
    for (size_t i = 0; i < n; i++) {
        long id = xids ? xids[i] : ntotal + i;
        long list_no = idx [i];
        if (list_no < 0)
            continue;
        assert (list_no < nlist);

        ids[list_no].push_back (id);
        const float *xi = x + i * d;
        /* store the vector */
        size_t ofs = codes[list_no].size();
        codes[list_no].resize (ofs + code_size);
        memcpy (codes[list_no].data() + ofs,
                xi, code_size);

        if (maintain_direct_map)
            direct_map.push_back (list_no << 32 | (ids[list_no].size() - 1));
        n_add++;
    }
    if (verbose) {
        printf ("IndexIVFFlat::add_core: added %ld / %ld vectors\n",
                n_add, n);
    }
    ntotal += n_add;
}
void IndexIVFStats::reset ()
{
    memset ((void*) this, 0, sizeof (*this));
}

IndexIVFStats indexIVF_stats;
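// Scan the nprobe selected lists for each query and keep the k largest
// inner products in a min-heap. With store_pairs, the reported id packs
// (list_no << 32 | offset) instead of the stored id.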
void search_knn_inner_product (const IndexIVFFlat & ivf,
                               size_t nx,
                               const float * x,
                               const long * keys,
                               float_minheap_array_t * res,
                               bool store_pairs)
{
    const size_t k = res->k;
    size_t nlistv = 0, ndis = 0;
    size_t d = ivf.d;

#pragma omp parallel for reduction(+: nlistv, ndis)
    for (size_t i = 0; i < nx; i++) {
        const float * xi = x + i * d;
        const long * keysi = keys + i * ivf.nprobe;
        float * __restrict simi = res->get_val (i);
        long * __restrict idxi = res->get_ids (i);
        minheap_heapify (k, simi, idxi);

        size_t nscan = 0;

        for (size_t ik = 0; ik < ivf.nprobe; ik++) {
            long key = keysi[ik];  /* select the list */
            if (key < 0) {
                // not enough centroids for multiprobe
                continue;
            }
            FAISS_THROW_IF_NOT_FMT (
                key < (long) ivf.nlist,
                "Invalid key=%ld at ik=%ld nlist=%ld\n",
                key, ik, ivf.nlist);

            nlistv++;
            const size_t list_size = ivf.ids[key].size();
            const float * list_vecs = (const float*)(ivf.codes[key].data());

            for (size_t j = 0; j < list_size; j++) {
                const float * yj = list_vecs + d * j;
                float ip = fvec_inner_product (xi, yj, d);
                if (ip > simi[0]) {
                    minheap_pop (k, simi, idxi);
                    long id = store_pairs ? (key << 32 | j) : ivf.ids[key][j];
                    minheap_push (k, simi, idxi, ip, id);
                }
            }
            nscan += list_size;
            ndis += list_size;
            if (ivf.max_codes && nscan >= ivf.max_codes)
                break;
        }
        minheap_reorder (k, simi, idxi);
    }
    indexIVF_stats.nq += nx;
    indexIVF_stats.nlist += nlistv;
    indexIVF_stats.ndis += ndis;
}
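// Same scan for the L2 metric: a max-heap keeps the k smallest squared
// distances.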
void search_knn_L2sqr (const IndexIVFFlat &ivf,
                       size_t nx,
                       const float * x,
                       const long * keys,
                       float_maxheap_array_t * res,
                       bool store_pairs)
{
    const size_t k = res->k;
    size_t nlistv = 0, ndis = 0;
    size_t d = ivf.d;

#pragma omp parallel for reduction(+: nlistv, ndis)
    for (size_t i = 0; i < nx; i++) {
        const float * xi = x + i * d;
        const long * keysi = keys + i * ivf.nprobe;
        float * __restrict disi = res->get_val (i);
        long * __restrict idxi = res->get_ids (i);
        maxheap_heapify (k, disi, idxi);

        size_t nscan = 0;

        for (size_t ik = 0; ik < ivf.nprobe; ik++) {
            long key = keysi[ik];  /* select the list */
            if (key < 0) {
                // not enough centroids for multiprobe
                continue;
            }
            FAISS_THROW_IF_NOT_FMT (
                key < (long) ivf.nlist,
                "Invalid key=%ld at ik=%ld nlist=%ld\n",
                key, ik, ivf.nlist);

            nlistv++;
            const size_t list_size = ivf.ids[key].size();
            const float * list_vecs = (const float*)(ivf.codes[key].data());

            for (size_t j = 0; j < list_size; j++) {
                const float * yj = list_vecs + d * j;
                float disij = fvec_L2sqr (xi, yj, d);
                if (disij < disi[0]) {
                    maxheap_pop (k, disi, idxi);
                    long id = store_pairs ? (key << 32 | j) : ivf.ids[key][j];
                    maxheap_push (k, disi, idxi, disij, id);
                }
            }
            nscan += list_size;
            ndis += list_size;
            if (ivf.max_codes && nscan >= ivf.max_codes)
                break;
        }
        maxheap_reorder (k, disi, idxi);
    }
    indexIVF_stats.nq += nx;
    indexIVF_stats.nlist += nlistv;
    indexIVF_stats.ndis += ndis;
}
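// Dispatch on the metric type: inner product uses min-heaps, L2 max-heaps.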
void IndexIVFFlat::search_preassigned (idx_t n, const float *x, idx_t k,
                                       const idx_t *idx,
                                       const float * /* coarse_dis */,
                                       float *distances, idx_t *labels,
                                       bool store_pairs) const
{
    if (metric_type == METRIC_INNER_PRODUCT) {
        float_minheap_array_t res = {
            size_t(n), size_t(k), labels, distances};
        search_knn_inner_product (*this, n, x, idx, &res, store_pairs);

    } else if (metric_type == METRIC_L2) {
        float_maxheap_array_t res = {
            size_t(n), size_t(k), labels, distances};
        search_knn_L2sqr (*this, n, x, idx, &res, store_pairs);
    }
}
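// range_search() collects, for each query, all entries within `radius`
// (distance below radius for L2, score above radius for inner product)
// into a RangeSearchPartialResult merged into `result`.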
void IndexIVFFlat::range_search (idx_t nx, const float *x, float radius,
                                 RangeSearchResult *result) const
{
    idx_t * keys = new idx_t [nx * nprobe];
    ScopeDeleter<idx_t> del (keys);
    quantizer->assign (nx, x, keys, nprobe);

#pragma omp parallel
    {
        RangeSearchPartialResult pres (result);

        for (size_t i = 0; i < nx; i++) {
            const float * xi = x + i * d;
            const long * keysi = keys + i * nprobe;

            RangeSearchPartialResult::QueryResult & qres =
                pres.new_result (i);

            for (size_t ik = 0; ik < nprobe; ik++) {
                long key = keysi[ik];  /* select the list */
                if (key < 0 || key >= (long) nlist) {
                    fprintf (stderr,
                             "Invalid key=%ld at ik=%ld nlist=%ld\n",
                             key, ik, nlist);
                    throw;
                }

                const size_t list_size = ids[key].size();
                const float * list_vecs =
                    (const float *)(codes[key].data());

                for (size_t j = 0; j < list_size; j++) {
                    const float * yj = list_vecs + d * j;
                    if (metric_type == METRIC_L2) {
                        float disij = fvec_L2sqr (xi, yj, d);
                        if (disij < radius) {
                            qres.add (disij, ids[key][j]);
                        }
                    } else if (metric_type == METRIC_INNER_PRODUCT) {
                        float disij = fvec_inner_product (xi, yj, d);
                        if (disij > radius) {
                            qres.add (disij, ids[key][j]);
                        }
                    }
                }
            }
        }
        pres.finalize ();
    }
}
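// update_vectors() replaces stored vectors in place: each id is removed from
// its current list (the last entry fills the hole) and re-added to the list
// its new value is assigned to, keeping direct_map consistent.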
void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
{
    FAISS_THROW_IF_NOT (maintain_direct_map);
    FAISS_THROW_IF_NOT (is_trained);
    std::vector<idx_t> assign (n);
    quantizer->assign (n, x, assign.data());

    for (int i = 0; i < n; i++) {
        idx_t id = new_ids[i];
        FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal,
                                "id to update out of range");
        { // remove the vector from its current list
            long dm = direct_map[id];
            long ofs = dm & 0xffffffff;
            long il = dm >> 32;
            size_t l = ids[il].size();
            if (ofs != l - 1) {
                // move the last entry of the list into the freed slot
                long id2 = ids[il].back();
                ids[il][ofs] = id2;
                direct_map[id2] = (il << 32) | ofs;
                float * vecs = (float*)codes[il].data();
                memcpy (vecs + ofs * d,
                        vecs + (l - 1) * d,
                        d * sizeof(float));
            }
            ids[il].pop_back ();
            codes[il].resize ((l - 1) * code_size);
        }
        { // insert the new vector in the list it is assigned to
            long il = assign[i];
            size_t l = ids[il].size();
            long dm = (il << 32) | l;
            ids[il].push_back (id);
            direct_map[id] = dm;
            codes[il].resize ((l + 1) * code_size);
            float * vecs = (float*)codes[il].data();
            memcpy (vecs + l * d,
                    x + i * d,
                    d * sizeof(float));
        }
    }
}

void IndexIVFFlat::reconstruct_from_offset (long list_no, long offset,
                                            float* recons) const
{
    memcpy (recons, &codes[list_no][offset * code_size],
            d * sizeof(recons[0]));
}