20 #include "FaissAssert.h"
21 #include "IndexFlat.h"
22 #include "AuxIndexStructures.h"
31 Level1Quantizer::Level1Quantizer (Index * quantizer,
size_t nlist):
32 quantizer (quantizer),
34 quantizer_trains_alone (0),
36 clustering_index (nullptr)
44 Level1Quantizer::Level1Quantizer ():
47 quantizer_trains_alone (0), own_fields (false),
48 clustering_index (nullptr)
51 Level1Quantizer::~Level1Quantizer ()
61 printf (
"IVF quantizer does not need training.\n");
64 printf (
"IVF quantizer trains alone...\n");
68 "nlist not consistent with quantizer size");
71 printf (
"Training level-1 quantizer on %ld vectors in %ldD\n",
86 "Training L2 quantizer on %ld vectors in %ldD%s\n",
89 FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
93 clus.
train(n, x, assigner);
98 printf (
"Adding centroids to quantizer\n");
108 InvertedLists::InvertedLists (
size_t nlist,
size_t code_size):
109 nlist (nlist), code_size (code_size)
114 InvertedLists::~InvertedLists ()
119 size_t list_no,
size_t offset)
const
122 return get_ids(list_no)[offset];
130 size_t list_no,
size_t offset)
const
139 return add_entries (list_no, 1, &theid, code);
142 void InvertedLists::update_entry (
size_t list_no,
size_t offset,
143 idx_t
id,
const uint8_t *code)
145 update_entries (list_no, offset, 1, &
id, code);
148 void InvertedLists::reset () {
149 for (
size_t i = 0; i <
nlist; i++) {
158 ArrayInvertedLists::ArrayInvertedLists (
size_t nlist,
size_t code_size):
159 InvertedLists (nlist, code_size)
162 codes.resize (nlist);
165 size_t ArrayInvertedLists::add_entries (
166 size_t list_no,
size_t n_entry,
167 const idx_t* ids_in,
const uint8_t *code)
169 if (n_entry == 0)
return 0;
170 assert (list_no < nlist);
171 size_t o =
ids [list_no].size();
172 ids [list_no].resize (o + n_entry);
173 memcpy (&
ids[list_no][o], ids_in,
sizeof (ids_in[0]) * n_entry);
174 codes [list_no].resize ((o + n_entry) * code_size);
175 memcpy (&codes[list_no][o * code_size], code, code_size * n_entry);
181 assert (list_no < nlist);
182 return ids[list_no].size();
187 assert (list_no < nlist);
188 return codes[list_no].data();
193 assert (list_no < nlist);
194 return ids[list_no].data();
197 void ArrayInvertedLists::resize (
size_t list_no,
size_t new_size)
199 ids[list_no].resize (new_size);
200 codes[list_no].resize (new_size * code_size);
203 void ArrayInvertedLists::update_entries (
204 size_t list_no,
size_t offset,
size_t n_entry,
205 const idx_t *ids_in,
const uint8_t *codes_in)
207 assert (list_no < nlist);
208 assert (n_entry + offset <=
ids[list_no].size());
209 memcpy (&
ids[list_no][offset], ids_in,
sizeof(ids_in[0]) * n_entry);
210 memcpy (&codes[list_no][offset * code_size], codes_in, code_size * n_entry);
214 ArrayInvertedLists::~ArrayInvertedLists ()
224 IndexIVF::IndexIVF (
Index * quantizer,
size_t d,
225 size_t nlist,
size_t code_size,
231 code_size (code_size),
234 maintain_direct_map (false)
236 FAISS_THROW_IF_NOT (d == quantizer->
d);
245 IndexIVF::IndexIVF ():
246 invlists (nullptr), own_invlists (false),
248 nprobe (1), max_codes (0),
249 maintain_direct_map (false)
263 if (new_maintain_direct_map) {
264 direct_map.resize (
ntotal, -1);
265 for (
size_t key = 0; key <
nlist; key++) {
269 for (
long ofs = 0; ofs < list_size; ofs++) {
270 FAISS_THROW_IF_NOT_MSG (
271 0 <= idlist [ofs] && idlist[ofs] <
ntotal,
272 "direct map supported only for seuquential ids");
273 direct_map [idlist [ofs]] = key << 32 | ofs;
284 float *distances,
idx_t *labels)
const
286 long * idx =
new long [n *
nprobe];
288 float * coarse_dis =
new float [n *
nprobe];
296 distances, labels,
false);
303 FAISS_THROW_IF_NOT_MSG (direct_map.size() ==
ntotal,
304 "direct map is not initialized");
305 long list_no = direct_map[key] >> 32;
306 long offset = direct_map[key] & 0xffffffff;
313 FAISS_THROW_IF_NOT (ni == 0 || (i0 >= 0 && i0 + ni <=
ntotal));
315 for (
long list_no = 0; list_no <
nlist; list_no++) {
319 for (
long offset = 0; offset < list_size; offset++) {
320 long id = idlist[offset];
321 if (!(
id >= i0 &&
id < i0 + ni)) {
325 float* reconstructed = recons + (
id - i0) *
d;
333 float *distances,
idx_t *labels,
336 long * idx =
new long [n *
nprobe];
338 float * coarse_dis =
new float [n *
nprobe];
348 distances, labels,
true );
349 for (
idx_t i = 0; i < n; ++i) {
350 for (
idx_t j = 0; j < k; ++j) {
351 idx_t ij = i * k + j;
352 idx_t key = labels[ij];
353 float* reconstructed = recons + ij *
d;
356 memset(reconstructed, -1,
sizeof(*reconstructed) * d);
358 int list_no = key >> 32;
359 int offset = key & 0xffffffff;
373 FAISS_THROW_MSG (
"reconstruct_from_offset not implemented");
387 "direct map remove not implemented");
389 std::vector<long> toremove(nlist);
391 #pragma omp parallel for
392 for (
long i = 0; i <
nlist; i++) {
396 if (sel.is_member (idsi[j])) {
406 toremove[i] = l0 - l;
410 for (
long i = 0; i <
nlist; i++) {
411 if (toremove[i] > 0) {
412 nremove += toremove[i];
427 printf (
"Training level-1 quantizer\n");
432 printf (
"Training IVF residual\n");
441 printf(
"IndexIVF: no residual training\n");
449 std::vector<int> hist (nlist);
450 for (
int i = 0; i <
nlist; i++) {
458 std::vector<int> sizes(40);
459 for (
int i = 0; i <
nlist; i++) {
460 for (
int j = 0; j < sizes.size(); j++) {
467 for (
int i = 0; i < sizes.size(); i++) {
469 printf (
"list size in < %d: %d instances\n",
479 FAISS_THROW_IF_NOT (other.
d ==
d);
480 FAISS_THROW_IF_NOT (other.
nlist == nlist);
481 FAISS_THROW_IF_NOT (other.
code_size == code_size);
484 "direct map copy not implemented");
485 FAISS_THROW_IF_NOT_MSG (
typeid (*
this) ==
typeid (other),
486 "can only merge indexes of the same type");
489 #pragma omp parallel for
490 for (
long i = 0; i <
nlist; i++) {
494 invlists->add_entries (i, list_size, ids,
497 std::vector <idx_t> new_ids (list_size);
499 for (
size_t j = 0; j < list_size; j++) {
500 new_ids [j] = ids[j] + add_id;
503 invlists->add_entries (i, list_size, new_ids.data(),
514 void IndexIVF::replace_invlists (
InvertedLists *il,
bool own)
517 FAISS_THROW_IF_NOT (il->
nlist == nlist &&
528 long a1,
long a2)
const
531 FAISS_THROW_IF_NOT (nlist == other.
nlist);
532 FAISS_THROW_IF_NOT (code_size == other.
code_size);
534 FAISS_THROW_IF_NOT_FMT (
535 subset_type == 0 || subset_type == 1 || subset_type == 2,
536 "subset type %d not implemented", subset_type);
544 for (
long list_no = 0; list_no <
nlist; list_no++) {
548 if (subset_type == 0) {
549 for (
long i = 0; i < n; i++) {
550 idx_t id = ids_in[i];
551 if (a1 <=
id &&
id < a2) {
558 }
else if (subset_type == 1) {
559 for (
long i = 0; i < n; i++) {
560 idx_t id = ids_in[i];
568 }
else if (subset_type == 2) {
570 size_t next_accu_n = accu_n + n;
571 size_t next_accu_a1 = next_accu_n * a1 /
ntotal;
572 size_t i1 = next_accu_a1 - accu_a1;
573 size_t next_accu_a2 = next_accu_n * a2 /
ntotal;
574 size_t i2 = next_accu_a2 - accu_a2;
576 for (
long i = i1; i < i2; i++) {
583 accu_a1 = next_accu_a1;
584 accu_a2 = next_accu_a2;
588 FAISS_ASSERT(accu_n ==
ntotal);
594 IndexIVF::~IndexIVF()
602 void IndexIVFStats::reset()
604 memset ((
void*)
this, 0,
sizeof (*
this));
608 IndexIVFStats indexIVF_stats;
virtual void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs) const =0
const uint8_t * get_codes(size_t list_no) const override
const idx_t * get_ids(size_t list_no) const override
virtual const idx_t * get_ids(size_t list_no) const =0
double imbalance_factor() const
1 = perfectly balanced, >1: imbalanced
void search_and_reconstruct(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, float *recons) const override
virtual void reset()=0
removes all elements from the database.
virtual void copy_subset_to(IndexIVF &other, int subset_type, long a1, long a2) const
virtual void reconstruct_from_offset(long list_no, long offset, float *recons) const
size_t nprobe
number of probes at query time
virtual size_t list_size(size_t list_no) const =0
get the size of a list
void reconstruct(idx_t key, float *recons) const override
virtual void train(idx_t n, const float *x)
virtual void add_with_ids(idx_t n, const float *x, const long *xids)
virtual void train_residual(idx_t n, const float *x)
double imbalance_factor(int n, int k, const long *assign)
a balanced assignment has an IF of 1
virtual idx_t get_single_id(size_t list_no, size_t offset) const
size_t code_size
code size per vector in bytes
void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
virtual const uint8_t * get_single_code(size_t list_no, size_t offset) const
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
char quantizer_trains_alone
virtual void add(idx_t n, const float *x)=0
virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t *code)
add one entry to an inverted list
long idx_t
all indices are this type
ClusteringParameters cp
to override default clustering params
idx_t ntotal
total nb of indexed vectors
bool verbose
verbosity level
void reset() override
removes all elements from the database.
std::vector< float > centroids
centroids (k * d)
virtual void prefetch_lists(const long *list_nos, int nlist) const
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
Index * clustering_index
to override index used during clustering
void train_q1(size_t n, const float *x, bool verbose, MetricType metric_type)
Trains the quantizer and calls train_residual to train sub-quantizers.
size_t nlist
number of possible key values
void make_direct_map(bool new_maintain_direct_map=true)
MetricType metric_type
type of metric this index uses for search
void print_stats() const
display some stats about the inverted lists
InvertedLists * invlists
Access to the actual data.
std::vector< std::vector< idx_t > > ids
Inverted lists for indexes.
void add(idx_t n, const float *x) override
Quantizes x and calls add_with_key.
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
virtual const uint8_t * get_codes(size_t list_no) const =0
Index * quantizer
quantizer that maps vectors to inverted lists
bool is_trained
set if the Index does not require training, or if training is done already
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
bool spherical
do we want normalized centroids?
bool own_fields
whether object owns the quantizer
size_t list_size(size_t list_no) const override
get the size of a list
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
virtual void merge_from(IndexIVF &other, idx_t add_id)
size_t nlist
number of possible key values
size_t code_size
code size per vector in bytes
MetricType
Some algorithms support both an inner product version and a L2 search version.