18 #include "FaissAssert.h"
19 #include "IndexFlat.h"
20 #include "AuxIndexStructures.h"
24 using ScopedIds = InvertedLists::ScopedIds;
25 using ScopedCodes = InvertedLists::ScopedCodes;
32 Level1Quantizer::Level1Quantizer (Index * quantizer,
size_t nlist):
33 quantizer (quantizer),
35 quantizer_trains_alone (0),
37 clustering_index (nullptr)
45 Level1Quantizer::Level1Quantizer ():
48 quantizer_trains_alone (0), own_fields (false),
49 clustering_index (nullptr)
52 Level1Quantizer::~Level1Quantizer ()
62 printf (
"IVF quantizer does not need training.\n");
65 printf (
"IVF quantizer trains alone...\n");
69 "nlist not consistent with quantizer size");
72 printf (
"Training level-1 quantizer on %ld vectors in %ldD\n",
87 "Training L2 quantizer on %ld vectors in %ldD%s\n",
90 FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
94 clus.
train(n, x, assigner);
99 printf (
"Adding centroids to quantizer\n");
111 IndexIVF::IndexIVF (
Index * quantizer,
size_t d,
112 size_t nlist,
size_t code_size,
118 code_size (code_size),
121 maintain_direct_map (false)
123 FAISS_THROW_IF_NOT (d == quantizer->
d);
132 IndexIVF::IndexIVF ():
133 invlists (nullptr), own_invlists (false),
135 nprobe (1), max_codes (0),
136 maintain_direct_map (false)
150 if (new_maintain_direct_map) {
151 direct_map.resize (
ntotal, -1);
152 for (
size_t key = 0; key <
nlist; key++) {
156 for (
long ofs = 0; ofs < list_size; ofs++) {
157 FAISS_THROW_IF_NOT_MSG (
158 0 <= idlist [ofs] && idlist[ofs] <
ntotal,
159 "direct map supported only for seuquential ids");
160 direct_map [idlist [ofs]] = key << 32 | ofs;
171 float *distances,
idx_t *labels)
const
173 long * idx =
new long [n *
nprobe];
175 float * coarse_dis =
new float [n *
nprobe];
183 distances, labels,
false);
191 const float *coarse_dis ,
192 float *distances,
idx_t *labels,
199 size_t nlistv = 0, ndis = 0, nheap = 0;
204 #pragma omp parallel reduction(+: nlistv, ndis, nheap)
209 for (
size_t i = 0; i < n; i++) {
211 const float * xi = x + i *
d;
213 const long * keysi = keys + i *
nprobe;
214 float * simi = distances + i * k;
215 long * idxi = labels + i * k;
218 heap_heapify<HeapForIP> (k, simi, idxi);
220 heap_heapify<HeapForL2> (k, simi, idxi);
226 for (
size_t ik = 0; ik <
nprobe; ik++) {
227 long key = keysi[ik];
232 FAISS_THROW_IF_NOT_FMT (key < (
long)
nlist,
233 "Invalid key=%ld at ik=%ld nlist=%ld\n",
240 if (list_size == 0) {
244 scanner->
set_list (key, coarse_dis[i * nprobe + ik]);
253 nheap += scanner->
scan_codes (list_size, scodes.get(),
261 if (max_codes && nscan >= max_codes)
267 heap_reorder<HeapForIP> (k, simi, idxi);
269 heap_reorder<HeapForL2> (k, simi, idxi);
275 indexIVF_stats.nq += n;
276 indexIVF_stats.nlist += nlistv;
277 indexIVF_stats.ndis += ndis;
278 indexIVF_stats.nheap_updates += nheap;
286 FAISS_THROW_IF_NOT_MSG (direct_map.size() ==
ntotal,
287 "direct map is not initialized");
288 long list_no = direct_map[key] >> 32;
289 long offset = direct_map[key] & 0xffffffff;
296 FAISS_THROW_IF_NOT (ni == 0 || (i0 >= 0 && i0 + ni <=
ntotal));
298 for (
long list_no = 0; list_no <
nlist; list_no++) {
302 for (
long offset = 0; offset < list_size; offset++) {
303 long id = idlist[offset];
304 if (!(
id >= i0 &&
id < i0 + ni)) {
308 float* reconstructed = recons + (
id - i0) *
d;
316 float *distances,
idx_t *labels,
319 long * idx =
new long [n *
nprobe];
321 float * coarse_dis =
new float [n *
nprobe];
331 distances, labels,
true );
332 for (
idx_t i = 0; i < n; ++i) {
333 for (
idx_t j = 0; j < k; ++j) {
334 idx_t ij = i * k + j;
335 idx_t key = labels[ij];
336 float* reconstructed = recons + ij *
d;
339 memset(reconstructed, -1,
sizeof(*reconstructed) * d);
341 int list_no = key >> 32;
342 int offset = key & 0xffffffff;
357 FAISS_THROW_MSG (
"reconstruct_from_offset not implemented");
371 "direct map remove not implemented");
373 std::vector<long> toremove(
nlist);
375 #pragma omp parallel for
376 for (
long i = 0; i <
nlist; i++) {
380 if (sel.is_member (idsi[j])) {
390 toremove[i] = l0 - l;
394 for (
long i = 0; i <
nlist; i++) {
395 if (toremove[i] > 0) {
396 nremove += toremove[i];
411 printf (
"Training level-1 quantizer\n");
416 printf (
"Training IVF residual\n");
425 printf(
"IndexIVF: no residual training\n");
433 std::vector<int> hist (
nlist);
434 for (
int i = 0; i <
nlist; i++) {
442 std::vector<int> sizes(40);
443 for (
int i = 0; i <
nlist; i++) {
444 for (
int j = 0; j < sizes.size(); j++) {
451 for (
int i = 0; i < sizes.size(); i++) {
453 printf (
"list size in < %d: %d instances\n",
464 FAISS_THROW_IF_NOT (other.
d ==
d);
467 FAISS_THROW_IF_NOT_MSG (
typeid (*
this) ==
typeid (other),
468 "can only merge indexes of the same type");
477 "direct map copy not implemented");
500 long a1,
long a2)
const
506 FAISS_THROW_IF_NOT_FMT (
507 subset_type == 0 || subset_type == 1 || subset_type == 2,
508 "subset type %d not implemented", subset_type);
516 for (
long list_no = 0; list_no <
nlist; list_no++) {
520 if (subset_type == 0) {
521 for (
long i = 0; i < n; i++) {
522 idx_t id = ids_in[i];
523 if (a1 <=
id &&
id < a2) {
530 }
else if (subset_type == 1) {
531 for (
long i = 0; i < n; i++) {
532 idx_t id = ids_in[i];
540 }
else if (subset_type == 2) {
542 size_t next_accu_n = accu_n + n;
543 size_t next_accu_a1 = next_accu_n * a1 /
ntotal;
544 size_t i1 = next_accu_a1 - accu_a1;
545 size_t next_accu_a2 = next_accu_n * a2 /
ntotal;
546 size_t i2 = next_accu_a2 - accu_a2;
548 for (
long i = i1; i < i2; i++) {
555 accu_a1 = next_accu_a1;
556 accu_a2 = next_accu_a2;
560 FAISS_ASSERT(accu_n ==
ntotal);
565 IndexIVF::~IndexIVF()
573 void IndexIVFStats::reset()
575 memset ((
void*)
this, 0,
sizeof (*
this));
579 IndexIVFStats indexIVF_stats;
virtual void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const
simple (default) implementation as an array of inverted lists
void check_compatible_for_merge(const IndexIVF &other) const
virtual const idx_t * get_ids(size_t list_no) const =0
double imbalance_factor() const
1= perfectly balanced, >1: imbalanced
void search_and_reconstruct(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, float *recons) const override
virtual void reset()=0
removes all elements from the database.
virtual void copy_subset_to(IndexIVF &other, int subset_type, long a1, long a2) const
virtual void reconstruct_from_offset(long list_no, long offset, float *recons) const
size_t nprobe
number of probes at query time
virtual size_t list_size(size_t list_no) const =0
get the size of a list
void reconstruct(idx_t key, float *recons) const override
virtual void train(idx_t n, const float *x)
virtual void add_with_ids(idx_t n, const float *x, const long *xids)
virtual void train_residual(idx_t n, const float *x)
double imbalance_factor(int n, int k, const long *assign)
a balanced assignment has a IF of 1
void merge_from(InvertedLists *oivf, size_t add_id)
move all entries from oivf (empty on output)
virtual idx_t get_single_id(size_t list_no, size_t offset) const
size_t code_size
code size per vector in bytes
void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
char quantizer_trains_alone
virtual void add(idx_t n, const float *x)=0
virtual void set_list(idx_t list_no, float coarse_dis)=0
following codes come from this inverted list
virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t *code)
add one entry to an inverted list
virtual size_t scan_codes(size_t n, const uint8_t *codes, const idx_t *ids, float *distances, idx_t *labels, size_t k) const =0
long idx_t
all indices are this type
void replace_invlists(InvertedLists *il, bool own=false)
replace the inverted lists; the old one is deallocated if own_invlists is set
ClusteringParameters cp
to override default clustering params
idx_t ntotal
total nb of indexed vectors
bool verbose
verbosity level
void reset() override
removes all elements from the database.
std::vector< float > centroids
centroids (k * d)
virtual void prefetch_lists(const long *list_nos, int nlist) const
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
Index * clustering_index
to override index used during clustering
void train_q1(size_t n, const float *x, bool verbose, MetricType metric_type)
Trains the level-1 (coarse) quantizer on a set of vectors; does not train sub-quantizers.
size_t nprobe
number of probes at query time
size_t nlist
number of possible key values
void make_direct_map(bool new_maintain_direct_map=true)
MetricType metric_type
type of metric this index uses for search
void print_stats() const
display some stats about the inverted lists
InvertedLists * invlists
Access to the actual data.
void add(idx_t n, const float *x) override
Calls add_with_ids with NULL ids.
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
size_t max_codes
max nb of codes to visit to do a query
Index * quantizer
quantizer that maps vectors to inverted lists
bool is_trained
set if the Index does not require training, or if training is done already
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
size_t max_codes
max nb of codes to visit to do a query
virtual InvertedListScanner * get_InvertedListScanner(bool store_pairs=false) const
get a scanner for this index (store_pairs means ignore labels)
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
bool spherical
do we want normalized centroids?
bool own_fields
whether object owns the quantizer
virtual void set_query(const float *query_vector)=0
from now on we handle this query.
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
virtual void release_ids(const idx_t *ids) const
release ids returned by get_ids
virtual void merge_from(IndexIVF &other, idx_t add_id)
size_t nlist
number of possible key values
size_t code_size
code size per vector in bytes
MetricType
Some algorithms support both an inner product version and an L2 search version.