12 #include "IndexBinaryIVF.h"
20 #include "AuxIndexStructures.h"
21 #include "FaissAssert.h"
22 #include "IndexFlat.h"
27 IndexBinaryIVF::IndexBinaryIVF(
IndexBinary *quantizer,
size_t d,
size_t nlist)
33 maintain_direct_map(false),
37 clustering_index(nullptr)
39 FAISS_THROW_IF_NOT (d == quantizer->
d);
45 IndexBinaryIVF::IndexBinaryIVF()
50 maintain_direct_map(false),
54 clustering_index(nullptr)
66 const long *precomputed_idx) {
70 "cannot have direct map and add with ids");
74 std::unique_ptr<long[]> scoped_idx;
76 if (precomputed_idx) {
77 idx = precomputed_idx;
79 scoped_idx.reset(
new long[n]);
81 idx = scoped_idx.get();
85 for (
size_t i = 0; i < n; i++) {
86 long id = xids ? xids[i] :
ntotal + i;
87 long list_no = idx[i];
95 direct_map.push_back(list_no << 32 | offset);
99 printf(
"IndexBinaryIVF::add_with_ids: added %ld / %ld vectors\n",
110 if (new_maintain_direct_map) {
111 direct_map.resize(
ntotal, -1);
112 for (
size_t key = 0; key <
nlist; key++) {
116 for (
long ofs = 0; ofs < list_size; ofs++) {
117 FAISS_THROW_IF_NOT_MSG(0 <= idlist[ofs] && idlist[ofs] <
ntotal,
118 "direct map supported only for seuquential ids");
119 direct_map[idlist[ofs]] = key << 32 | ofs;
129 int32_t *distances,
idx_t *labels)
const {
130 std::unique_ptr<idx_t[]> idx(
new idx_t[n *
nprobe]);
131 std::unique_ptr<int32_t[]> coarse_dis(
new int32_t[n * nprobe]);
138 distances, labels,
false);
142 FAISS_THROW_IF_NOT_MSG(direct_map.size() ==
ntotal,
143 "direct map is not initialized");
144 long list_no = direct_map[key] >> 32;
145 long offset = direct_map[key] & 0xffffffff;
150 FAISS_THROW_IF_NOT(ni == 0 || (i0 >= 0 && i0 + ni <=
ntotal));
152 for (
long list_no = 0; list_no <
nlist; list_no++) {
156 for (
long offset = 0; offset < list_size; offset++) {
157 long id = idlist[offset];
158 if (!(
id >= i0 &&
id < i0 + ni)) {
162 uint8_t *reconstructed = recons + (
id - i0) *
d;
169 int32_t *distances,
idx_t *labels,
170 uint8_t *recons)
const {
171 std::unique_ptr<idx_t[]> idx(
new long[n *
nprobe]);
172 std::unique_ptr<int32_t[]> coarse_dis(
new int32_t[n * nprobe]);
181 distances, labels,
true);
182 for (
idx_t i = 0; i < n; ++i) {
183 for (
idx_t j = 0; j < k; ++j) {
184 idx_t ij = i * k + j;
185 idx_t key = labels[ij];
186 uint8_t *reconstructed = recons + ij *
d;
189 memset(reconstructed, -1,
sizeof(*reconstructed) * d);
191 int list_no = key >> 32;
192 int offset = key & 0xffffffff;
204 uint8_t *recons)
const {
216 "direct map remove not implemented");
218 std::vector<long> toremove(
nlist);
220 #pragma omp parallel for
221 for (
long i = 0; i <
nlist; i++) {
225 if (sel.is_member(idsi[j])) {
235 toremove[i] = l0 - l;
239 for (
long i = 0; i <
nlist; i++) {
240 if (toremove[i] > 0) {
241 nremove += toremove[i];
252 printf(
"Training level-1 quantizer\n");
260 std::vector<int> hist(
nlist);
262 for (
int i = 0; i <
nlist; i++) {
270 std::vector<int> sizes(40);
271 for (
int i = 0; i <
nlist; i++) {
272 for (
int j = 0; j < sizes.size(); j++) {
279 for (
int i = 0; i < sizes.size(); i++) {
281 printf(
"list size in < %d: %d instances\n", 1 << i, sizes[i]);
288 FAISS_THROW_IF_NOT(other.
d ==
d);
293 "direct map copy not implemented");
294 FAISS_THROW_IF_NOT_MSG(
typeid (*
this) ==
typeid (other),
295 "can only merge indexes of the same type");
303 void IndexBinaryIVF::replace_invlists(
InvertedLists *il,
bool own) {
317 printf(
"IVF quantizer does not need training.\n");
320 printf(
"Training level-1 quantizer on %ld vectors in %dD\n", n,
d);
325 std::unique_ptr<float[]> x_f(
new float[n *
d]);
331 printf(
"using clustering_index of dimension %d to do the clustering\n",
337 std::unique_ptr<uint8_t[]> x_b(
new uint8_t[clus.
k *
code_size]);
351 template<
class HammingComputer,
bool store_pairs>
357 IVFBinaryScannerL2 (
size_t code_size): code_size (code_size)
360 void set_query (
const uint8_t *query_vector)
override {
361 hc.set (query_vector, code_size);
365 void set_list (idx_t list_no, uint8_t )
override {
366 this->list_no = list_no;
369 uint32_t distance_to_code (
const uint8_t *code)
const override {
370 return hc.hamming (code);
373 size_t scan_codes (
size_t n,
374 const uint8_t *codes,
376 int32_t *simi, idx_t *idxi,
377 size_t k)
const override
379 using C = CMax<int32_t, idx_t>;
382 for (
size_t j = 0; j < n; j++) {
383 uint32_t dis = hc.hamming (codes);
385 heap_pop<C> (k, simi, idxi);
386 long id = store_pairs ? (list_no << 32 | j) : ids[j];
387 heap_push<C> (k, simi, idxi, dis, id);
399 template <
bool store_pairs>
400 BinaryInvertedListScanner *select_IVFBinaryScannerL2 (
size_t code_size) {
403 #define HANDLE_CS(cs) \
405 return new IVFBinaryScannerL2<HammingComputer ## cs, store_pairs> (cs);
414 if (code_size % 8 == 0) {
415 return new IVFBinaryScannerL2<HammingComputerM8,
416 store_pairs> (code_size);
417 }
else if (code_size % 4 == 0) {
418 return new IVFBinaryScannerL2<HammingComputerM4,
419 store_pairs> (code_size);
421 return new IVFBinaryScannerL2<HammingComputerDefault,
422 store_pairs> (code_size);
428 void search_knn_hamming_heap(
const IndexBinaryIVF& ivf,
433 const int32_t * coarse_dis,
434 int32_t *distances, idx_t *labels,
436 const IVFSearchParameters *params)
438 long nprobe = params ? params->nprobe : ivf.nprobe;
439 long max_codes = params ? params->max_codes : ivf.max_codes;
444 size_t nlistv = 0, ndis = 0, nheap = 0;
445 using HeapForIP = CMin<int32_t, idx_t>;
446 using HeapForL2 = CMax<int32_t, idx_t>;
448 #pragma omp parallel if(n > 1) reduction(+: nlistv, ndis, nheap)
450 std::unique_ptr<BinaryInvertedListScanner> scanner
451 (ivf.get_InvertedListScanner (store_pairs));
454 for (
size_t i = 0; i < n; i++) {
455 const uint8_t *xi = x + i * ivf.code_size;
456 scanner->set_query(xi);
458 const long * keysi = keys + i * nprobe;
459 int32_t * simi = distances + k * i;
460 long * idxi = labels + k * i;
462 if (metric_type == METRIC_INNER_PRODUCT) {
463 heap_heapify<HeapForIP> (k, simi, idxi);
465 heap_heapify<HeapForL2> (k, simi, idxi);
470 for (
size_t ik = 0; ik < nprobe; ik++) {
471 long key = keysi[ik];
476 FAISS_THROW_IF_NOT_FMT
477 (key < (
long) ivf.nlist,
478 "Invalid key=%ld at ik=%ld nlist=%ld\n",
481 scanner->set_list (key, coarse_dis[i * nprobe + ik]);
485 size_t list_size = ivf.invlists->list_size(key);
486 InvertedLists::ScopedCodes scodes (ivf.invlists, key);
488 ivf.invlists->get_ids (key);
490 nheap += scanner->scan_codes (list_size, scodes.get(),
494 ivf.invlists->release_ids (ids);
498 if (max_codes && nscan >= max_codes)
503 if (metric_type == METRIC_INNER_PRODUCT) {
504 heap_reorder<HeapForIP> (k, simi, idxi);
506 heap_reorder<HeapForL2> (k, simi, idxi);
512 indexIVF_stats.nq += n;
513 indexIVF_stats.nlist += nlistv;
514 indexIVF_stats.ndis += ndis;
515 indexIVF_stats.nheap_updates += nheap;
519 template<
class HammingComputer,
bool store_pairs>
520 void search_knn_hamming_count(
const IndexBinaryIVF& ivf,
527 const IVFSearchParameters *params) {
528 const int nBuckets = ivf.d + 1;
529 std::vector<int> all_counters(nx * nBuckets, 0);
530 std::unique_ptr<long[]> all_ids_per_dis(
new long[nx * nBuckets * k]);
532 long nprobe = params ? params->nprobe : ivf.nprobe;
533 long max_codes = params ? params->max_codes : ivf.max_codes;
535 std::vector<HCounterState<HammingComputer>> cs;
536 for (
size_t i = 0; i < nx; ++i) {
537 cs.push_back(HCounterState<HammingComputer>(
538 all_counters.data() + i * nBuckets,
539 all_ids_per_dis.get() + i * nBuckets * k,
540 x + i * ivf.code_size,
546 size_t nlistv = 0, ndis = 0;
548 #pragma omp parallel for reduction(+: nlistv, ndis)
549 for (
size_t i = 0; i < nx; i++) {
550 const long * keysi = keys + i * nprobe;
551 HCounterState<HammingComputer>& csi = cs[i];
555 for (
size_t ik = 0; ik < nprobe; ik++) {
556 long key = keysi[ik];
561 FAISS_THROW_IF_NOT_FMT (
562 key < (
long) ivf.nlist,
563 "Invalid key=%ld at ik=%ld nlist=%ld\n",
567 size_t list_size = ivf.invlists->list_size(key);
568 InvertedLists::ScopedCodes scodes (ivf.invlists, key);
569 const uint8_t *list_vecs = scodes.get();
572 : ivf.invlists->get_ids(key);
574 for (
size_t j = 0; j < list_size; j++) {
575 const uint8_t * yj = list_vecs + ivf.code_size * j;
577 long id = store_pairs ? (key << 32 | j) : ids[j];
578 csi.update_counter(yj,
id);
581 ivf.invlists->release_ids (ids);
584 if (max_codes && nscan >= max_codes)
590 for (
int b = 0; b < nBuckets && nres < k; b++) {
591 for (
int l = 0; l < csi.counters[b] && nres < k; l++) {
592 labels[i * k + nres] = csi.ids_per_dis[b * k + l];
593 distances[i * k + nres] = b;
598 labels[i * k + nres] = -1;
599 distances[i * k + nres] = std::numeric_limits<int32_t>::max();
604 indexIVF_stats.nq += nx;
605 indexIVF_stats.nlist += nlistv;
606 indexIVF_stats.ndis += ndis;
611 template<
bool store_pairs>
612 void search_knn_hamming_count_1 (
613 const IndexBinaryIVF& ivf,
620 const IVFSearchParameters *params) {
621 switch (ivf.code_size) {
622 #define HANDLE_CS(cs) \
624 search_knn_hamming_count<HammingComputer ## cs, store_pairs>( \
625 ivf, nx, x, keys, k, distances, labels, params); \
635 if (ivf.code_size % 8 == 0) {
636 search_knn_hamming_count<HammingComputerM8, store_pairs>
637 (ivf, nx, x, keys, k, distances, labels, params);
638 }
else if (ivf.code_size % 4 == 0) {
639 search_knn_hamming_count<HammingComputerM4, store_pairs>
640 (ivf, nx, x, keys, k, distances, labels, params);
642 search_knn_hamming_count<HammingComputerDefault, store_pairs>
643 (ivf, nx, x, keys, k, distances, labels, params);
652 BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner
653 (
bool store_pairs)
const
656 return select_IVFBinaryScannerL2<true> (
code_size);
658 return select_IVFBinaryScannerL2<false> (
code_size);
664 const int32_t * coarse_dis,
665 int32_t *distances,
idx_t *labels,
671 search_knn_hamming_heap (*
this, n, x, k, idx, coarse_dis,
672 distances, labels, store_pairs,
676 search_knn_hamming_count_1<true>
677 (*
this, n, x, idx, k, distances, labels, params);
679 search_knn_hamming_count_1<false>
680 (*
this, n, x, idx, k, distances, labels, params);
685 IndexBinaryIVF::~IndexBinaryIVF() {
virtual void search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels) const =0
size_t nprobe
number of probes at query time
void add_core(idx_t n, const uint8_t *x, const long *xids, const long *precomputed_idx)
same as add_with_ids, with precomputed coarse quantizer
int niter
clustering iterations
simple (default) implementation as an array of inverted lists
virtual void reconstruct_from_offset(long list_no, long offset, uint8_t *recons) const
virtual const idx_t * get_ids(size_t list_no) const =0
virtual void reset()=0
Removes all elements from the database.
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
ClusteringParameters cp
to override default clustering params
void search_preassigned(idx_t n, const uint8_t *x, idx_t k, const idx_t *assign, const int32_t *centroid_dis, int32_t *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const
size_t nlist
number of possible key values
virtual size_t list_size(size_t list_no) const =0
get the size of a list
bool verbose
verbosity level
double imbalance_factor(int n, int k, const long *assign)
a balanced assignment has an imbalance factor (IF) of 1
bool is_trained
set if the Index does not require training, or if training is done already
IndexBinary * quantizer
quantizer that maps vectors to inverted lists
int code_size
number of bytes per vector ( = d / 8 )
bool own_fields
whether object owns the quantizer
virtual void merge_from(IndexBinaryIVF &other, idx_t add_id)
void make_direct_map(bool new_maintain_direct_map=true)
Index * clustering_index
to override index used during clustering
void merge_from(InvertedLists *oivf, size_t add_id)
move all entries from oivf (empty on output)
virtual idx_t get_single_id(size_t list_no, size_t offset) const
size_t code_size
code size per vector in bytes
void train_q1(size_t n, const uint8_t *x, bool verbose)
Trains the quantizer and calls train_residual to train sub-quantizers.
virtual const uint8_t * get_single_code(size_t list_no, size_t offset) const
double imbalance_factor() const
1= perfectly balanced, >1: imbalanced
void train(idx_t n, const uint8_t *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
void assign(idx_t n, const uint8_t *x, idx_t *labels, idx_t k=1)
virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t *code)
add one entry to an inverted list
void reset() override
Removes all elements from the database.
long idx_t
all indices are this type
std::vector< float > centroids
centroids (k * d)
virtual void prefetch_lists(const long *list_nos, int nlist) const
void real_to_binary(size_t d, const float *x_in, uint8_t *x_out)
void print_stats() const
display some stats about the inverted lists
void add_with_ids(idx_t n, const uint8_t *x, const long *xids) override
void add(idx_t n, const uint8_t *x) override
Quantizes x and calls add_with_key.
size_t nlist
number of possible key values
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
idx_t ntotal
total nb of indexed vectors
long idx_t
all indices are this type
virtual void add(idx_t n, const uint8_t *x)=0
virtual void search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels) const override
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage.
void binary_to_real(size_t d, const uint8_t *x_in, float *x_out)
void search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels, uint8_t *recons) const override
void reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const override
void reconstruct(idx_t key, uint8_t *recons) const override
MetricType
Some algorithms support both an inner product version and an L2 search version.
InvertedLists * invlists
Access to the actual data.