#include "IndexBinaryIVF.h"

#include <cstdio>
#include <memory>

#include "hamming.h"
#include "utils.h"

#include "AuxIndexStructures.h"
#include "FaissAssert.h"
#include "IndexFlat.h"
IndexBinaryIVF::IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist)
    : /* ... */
      maintain_direct_map(false),
      /* ... */
      clustering_index(nullptr)
{
  FAISS_THROW_IF_NOT (d == quantizer->d);
  // ...
}
IndexBinaryIVF::IndexBinaryIVF()
    : /* ... */
      maintain_direct_map(false),
      /* ... */
      clustering_index(nullptr)
{}
void IndexBinaryIVF::add_core(idx_t n, const uint8_t *x, const long *xids,
                              const long *precomputed_idx) {
  // ...
  FAISS_THROW_IF_NOT_MSG(!(maintain_direct_map && xids),
                         "cannot have direct map and add with ids");
  // ...
  const long *idx;
  std::unique_ptr<long[]> scoped_idx;

  if (precomputed_idx) {
    idx = precomputed_idx;
  } else {
    scoped_idx.reset(new long[n]);
    // ... assign the vectors to inverted lists with the coarse quantizer ...
    idx = scoped_idx.get();
  }

  long n_add = 0;
  for (size_t i = 0; i < n; i++) {
    long id = xids ? xids[i] : ntotal + i;
    long list_no = idx[i];
    // ... append (id, code) to inverted list list_no at some offset, then ...
    direct_map.push_back(list_no << 32 | offset);
    n_add++;
  }

  // ...
  printf("IndexBinaryIVF::add_with_ids: added %ld / %ld vectors\n",
         n_add, n);
  // ...
}
void IndexBinaryIVF::make_direct_map(bool new_maintain_direct_map) {
  // ...
  if (new_maintain_direct_map) {
    direct_map.resize(ntotal, -1);

    for (size_t key = 0; key < nlist; key++) {
      // ... fetch the size and ids of inverted list `key` ...
      for (long ofs = 0; ofs < list_size; ofs++) {
        FAISS_THROW_IF_NOT_MSG(0 <= idlist[ofs] && idlist[ofs] < ntotal,
                               "direct map supported only for sequential ids");
        direct_map[idlist[ofs]] = key << 32 | ofs;
      }
    }
  }
  // ...
}
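// Illustrative note (not part of the original file): direct_map packs, for each
// sequential id, the inverted-list number into the upper 32 bits and the offset
// within that list into the lower 32 bits (the same packing used in add_core
// above). Decoding an entry is therefore:
//
//   long entry   = direct_map[id];
//   long list_no = entry >> 32;          // which inverted list
//   long offset  = entry & 0xffffffff;   // position inside that list
//
// which is what reconstruct() below does, and it is also why the direct map
// only supports sequential ids: the id is used directly as an index into the
// vector.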
void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k,
                            int32_t *distances, idx_t *labels) const {
  std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
  std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);

  double t0 = getmillisecs();
  // ... coarse quantization: find the nprobe closest lists for each query,
  //     filling idx and coarse_dis ...
  indexIVF_stats.quantization_time += getmillisecs() - t0;

  // ...
  search_preassigned(n, x, k, idx.get(), coarse_dis.get(),
                     distances, labels, false);
  // ...
}
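// Illustrative note (not part of the original file): search() runs in two
// stages. First the coarse quantizer selects the nprobe closest inverted lists
// per query (filling idx and coarse_dis), then search_preassigned() scans those
// lists. A caller that already has the coarse assignments could, under that
// assumption, invoke the second stage directly, e.g.:
//
//   index.search_preassigned(n, x, k, idx, coarse_dis,
//                            distances, labels, /*store_pairs=*/false);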
void IndexBinaryIVF::reconstruct(idx_t key, uint8_t *recons) const {
  FAISS_THROW_IF_NOT_MSG(direct_map.size() == ntotal,
                         "direct map is not initialized");
  long list_no = direct_map[key] >> 32;
  long offset = direct_map[key] & 0xffffffff;
  reconstruct_from_offset(list_no, offset, recons);
}
void IndexBinaryIVF::reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const {
  FAISS_THROW_IF_NOT(ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));

  for (long list_no = 0; list_no < nlist; list_no++) {
    // ... fetch the size and ids of inverted list `list_no` ...
    for (long offset = 0; offset < list_size; offset++) {
      long id = idlist[offset];
      if (!(id >= i0 && id < i0 + ni)) {
        continue;
      }
      uint8_t *reconstructed = recons + (id - i0) * d;
      reconstruct_from_offset(list_no, offset, reconstructed);
    }
  }
}
void IndexBinaryIVF::search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k,
                                            int32_t *distances, idx_t *labels,
                                            uint8_t *recons) const {
  std::unique_ptr<idx_t[]> idx(new long[n * nprobe]);
  std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);

  // ... coarse quantization, then search with store_pairs enabled so that the
  //     labels temporarily hold packed (list_no, offset) pairs ...
  search_preassigned(n, x, k, idx.get(), coarse_dis.get(),
                     distances, labels, true);

  for (idx_t i = 0; i < n; ++i) {
    for (idx_t j = 0; j < k; ++j) {
      idx_t ij = i * k + j;
      idx_t key = labels[ij];
      uint8_t *reconstructed = recons + ij * d;
      if (key < 0) {
        // not enough results for this query: mark the slot as invalid
        memset(reconstructed, -1, sizeof(*reconstructed) * d);
      } else {
        int list_no = key >> 32;
        int offset = key & 0xffffffff;
        // ... replace the packed key with the stored id and reconstruct the
        //     code from (list_no, offset) ...
      }
    }
  }
}
void IndexBinaryIVF::reconstruct_from_offset(long list_no, long offset,
                                             uint8_t *recons) const {
  // ... copy the stored code at (list_no, offset) into recons ...
}
219 "direct map remove not implemented");
221 std::vector<long> toremove(
nlist);
223 #pragma omp parallel for
224 for (
long i = 0; i <
nlist; i++) {
228 if (sel.is_member(idsi[j])) {
238 toremove[i] = l0 - l;
242 for (
long i = 0; i <
nlist; i++) {
243 if (toremove[i] > 0) {
244 nremove += toremove[i];
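// Illustrative note (not part of the original file): removal works in two
// passes. The parallel pass compacts each inverted list in place (entries that
// match the selector are overwritten with entries taken from the end of the
// list) and records in toremove[i] how many entries list i lost. The serial
// pass then shrinks each list and accumulates the total, which is returned to
// the caller; ntotal is reduced accordingly.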
void IndexBinaryIVF::train(idx_t n, const uint8_t *x) {
  if (verbose) {
    printf("Training quantizer\n");
  }

  if (/* ... the quantizer already holds nlist trained centroids ... */) {
    if (verbose) {
      printf("IVF quantizer does not need training.\n");
    }
  } else {
    if (verbose) {
      printf("Training quantizer on %ld vectors in %dD\n", n, d);
    }
    std::unique_ptr<float[]> x_f(new float[n * d]);
    // ... binary_to_real: expand the packed bits of x into floats in x_f ...

    if (clustering_index && verbose) {
      printf("using clustering_index of dimension %d to do the clustering\n",
             clustering_index->d);
    }
    // ... k-means clustering (Clustering clus) of x_f into nlist centroids ...

    std::unique_ptr<uint8_t[]> x_b(new uint8_t[clus.k * code_size]);
    // ... real_to_binary: threshold the float centroids into x_b and add them
    //     to the coarse quantizer ...
  }
  // ...
}
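// Illustrative note (not part of the original file): training goes through a
// real-valued detour. The packed binary training vectors are expanded to
// floats (binary_to_real), k-means produces nlist float centroids (optionally
// on a caller-provided clustering_index), and the centroids are thresholded
// back to binary codes (real_to_binary) that are added to the coarse
// quantizer. Roughly:
//
//   std::unique_ptr<float[]> x_f(new float[n * d]);
//   binary_to_real(n * d, x, x_f.get());                 // bits -> floats
//   Clustering clus(d, nlist, cp);
//   // ... clus.train(n, x_f.get(), <assignment index>) ...
//   std::unique_ptr<uint8_t[]> x_b(new uint8_t[clus.k * code_size]);
//   real_to_binary(d * clus.k, clus.centroids.data(), x_b.get());
//   quantizer->add(clus.k, x_b.get());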
void IndexBinaryIVF::merge_from(IndexBinaryIVF &other, idx_t add_id) {
  // minimal sanity checks
  FAISS_THROW_IF_NOT(other.d == d);
  // ...
  FAISS_THROW_IF_NOT_MSG(!maintain_direct_map && !other.maintain_direct_map,
                         "direct map copy not implemented");
  FAISS_THROW_IF_NOT_MSG(typeid (*this) == typeid (other),
                         "can only merge indexes of the same type");

  // ... move the entries of other.invlists into this index's invlists
  //     (ids shifted by add_id) and update ntotal ...
}
void IndexBinaryIVF::replace_invlists(InvertedLists *il, bool own) {
  // ...
}
template<class HammingComputer, bool store_pairs>
struct IVFBinaryScannerL2: BinaryInvertedListScanner {

    HammingComputer hc;
    size_t code_size;
    idx_t list_no;

    IVFBinaryScannerL2 (size_t code_size): code_size (code_size)
    {}

    void set_query (const uint8_t *query_vector) override {
        hc.set (query_vector, code_size);
    }

    void set_list (idx_t list_no, uint8_t /* coarse_dis */) override {
        this->list_no = list_no;
    }

    uint32_t distance_to_code (const uint8_t *code) const override {
        return hc.hamming (code);
    }

    size_t scan_codes (size_t n,
                       const uint8_t *codes,
                       const idx_t *ids,
                       int32_t *simi, idx_t *idxi,
                       size_t k) const override
    {
        using C = CMax<int32_t, idx_t>;
        size_t nup = 0;
        for (size_t j = 0; j < n; j++) {
            uint32_t dis = hc.hamming (codes);
            if (dis < simi[0]) {
                heap_pop<C> (k, simi, idxi);
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                heap_push<C> (k, simi, idxi, dis, id);
                nup++;
            }
            codes += code_size;
        }
        return nup;
    }
};
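// Illustrative note (not part of the original file): the scanner keeps the k
// best results in a max-heap (CMax) keyed on Hamming distance, so simi[0] is
// always the current worst of the k candidates and a code is pushed only when
// it beats that entry. With store_pairs enabled the reported "id" is the packed
// (list_no << 32 | offset) pair instead of the stored id, which is what
// search_and_reconstruct() above later decodes.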
template <bool store_pairs>
BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) {

    switch (code_size) {
#define HANDLE_CS(cs)                                                        \
    case cs:                                                                 \
        return new IVFBinaryScannerL2<HammingComputer ## cs, store_pairs> (cs);
    // ... HANDLE_CS(...) for the commonly used code sizes ...
#undef HANDLE_CS
    default:
        if (code_size % 8 == 0) {
            return new IVFBinaryScannerL2<HammingComputerM8,
                                          store_pairs> (code_size);
        } else if (code_size % 4 == 0) {
            return new IVFBinaryScannerL2<HammingComputerM4,
                                          store_pairs> (code_size);
        } else {
            return new IVFBinaryScannerL2<HammingComputerDefault,
                                          store_pairs> (code_size);
        }
    }
}
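// Illustrative note (not part of the original file): the scanner is specialized
// at compile time on the code size so the Hamming-distance loop can be unrolled
// for that size. The elided HANDLE_CS invocations cover the commonly used sizes
// (e.g. a 256-bit index has code_size 32 and would get HammingComputer32),
// while other sizes fall back to the generic computers that work 8 bytes,
// 4 bytes or 1 byte at a time.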
void search_knn_hamming_heap(const IndexBinaryIVF& ivf,
                             size_t n,
                             const uint8_t *x,
                             idx_t k,
                             const idx_t *keys,
                             const int32_t * coarse_dis,
                             int32_t *distances, idx_t *labels,
                             bool store_pairs,
                             const IVFSearchParameters *params)
{
    long nprobe = params ? params->nprobe : ivf.nprobe;
    long max_codes = params ? params->max_codes : ivf.max_codes;
    MetricType metric_type = ivf.metric_type;

    size_t nlistv = 0, ndis = 0, nheap = 0;
    using HeapForIP = CMin<int32_t, idx_t>;
    using HeapForL2 = CMax<int32_t, idx_t>;

#pragma omp parallel if(n > 1) reduction(+: nlistv, ndis, nheap)
    {
        std::unique_ptr<BinaryInvertedListScanner> scanner
            (ivf.get_InvertedListScanner (store_pairs));

#pragma omp for
        for (size_t i = 0; i < n; i++) {
            const uint8_t *xi = x + i * ivf.code_size;
            scanner->set_query(xi);

            const long * keysi = keys + i * nprobe;
            int32_t * simi = distances + k * i;
            long * idxi = labels + k * i;

            if (metric_type == METRIC_INNER_PRODUCT) {
                heap_heapify<HeapForIP> (k, simi, idxi);
            } else {
                heap_heapify<HeapForL2> (k, simi, idxi);
            }

            size_t nscan = 0;

            for (size_t ik = 0; ik < nprobe; ik++) {
                long key = keysi[ik];  // select the inverted list
                // ... skip negative keys (not enough centroids for multiprobe) ...
                FAISS_THROW_IF_NOT_FMT
                    (key < (long) ivf.nlist,
                     "Invalid key=%ld at ik=%ld nlist=%ld\n",
                     key, ik, ivf.nlist);

                scanner->set_list (key, coarse_dis[i * nprobe + ik]);
                nlistv++;

                size_t list_size = ivf.invlists->list_size(key);
                InvertedLists::ScopedCodes scodes (ivf.invlists, key);
                std::unique_ptr<InvertedLists::ScopedIds> sids;
                const Index::idx_t *ids = nullptr;

                if (!store_pairs) {
                    sids.reset (new InvertedLists::ScopedIds (ivf.invlists, key));
                    ids = sids->get();
                }

                nheap += scanner->scan_codes (list_size, scodes.get(),
                                              ids, simi, idxi, k);

                nscan += list_size;
                if (max_codes && nscan >= max_codes)
                    break;
            }

            ndis += nscan;
            if (metric_type == METRIC_INNER_PRODUCT) {
                heap_reorder<HeapForIP> (k, simi, idxi);
            } else {
                heap_reorder<HeapForL2> (k, simi, idxi);
            }
        }
    }

    indexIVF_stats.nq += n;
    indexIVF_stats.nlist += nlistv;
    indexIVF_stats.ndis += ndis;
    indexIVF_stats.nheap_updates += nheap;
}
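// Illustrative note (not part of the original file): each query gets its own
// heap over its k output slots. For inner-product search it is a min-heap
// (CMin, larger scores are better); for Hamming/L2 it is a max-heap (CMax,
// smaller distances are better). heap_reorder() at the end sorts the heap
// contents into best-first order, and max_codes bounds how many stored codes a
// single query may visit across its nprobe lists.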
template<class HammingComputer, bool store_pairs>
void search_knn_hamming_count(const IndexBinaryIVF& ivf,
                              size_t nx,
                              const uint8_t *x,
                              const idx_t *keys,
                              int k,
                              int32_t *distances,
                              idx_t *labels,
                              const IVFSearchParameters *params) {
    const int nBuckets = ivf.d + 1;
    std::vector<int> all_counters(nx * nBuckets, 0);
    std::unique_ptr<long[]> all_ids_per_dis(new long[nx * nBuckets * k]);

    long nprobe = params ? params->nprobe : ivf.nprobe;
    long max_codes = params ? params->max_codes : ivf.max_codes;

    std::vector<HCounterState<HammingComputer>> cs;
    for (size_t i = 0; i < nx; ++i) {
        cs.push_back(HCounterState<HammingComputer>(
            all_counters.data() + i * nBuckets,
            all_ids_per_dis.get() + i * nBuckets * k,
            x + i * ivf.code_size,
            ivf.d, k));
    }

    size_t nlistv = 0, ndis = 0;

#pragma omp parallel for reduction(+: nlistv, ndis)
    for (size_t i = 0; i < nx; i++) {
        const long * keysi = keys + i * nprobe;
        HCounterState<HammingComputer>& csi = cs[i];

        size_t nscan = 0;

        for (size_t ik = 0; ik < nprobe; ik++) {
            long key = keysi[ik];  // select the inverted list
            // ... skip negative keys (not enough centroids for multiprobe) ...
            FAISS_THROW_IF_NOT_FMT (
                key < (long) ivf.nlist,
                "Invalid key=%ld at ik=%ld nlist=%ld\n",
                key, ik, ivf.nlist);

            nlistv++;
            size_t list_size = ivf.invlists->list_size(key);
            InvertedLists::ScopedCodes scodes (ivf.invlists, key);
            const uint8_t *list_vecs = scodes.get();
            const Index::idx_t *ids = store_pairs
                ? nullptr
                : ivf.invlists->get_ids(key);

            for (size_t j = 0; j < list_size; j++) {
                const uint8_t * yj = list_vecs + ivf.code_size * j;
                long id = store_pairs ? (key << 32 | j) : ids[j];
                csi.update_counter(yj, id);
            }
            if (ids) {
                ivf.invlists->release_ids (key, ids);
            }

            nscan += list_size;
            if (max_codes && nscan >= max_codes)
                break;
        }
        ndis += nscan;

        int nres = 0;
        for (int b = 0; b < nBuckets && nres < k; b++) {
            for (int l = 0; l < csi.counters[b] && nres < k; l++) {
                labels[i * k + nres] = csi.ids_per_dis[b * k + l];
                distances[i * k + nres] = b;
                nres++;
            }
        }
        while (nres < k) {
            labels[i * k + nres] = -1;
            distances[i * k + nres] = std::numeric_limits<int32_t>::max();
            ++nres;
        }
    }

    indexIVF_stats.nq += nx;
    indexIVF_stats.nlist += nlistv;
    indexIVF_stats.ndis += ndis;
}
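// Illustrative note (not part of the original file): this variant avoids a heap
// by exploiting the fact that a Hamming distance between d-bit codes can only
// take d + 1 values (0..d). Each query keeps one bucket per possible distance
// (e.g. 257 buckets for 256-bit codes), each holding up to k ids; after the
// probed lists are scanned, the first k ids encountered while walking the
// buckets in increasing distance order are the k nearest neighbors. Queries
// that find fewer than k candidates are padded with label -1 and the maximum
// int32 distance.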
template<bool store_pairs>
void search_knn_hamming_count_1 (
      const IndexBinaryIVF& ivf,
      size_t nx,
      const uint8_t *x,
      const idx_t *keys,
      int k,
      int32_t *distances,
      idx_t *labels,
      const IVFSearchParameters *params) {
    switch (ivf.code_size) {
#define HANDLE_CS(cs)                                                   \
    case cs:                                                            \
        search_knn_hamming_count<HammingComputer ## cs, store_pairs>(   \
            ivf, nx, x, keys, k, distances, labels, params);            \
        break;
    // ... HANDLE_CS(...) for the commonly used code sizes ...
#undef HANDLE_CS
    default:
        if (ivf.code_size % 8 == 0) {
            search_knn_hamming_count<HammingComputerM8, store_pairs>
                (ivf, nx, x, keys, k, distances, labels, params);
        } else if (ivf.code_size % 4 == 0) {
            search_knn_hamming_count<HammingComputerM4, store_pairs>
                (ivf, nx, x, keys, k, distances, labels, params);
        } else {
            search_knn_hamming_count<HammingComputerDefault, store_pairs>
                (ivf, nx, x, keys, k, distances, labels, params);
        }
        break;
    }
}
BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner
      (bool store_pairs) const {
  if (store_pairs) {
    return select_IVFBinaryScannerL2<true> (code_size);
  } else {
    return select_IVFBinaryScannerL2<false> (code_size);
  }
}

void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
                                        const idx_t *idx,
                                        const int32_t * coarse_dis,
                                        int32_t *distances, idx_t *labels,
                                        bool store_pairs,
                                        const IVFSearchParameters *params) const {
  if (use_heap) {
    // heap-based scan (the default strategy)
    search_knn_hamming_heap (*this, n, x, k, idx, coarse_dis,
                             distances, labels, store_pairs,
                             params);
  } else if (store_pairs) {
    search_knn_hamming_count_1<true>
        (*this, n, x, idx, k, distances, labels, params);
  } else {
    search_knn_hamming_count_1<false>
        (*this, n, x, idx, k, distances, labels, params);
  }
}
IndexBinaryIVF::~IndexBinaryIVF() {
  // ... delete the inverted lists and the quantizer if this index owns them ...
}
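// Illustrative usage sketch (not part of the original file), assuming
// IndexBinaryFlat from IndexBinaryFlat.h as the coarse quantizer and
// caller-provided arrays of packed codes (nt/nb/nq vectors) plus preallocated
// result buffers:
//
//   int d = 256;                                  // dimension in bits
//   faiss::IndexBinaryFlat quantizer(d);          // coarse quantizer
//   faiss::IndexBinaryIVF index(&quantizer, d, /*nlist=*/64);
//
//   index.train(nt, training_codes);              // k-means on binarized data
//   index.add(nb, database_codes);                // fill the inverted lists
//
//   index.nprobe = 4;                             // lists visited per query
//   index.search(nq, query_codes, k, distances, labels);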
// Declarations referenced above (brief descriptions from the corresponding
// headers):
//
//   virtual void search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels) const = 0
//   size_t nprobe -- number of probes at query time
//   void add_core(idx_t n, const uint8_t *x, const long *xids, const long *precomputed_idx) -- same as add_with_ids, with precomputed coarse quantizer
//   int niter -- clustering iterations
//   ArrayInvertedLists -- simple (default) implementation as an array of inverted lists
//   virtual void reconstruct_from_offset(long list_no, long offset, uint8_t *recons) const
//   virtual const idx_t *get_ids(size_t list_no) const = 0
//   virtual void reset() = 0 -- removes all elements from the database
//   bool maintain_direct_map -- map for direct access to the elements; enables reconstruct()
//   ClusteringParameters cp -- to override default clustering params
//   void search_preassigned(idx_t n, const uint8_t *x, idx_t k, const idx_t *assign, const int32_t *centroid_dis, int32_t *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params = nullptr) const
//   size_t nlist -- number of possible key values
//   virtual size_t list_size(size_t list_no) const = 0 -- get the size of a list
//   bool verbose -- verbosity level
//   bool is_trained -- set if the Index does not require training, or if training is done already
//   IndexBinary *quantizer -- quantizer that maps vectors to inverted lists
//   int code_size -- number of bytes per vector (= d / 8)
//   bool own_fields -- whether object owns the quantizer
//   virtual void merge_from(IndexBinaryIVF &other, idx_t add_id)
//   Index::idx_t idx_t -- all indices are this type
//   void make_direct_map(bool new_maintain_direct_map = true)
//   Index *clustering_index -- to override index used during clustering
//   void merge_from(InvertedLists *oivf, size_t add_id) -- move all entries from oivf (empty on output)
//   virtual idx_t get_single_id(size_t list_no, size_t offset) const
//   long idx_t -- all indices are this type
//   size_t code_size -- code size per vector in bytes
//   virtual const uint8_t *get_single_code(size_t list_no, size_t offset) const
//   void train(idx_t n, const uint8_t *x) override -- trains the quantizer
//   void assign(idx_t n, const uint8_t *x, idx_t *labels, idx_t k = 1)
//   virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t *code) -- add one entry to an inverted list
//   void reset() override -- removes all elements from the database
//   double getmillisecs() -- ms elapsed since some arbitrary epoch
//   std::vector<float> centroids -- centroids (k * d)
//   void real_to_binary(size_t d, const float *x_in, uint8_t *x_out)
//   void add_with_ids(idx_t n, const uint8_t *x, const long *xids) override
//   void add(idx_t n, const uint8_t *x) override
//   long remove_ids(const IDSelector &sel) override -- dataset manipulation functions
//   idx_t ntotal -- total number of indexed vectors
//   virtual void add(idx_t n, const uint8_t *x) = 0
//   virtual void search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels) const override
//   virtual void train(idx_t n, const float *x, faiss::Index &index) -- the Index is used during the assignment stage
//   void binary_to_real(size_t d, const uint8_t *x_in, float *x_out)
//   virtual void prefetch_lists(const idx_t *list_nos, int nlist) const
//   void search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels, uint8_t *recons) const override
//   void reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const override
//   void reconstruct(idx_t key, uint8_t *recons) const override
//   MetricType -- some algorithms support both an inner product version and an L2 search version
//   InvertedLists *invlists -- access to the actual data