11 #include "IndexIVFFlat.h"
17 #include "FaissAssert.h"
18 #include "IndexFlat.h"
19 #include "AuxIndexStructures.h"
29 IndexIVFFlat::IndexIVFFlat (Index * quantizer,
31 IndexIVF (quantizer, d, nlist, sizeof(float) * d, metric)
33 code_size =
sizeof(float) * d;
43 const long *precomputed_idx)
49 "cannot have direct map and add with ids");
53 if (precomputed_idx) {
54 idx = precomputed_idx;
56 long * idx0 =
new long [n];
62 for (
size_t i = 0; i < n; i++) {
63 long id = xids ? xids[i] :
ntotal + i;
64 long list_no = idx [i];
68 const float *xi = x + i *
d;
70 list_no,
id, (
const uint8_t*) xi);
73 direct_map.push_back (list_no << 32 | offset);
77 printf(
"IndexIVFFlat::add_core: added %ld / %ld vectors\n",
85 uint8_t * codes)
const
95 template<MetricType metric,
bool store_pairs,
class C>
99 IVFFlatScanner(
size_t d): d(d) {}
102 void set_query (
const float *query)
override {
107 void set_list (idx_t list_no,
float )
override {
108 this->list_no = list_no;
111 float distance_to_code (
const uint8_t *code)
const override {
112 const float *yj = (
float*)code;
113 float dis = metric == METRIC_INNER_PRODUCT ?
114 fvec_inner_product (xi, yj, d) :
fvec_L2sqr (xi, yj, d);
118 size_t scan_codes (
size_t list_size,
119 const uint8_t *codes,
121 float *simi, idx_t *idxi,
122 size_t k)
const override
124 const float *list_vecs = (
const float*)codes;
126 for (
size_t j = 0; j < list_size; j++) {
127 const float * yj = list_vecs + d * j;
128 float dis = metric == METRIC_INNER_PRODUCT ?
129 fvec_inner_product (xi, yj, d) :
fvec_L2sqr (xi, yj, d);
130 if (C::cmp (simi[0], dis)) {
131 heap_pop<C> (k, simi, idxi);
132 long id = store_pairs ? (list_no << 32 | j) : ids[j];
133 heap_push<C> (k, simi, idxi, dis, id);
148 (
bool store_pairs)
const
150 if (metric_type == METRIC_INNER_PRODUCT) {
152 return new IVFFlatScanner<
155 return new IVFFlatScanner<
158 }
else if (metric_type == METRIC_L2) {
160 return new IVFFlatScanner<
163 return new IVFFlatScanner<
182 for (
size_t i = 0; i < nx; i++) {
183 const float * xi = x + i *
d;
184 const long * keysi = keys + i *
nprobe;
189 for (
size_t ik = 0; ik <
nprobe; ik++) {
190 long key = keysi[ik];
191 if (key < 0 || key >= (
long)
nlist) {
192 fprintf (stderr,
"Invalid key=%ld at ik=%ld nlist=%ld\n",
199 const float * list_vecs = (
const float*)scodes.get();
202 for (
size_t j = 0; j < list_size; j++) {
203 const float * yj = list_vecs + d * j;
206 if (disij < radius) {
207 qres.add (disij, ids[j]);
210 float disij = fvec_inner_product(xi, yj, d);
211 if (disij > radius) {
212 qres.add (disij, ids[j]);
228 std::vector<idx_t>
assign (n);
231 for (
size_t i = 0; i < n; i++) {
232 idx_t id = new_ids[i];
233 FAISS_THROW_IF_NOT_MSG (0 <=
id &&
id <
ntotal,
234 "id to update out of range");
236 long dm = direct_map[id];
237 long ofs = dm & 0xffffffff;
242 direct_map[id2] = (il << 32) | ofs;
243 invlists->update_entry (il, ofs, id2,
251 long dm = (il << 32) | l;
269 IndexIVFFlatDedup::IndexIVFFlatDedup (
270 Index * quantizer,
size_t d,
size_t nlist_,
276 static uint64_t hash_bytes (
const uint8_t *bytes,
long n) {
277 const uint8_t *p = bytes;
278 uint64_t x = (uint64_t)(*p) << 7;
281 x = (1000003*x) ^ *p++;
290 std::unordered_map<uint64_t, idx_t> map;
291 float * x2 =
new float [n *
d];
295 for (
long i = 0; i < n; i++) {
296 uint64_t hash = hash_bytes((uint8_t *)(x + i *
d),
code_size);
297 if (map.count(hash) &&
298 !memcmp (x2 + map[hash] * d, x + i * d,
code_size)) {
302 memcpy (x2 + n2 * d, x + i * d,
code_size);
307 printf (
"IndexIVFFlatDedup::train: train on %ld points after dedup "
308 "(was %ld points)\n", n2, n);
316 idx_t na,
const float* x,
const long* xids)
321 FAISS_THROW_IF_NOT_MSG (
323 "IVFFlatDedup not implemented with direct_map");
324 long * idx =
new long [na];
328 long n_add = 0, n_dup = 0;
330 for (
size_t i = 0; i < na; i++) {
331 idx_t
id = xids ? xids[i] :
ntotal + i;
332 long list_no = idx [i];
337 const float *xi = x + i *
d;
344 for (
long o = 0; o < n; o++) {
345 if (!memcmp (codes.get() + o *
code_size,
357 std::pair<idx_t, idx_t> pair (id2,
id);
364 printf(
"IndexIVFFlat::add_with_ids: added %ld / %ld vectors"
365 " (out of which %ld are duplicates)\n",
372 idx_t n,
const float *x, idx_t k,
374 const float *centroid_dis,
375 float *distances, idx_t *labels,
379 FAISS_THROW_IF_NOT_MSG (
380 !store_pairs,
"store_pairs not supported in IVFDedup");
383 distances, labels,
false,
386 std::vector <idx_t> labels2 (k);
387 std::vector <float> dis2 (k);
389 for (
long i = 0; i < n; i++) {
390 idx_t *labels1 = labels + i * k;
391 float *dis1 = distances + i * k;
404 auto range =
instances.equal_range (labels1[rp]);
405 float dis = dis1[rp];
406 labels2[j] = labels1[rp];
409 for (
auto it = range.first; j < k && it != range.second; ++it) {
410 labels2[j] = it->second;
416 memcpy (labels1 + j0, labels2.data() + j0,
417 sizeof(labels1[0]) * (k - j0));
418 memcpy (dis1 + j0, dis2.data() + j0,
419 sizeof(dis2[0]) * (k - j0));
428 std::unordered_map<idx_t, idx_t> replace;
429 std::vector<std::pair<idx_t, idx_t> > toadd;
431 if (sel.is_member(it->first)) {
433 if (!sel.is_member(it->second)) {
435 if (replace.count(it->first) == 0) {
436 replace[it->first] = it->second;
438 std::pair<idx_t, idx_t> new_entry (
439 replace[it->first], it->second);
440 toadd.push_back(new_entry);
445 if (sel.is_member(it->second)) {
453 instances.insert (toadd.begin(), toadd.end());
458 "direct map remove not implemented");
460 std::vector<long> toremove(
nlist);
462 #pragma omp parallel for
463 for (
long i = 0; i <
nlist; i++) {
467 if (sel.is_member (idsi[j])) {
468 if (replace.count(idsi[j]) == 0) {
485 toremove[i] = l0 - l;
489 for (
long i = 0; i <
nlist; i++) {
490 if (toremove[i] > 0) {
491 nremove += toremove[i];
507 FAISS_THROW_MSG (
"not implemented");
512 FAISS_THROW_MSG (
"not implemented");
520 FAISS_THROW_MSG (
"not implemented");
void train(idx_t n, const float *x) override
also dedups the training set
virtual void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const
result structure for a single query
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
not implemented
void encode_vectors(idx_t n, const float *x, const idx_t *list_nos, uint8_t *codes) const override
size_t nprobe
number of probes at query time
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
virtual size_t list_size(size_t list_no) const =0
get the size of a list
void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
virtual idx_t get_single_id(size_t list_no, size_t offset) const
void add_with_ids(idx_t n, const float *x, const long *xids) override
implemented for all IndexIVF* classes
void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
virtual const uint8_t * get_single_code(size_t list_no, size_t offset) const
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const override
void add_with_ids(idx_t n, const float *x, const long *xids) override
implemented for all IndexIVF* classes
virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t *code)
add one entry to an inverted list
long idx_t
all indices are this type
idx_t ntotal
total nb of indexed vectors
bool verbose
verbosity level
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
QueryResult & new_result(idx_t qno)
begin a new result
virtual void update_vectors(int nv, idx_t *idx, const float *v)
void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
not implemented
void update_vectors(int nv, idx_t *idx, const float *v) override
not implemented
the entries in the buffers are split per query
MetricType metric_type
type of metric this index uses for search
InvertedLists * invlists
Access to the actual data.
Index * quantizer
quantizer that maps vectors to inverted lists
bool is_trained
set if the Index does not require training, or if training is done already
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
size_t nlist
number of possible key values
size_t code_size
code size per vector in bytes
MetricType
Some algorithms support both an inner product version and a L2 search version.
InvertedListScanner * get_InvertedListScanner(bool store_pairs) const override
get a scanner for this index (store_pairs means ignore labels)
virtual void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx)
same as add_with_ids, with precomputed coarse quantizer
std::unordered_multimap< idx_t, idx_t > instances