Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/IndexIVFFlat.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 /* Copyright 2004-present Facebook. All Rights Reserved.
10  Inverted list structure.
11 */
12 
13 #include "IndexIVFFlat.h"
14 
15 #include <cstdio>
16 
17 #include "utils.h"
18 
19 #include "FaissAssert.h"
20 #include "IndexFlat.h"
21 #include "AuxIndexStructures.h"
22 
23 namespace faiss {
24 
25 
26 /*****************************************
27  * IndexIVFFlat implementation
28  ******************************************/
29 
30 IndexIVFFlat::IndexIVFFlat (Index * quantizer,
31  size_t d, size_t nlist, MetricType metric):
32  IndexIVF (quantizer, d, nlist, sizeof(float) * d, metric)
33 {
34  code_size = sizeof(float) * d;
35 }
36 
37 
38 
39 
40 void IndexIVFFlat::add_with_ids (idx_t n, const float * x, const long *xids)
41 {
42  add_core (n, x, xids, nullptr);
43 }
44 
45 void IndexIVFFlat::add_core (idx_t n, const float * x, const long *xids,
46  const long *precomputed_idx)
47 
48 {
49  FAISS_THROW_IF_NOT (is_trained);
50  assert (invlists);
51  FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids),
52  "cannot have direct map and add with ids");
53  const long * idx;
55 
56  if (precomputed_idx) {
57  idx = precomputed_idx;
58  } else {
59  long * idx0 = new long [n];
60  del.set (idx0);
61  quantizer->assign (n, x, idx0);
62  idx = idx0;
63  }
64  long n_add = 0;
65  for (size_t i = 0; i < n; i++) {
66  long id = xids ? xids[i] : ntotal + i;
67  long list_no = idx [i];
68 
69  if (list_no < 0)
70  continue;
71  const float *xi = x + i * d;
72  size_t offset = invlists->add_entry (
73  list_no, id, (const uint8_t*) xi);
74 
76  direct_map.push_back (list_no << 32 | offset);
77  n_add++;
78  }
79  if (verbose) {
80  printf("IndexIVFFlat::add_core: added %ld / %ld vectors\n",
81  n_add, n);
82  }
83  ntotal += n_add;
84 }
85 
86 
87 namespace {
88 
89 void search_knn_inner_product (const IndexIVFFlat & ivf,
90  size_t nx,
91  const float * x,
92  const long * keys,
94  bool store_pairs)
95 {
96 
97  const size_t k = res->k;
98  size_t nlistv = 0, ndis = 0;
99  size_t d = ivf.d;
100 
101 #pragma omp parallel for reduction(+: nlistv, ndis)
102  for (size_t i = 0; i < nx; i++) {
103  const float * xi = x + i * d;
104  const long * keysi = keys + i * ivf.nprobe;
105  float * __restrict simi = res->get_val (i);
106  long * __restrict idxi = res->get_ids (i);
107  minheap_heapify (k, simi, idxi);
108  size_t nscan = 0;
109 
110  for (size_t ik = 0; ik < ivf.nprobe; ik++) {
111  long key = keysi[ik]; /* select the list */
112  if (key < 0) {
113  // not enough centroids for multiprobe
114  continue;
115  }
116  FAISS_THROW_IF_NOT_FMT (
117  key < (long) ivf.nlist,
118  "Invalid key=%ld at ik=%ld nlist=%ld\n",
119  key, ik, ivf.nlist);
120 
121  nlistv++;
122  size_t list_size = ivf.invlists->list_size(key);
123  const float * list_vecs =
124  (const float*)ivf.invlists->get_codes (key);
125  const Index::idx_t * ids = store_pairs ? nullptr :
126  ivf.invlists->get_ids (key);
127 
128  for (size_t j = 0; j < list_size; j++) {
129  const float * yj = list_vecs + d * j;
130  float ip = fvec_inner_product (xi, yj, d);
131  if (ip > simi[0]) {
132  minheap_pop (k, simi, idxi);
133  long id = store_pairs ? (key << 32 | j) : ids[j];
134  minheap_push (k, simi, idxi, ip, id);
135  }
136  }
137  nscan += list_size;
138  if (ivf.max_codes && nscan >= ivf.max_codes)
139  break;
140  }
141  ndis += nscan;
142  minheap_reorder (k, simi, idxi);
143  }
144  indexIVF_stats.nq += nx;
145  indexIVF_stats.nlist += nlistv;
146  indexIVF_stats.ndis += ndis;
147 }
148 
149 
150 void search_knn_L2sqr (const IndexIVFFlat &ivf,
151  size_t nx,
152  const float * x,
153  const long * keys,
154  float_maxheap_array_t * res,
155  bool store_pairs)
156 {
157  const size_t k = res->k;
158  size_t nlistv = 0, ndis = 0;
159  size_t d = ivf.d;
160 #pragma omp parallel for reduction(+: nlistv, ndis)
161  for (size_t i = 0; i < nx; i++) {
162  const float * xi = x + i * d;
163  const long * keysi = keys + i * ivf.nprobe;
164  float * __restrict disi = res->get_val (i);
165  long * __restrict idxi = res->get_ids (i);
166  maxheap_heapify (k, disi, idxi);
167 
168  size_t nscan = 0;
169 
170  for (size_t ik = 0; ik < ivf.nprobe; ik++) {
171  long key = keysi[ik]; /* select the list */
172  if (key < 0) {
173  // not enough centroids for multiprobe
174  continue;
175  }
176  FAISS_THROW_IF_NOT_FMT (
177  key < (long) ivf.nlist,
178  "Invalid key=%ld at ik=%ld nlist=%ld\n",
179  key, ik, ivf.nlist);
180 
181  nlistv++;
182  size_t list_size = ivf.invlists->list_size(key);
183  const float * list_vecs =
184  (const float*)ivf.invlists->get_codes (key);
185  const Index::idx_t * ids = store_pairs ? nullptr :
186  ivf.invlists->get_ids (key);
187 
188  for (size_t j = 0; j < list_size; j++) {
189  const float * yj = list_vecs + d * j;
190  float disij = fvec_L2sqr (xi, yj, d);
191  if (disij < disi[0]) {
192  maxheap_pop (k, disi, idxi);
193  long id = store_pairs ? (key << 32 | j) : ids[j];
194  maxheap_push (k, disi, idxi, disij, id);
195  }
196  }
197  nscan += list_size;
198  if (ivf.max_codes && nscan >= ivf.max_codes)
199  break;
200  }
201  ndis += nscan;
202  maxheap_reorder (k, disi, idxi);
203  }
204  indexIVF_stats.nq += nx;
205  indexIVF_stats.nlist += nlistv;
206  indexIVF_stats.ndis += ndis;
207 }
208 
209 
210 } // anonymous namespace
211 
212 void IndexIVFFlat::search_preassigned (idx_t n, const float *x, idx_t k,
213  const idx_t *idx,
214  const float * /* coarse_dis */,
215  float *distances, idx_t *labels,
216  bool store_pairs) const
217 {
218  if (metric_type == METRIC_INNER_PRODUCT) {
219  float_minheap_array_t res = {
220  size_t(n), size_t(k), labels, distances};
221  search_knn_inner_product (*this, n, x, idx, &res, store_pairs);
222 
223  } else if (metric_type == METRIC_L2) {
224  float_maxheap_array_t res = {
225  size_t(n), size_t(k), labels, distances};
226  search_knn_L2sqr (*this, n, x, idx, &res, store_pairs);
227  }
228 }
229 
230 
231 void IndexIVFFlat::range_search (idx_t nx, const float *x, float radius,
232  RangeSearchResult *result) const
233 {
234  idx_t * keys = new idx_t [nx * nprobe];
235  ScopeDeleter<idx_t> del (keys);
236  quantizer->assign (nx, x, keys, nprobe);
237 
238 #pragma omp parallel
239  {
240  RangeSearchPartialResult pres(result);
241 
242  for (size_t i = 0; i < nx; i++) {
243  const float * xi = x + i * d;
244  const long * keysi = keys + i * nprobe;
245 
247  pres.new_result (i);
248 
249  for (size_t ik = 0; ik < nprobe; ik++) {
250  long key = keysi[ik]; /* select the list */
251  if (key < 0 || key >= (long) nlist) {
252  fprintf (stderr, "Invalid key=%ld at ik=%ld nlist=%ld\n",
253  key, ik, nlist);
254  throw;
255  }
256 
257  const size_t list_size = invlists->list_size(key);
258  const float * list_vecs =
259  (const float*)invlists->get_codes (key);
260  const Index::idx_t * ids = invlists->get_ids (key);
261 
262  for (size_t j = 0; j < list_size; j++) {
263  const float * yj = list_vecs + d * j;
264  if (metric_type == METRIC_L2) {
265  float disij = fvec_L2sqr (xi, yj, d);
266  if (disij < radius) {
267  qres.add (disij, ids[j]);
268  }
269  } else if (metric_type == METRIC_INNER_PRODUCT) {
270  float disij = fvec_inner_product(xi, yj, d);
271  if (disij > radius) {
272  qres.add (disij, ids[j]);
273  }
274  }
275  }
276  }
277  }
278 
279  pres.finalize ();
280  }
281 }
282 
283 void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
284 {
285 
286  FAISS_THROW_IF_NOT (maintain_direct_map);
287  FAISS_THROW_IF_NOT (is_trained);
288  std::vector<idx_t> assign (n);
289  quantizer->assign (n, x, assign.data());
290 
291  for (size_t i = 0; i < n; i++) {
292  idx_t id = new_ids[i];
293  FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal,
294  "id to update out of range");
295  { // remove old one
296  long dm = direct_map[id];
297  long ofs = dm & 0xffffffff;
298  long il = dm >> 32;
299  size_t l = invlists->list_size (il);
300  if (ofs != l - 1) { // move l - 1 to ofs
301  long id2 = invlists->get_single_id (il, l - 1);
302  direct_map[id2] = (il << 32) | ofs;
303  invlists->update_entry (il, ofs, id2,
304  invlists->get_single_code (il, l - 1));
305  }
306  invlists->resize (il, l - 1);
307  }
308  { // insert new one
309  long il = assign[i];
310  size_t l = invlists->list_size (il);
311  long dm = (il << 32) | l;
312  direct_map[id] = dm;
313  invlists->add_entry (il, id, (const uint8_t*)(x + i * d));
314  }
315  }
316 
317 }
318 
319 void IndexIVFFlat::reconstruct_from_offset (long list_no, long offset,
320  float* recons) const
321 {
322  memcpy (recons, invlists->get_single_code (list_no, offset), code_size);
323 }
324 
325 
326 
327 } // namespace faiss
result structure for a single query
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils.cpp:574
virtual const idx_t * get_ids(size_t list_no) const =0
T * get_val(size_t key)
Return the list of values for a heap.
Definition: Heap.h:360
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:173
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:34
virtual size_t list_size(size_t list_no) const =0
get the size of a list
void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
size_t k
allocated size per heap
Definition: Heap.h:355
virtual idx_t get_single_id(size_t list_no, size_t offset) const
Definition: IndexIVF.cpp:118
int d
vector dimension
Definition: Index.h:64
virtual const uint8_t * get_single_code(size_t list_no, size_t offset) const
Definition: IndexIVF.cpp:129
void add_with_ids(idx_t n, const float *x, const long *xids) override
implemented for all IndexIVF* classes
virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t *code)
add one entry to an inverted list
Definition: IndexIVF.cpp:136
long idx_t
all indices are this type
Definition: Index.h:62
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:65
bool verbose
verbosity level
Definition: Index.h:66
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
QueryResult & new_result(idx_t qno)
begin a new result
void update_vectors(int nv, idx_t *idx, const float *v)
the entries in the buffers are split per query
TI * get_ids(size_t key)
Corresponding identifiers.
Definition: Heap.h:363
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:72
InvertedLists * invlists
Access to the actual data.
Definition: IndexIVF.h:168
virtual const uint8_t * get_codes(size_t list_no) const =0
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:33
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:69
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVF.h:174
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
Definition: IndexIVF.h:177
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs) const override
size_t nlist
number of possible key values
Definition: IndexIVF.h:34
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:171
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:43
virtual void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx)
same as add_with_ids, with precomputed coarse quantizer