Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
test_lowlevel_ivf.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 #include <cstdio>
10 #include <cstdlib>
11 
12 #include <memory>
13 #include <vector>
14 #include <thread>
15 
16 #include <gtest/gtest.h>
17 
18 #include <faiss/IndexIVF.h>
19 #include <faiss/IndexBinaryIVF.h>
20 #include <faiss/AutoTune.h>
21 #include <faiss/index_io.h>
22 #include <faiss/IVFlib.h>
23 #include <faiss/VectorTransform.h>
24 
25 using namespace faiss;
26 
27 namespace {
28 
29 typedef Index::idx_t idx_t;
30 
31 
32 // dimension of the vectors to index
33 int d = 32;
34 
35 // nb of training vectors
36 size_t nt = 5000;
37 
38 // size of the database points per window step
39 size_t nb = 1000;
40 
41 // nb of queries
42 size_t nq = 200;
43 
44 int k = 10;
45 
46 
47 std::vector<float> make_data(size_t n)
48 {
49  std::vector <float> database (n * d);
50  for (size_t i = 0; i < n * d; i++) {
51  database[i] = drand48();
52  }
53  return database;
54 }
55 
56 std::unique_ptr<Index> make_trained_index(const char *index_type,
57  MetricType metric_type)
58 {
59  auto index = std::unique_ptr<Index>(index_factory(
60  d, index_type, metric_type));
61  auto xt = make_data(nt);
62  index->train(nt, xt.data());
63  ParameterSpace().set_index_parameter (index.get(), "nprobe", 4);
64  return index;
65 }
66 
67 std::vector<idx_t> search_index(Index *index, const float *xq) {
68  std::vector<idx_t> I(k * nq);
69  std::vector<float> D(k * nq);
70  index->search (nq, xq, k, D.data(), I.data());
71  return I;
72 }
73 
74 
75 
76 
77 /*************************************************************
78  * Test functions for a given index type
79  *************************************************************/
80 
81 
82 
83 void test_lowlevel_access (const char *index_key, MetricType metric) {
84  std::unique_ptr<Index> index = make_trained_index(index_key, metric);
85 
86  auto xb = make_data (nb);
87  index->add(nb, xb.data());
88 
89  /** handle the case if we have a preprocessor */
90 
91  const IndexPreTransform *index_pt =
92  dynamic_cast<const IndexPreTransform*> (index.get());
93 
94  int dt = index->d;
95  const float * xbt = xb.data();
96  std::unique_ptr<float []> del_xbt;
97 
98  if (index_pt) {
99  dt = index_pt->index->d;
100  xbt = index_pt->apply_chain (nb, xb.data());
101  if (xbt != xb.data()) {
102  del_xbt.reset((float*)xbt);
103  }
104  }
105 
106  IndexIVF * index_ivf = ivflib::extract_index_ivf (index.get());
107 
108  /** Test independent encoding
109  *
110  * Makes it possible to do additions on a custom inverted list
111  * implementation. From a set of vectors, computes the inverted
112  * list ids + the codes corresponding to each vector.
113  */
114 
115  std::vector<idx_t> list_nos (nb);
116  std::vector<uint8_t> codes (index_ivf->code_size * nb);
117  index_ivf->quantizer->assign(nb, xbt, list_nos.data());
118  index_ivf->encode_vectors (nb, xbt, list_nos.data(), codes.data());
119 
120  // compare with normal IVF addition
121 
122  const InvertedLists *il = index_ivf->invlists;
123 
124  for (int list_no = 0; list_no < index_ivf->nlist; list_no++) {
125  InvertedLists::ScopedCodes ivf_codes (il, list_no);
126  InvertedLists::ScopedIds ivf_ids (il, list_no);
127  size_t list_size = il->list_size (list_no);
128  for (int i = 0; i < list_size; i++) {
129  const uint8_t *ref_code = ivf_codes.get() + i * il->code_size;
130  const uint8_t *new_code =
131  codes.data() + ivf_ids[i] * il->code_size;
132  EXPECT_EQ (memcmp(ref_code, new_code, il->code_size), 0);
133  }
134  }
135 
136  /** Test independent search
137  *
138  * Manually scans through inverted lists, computing distances and
139  * ordering results organized in a heap.
140  */
141 
142  // sample some example queries and get reference search results.
143  auto xq = make_data (nq);
144  auto ref_I = search_index (index.get(), xq.data());
145 
146  // handle preprocessing
147  const float * xqt = xq.data();
148  std::unique_ptr<float []> del_xqt;
149 
150  if (index_pt) {
151  xqt = index_pt->apply_chain (nq, xq.data());
152  if (xqt != xq.data()) {
153  del_xqt.reset((float*)xqt);
154  }
155  }
156 
157  // quantize the queries to get the inverted list ids to visit.
158  int nprobe = index_ivf->nprobe;
159 
160  std::vector<idx_t> q_lists (nq * nprobe);
161  std::vector<float> q_dis (nq * nprobe);
162 
163  index_ivf->quantizer->search (nq, xqt, nprobe,
164  q_dis.data(), q_lists.data());
165 
166  // object that does the scanning and distance computations.
167  std::unique_ptr<InvertedListScanner> scanner (
168  index_ivf->get_InvertedListScanner());
169 
170  for (int i = 0; i < nq; i++) {
171  std::vector<idx_t> I (k, -1);
172  float default_dis = metric == METRIC_L2 ? HUGE_VAL : -HUGE_VAL;
173  std::vector<float> D (k, default_dis);
174 
175  scanner->set_query (xqt + i * dt);
176 
177  for (int j = 0; j < nprobe; j++) {
178  int list_no = q_lists[i * nprobe + j];
179  if (list_no < 0) continue;
180  scanner->set_list (list_no, q_dis[i * nprobe + j]);
181 
182  // here we get the inverted lists from the InvertedLists
183  // object but they could come from anywhere
184 
185  scanner->scan_codes (
186  il->list_size (list_no),
187  InvertedLists::ScopedCodes(il, list_no).get(),
188  InvertedLists::ScopedIds(il, list_no).get(),
189  D.data(), I.data(), k);
190 
191  if (j == 0) {
192  // all results so far come from list_no, so let's check if
193  // the distance function works
194  for (int jj = 0; jj < k; jj++) {
195  int vno = I[jj];
196  if (vno < 0) break; // heap is not full yet
197 
198  // we have the codes from the addition test
199  float computed_D = scanner->distance_to_code (
200  codes.data() + vno * il->code_size);
201 
202  EXPECT_EQ (computed_D, D[jj]);
203  }
204  }
205  }
206 
207  // re-order heap
208  if (metric == METRIC_L2) {
209  maxheap_reorder (k, D.data(), I.data());
210  } else {
211  minheap_reorder (k, D.data(), I.data());
212  }
213 
214  // check that we have the same results as the reference search
215  for (int j = 0; j < k; j++) {
216  EXPECT_EQ (I[j], ref_I[i * k + j]);
217  }
218  }
219 
220 
221 }
222 
223 } // anonymous namespace
224 
225 
226 
227 /*************************************************************
228  * Test entry points
229  *************************************************************/
230 
231 TEST(TestLowLevelIVF, IVFFlatL2) {
232  test_lowlevel_access ("IVF32,Flat", METRIC_L2);
233 }
234 
235 TEST(TestLowLevelIVF, PCAIVFFlatL2) {
236  test_lowlevel_access ("PCAR16,IVF32,Flat", METRIC_L2);
237 }
238 
239 TEST(TestLowLevelIVF, IVFFlatIP) {
240  test_lowlevel_access ("IVF32,Flat", METRIC_INNER_PRODUCT);
241 }
242 
243 TEST(TestLowLevelIVF, IVFSQL2) {
244  test_lowlevel_access ("IVF32,SQ8", METRIC_L2);
245 }
246 
247 TEST(TestLowLevelIVF, IVFSQIP) {
248  test_lowlevel_access ("IVF32,SQ8", METRIC_INNER_PRODUCT);
249 }
250 
251 
252 TEST(TestLowLevelIVF, IVFPQL2) {
253  test_lowlevel_access ("IVF32,PQ4np", METRIC_L2);
254 }
255 
256 TEST(TestLowLevelIVF, IVFPQIP) {
257  test_lowlevel_access ("IVF32,PQ4np", METRIC_INNER_PRODUCT);
258 }
259 
260 
261 /*************************************************************
262  * Same for binary (a bit simpler)
263  *************************************************************/
264 
265 namespace {
266 
// Number of bits per binary vector; each vector is stored on nbit / 8 bytes.
int nbit = 256;

/// Build n random binary vectors of nbit bits each, packed contiguously
/// (n * nbit / 8 bytes in total). Each byte is the low 8 bits of an
/// lrand48() draw.
std::vector<uint8_t> make_data_binary(size_t n)
{
    std::vector<uint8_t> database (n * nbit / 8);
    // Fill exactly the bytes that were allocated. The bound used to be
    // n * d, which only matched the buffer size because the float
    // dimension d happens to equal nbit / 8.
    for (size_t i = 0; i < database.size(); i++) {
        database[i] = lrand48();
    }
    return database;
}
280 
281 std::unique_ptr<IndexBinary> make_trained_index_binary(const char *index_type)
282 {
283  auto index = std::unique_ptr<IndexBinary>(index_binary_factory(
284  nbit, index_type));
285  auto xt = make_data_binary (nt);
286  index->train(nt, xt.data());
287  return index;
288 }
289 
290 
291 void test_lowlevel_access_binary (const char *index_key) {
292  std::unique_ptr<IndexBinary> index =
293  make_trained_index_binary (index_key);
294 
295  IndexBinaryIVF * index_ivf = dynamic_cast<IndexBinaryIVF*>
296  (index.get());
297  assert (index_ivf);
298 
299  index_ivf->nprobe = 4;
300 
301  auto xb = make_data_binary (nb);
302  index->add(nb, xb.data());
303 
304  std::vector<idx_t> list_nos (nb);
305  index_ivf->quantizer->assign(nb, xb.data(), list_nos.data());
306 
307  /* For binary there is no test for encoding because binary vectors
308  * are copied verbatim to the inverted lists */
309 
310  const InvertedLists *il = index_ivf->invlists;
311 
312  /** Test independent search
313  *
314  * Manually scans through inverted lists, computing distances and
315  * ordering results organized in a heap.
316  */
317 
318  // sample some example queries and get reference search results.
319  auto xq = make_data_binary (nq);
320 
321  std::vector<idx_t> I_ref(k * nq);
322  std::vector<int32_t> D_ref(k * nq);
323  index->search (nq, xq.data(), k, D_ref.data(), I_ref.data());
324 
325  // quantize the queries to get the inverted list ids to visit.
326  int nprobe = index_ivf->nprobe;
327 
328  std::vector<idx_t> q_lists (nq * nprobe);
329  std::vector<int32_t> q_dis (nq * nprobe);
330 
331  // quantize queries
332  index_ivf->quantizer->search (nq, xq.data(), nprobe,
333  q_dis.data(), q_lists.data());
334 
335  // object that does the scanning and distance computations.
336  std::unique_ptr<BinaryInvertedListScanner> scanner (
337  index_ivf->get_InvertedListScanner());
338 
339  for (int i = 0; i < nq; i++) {
340  std::vector<idx_t> I (k, -1);
341  uint32_t default_dis = 1 << 30;
342  std::vector<int32_t> D (k, default_dis);
343 
344  scanner->set_query (xq.data() + i * index_ivf->code_size);
345 
346  for (int j = 0; j < nprobe; j++) {
347  int list_no = q_lists[i * nprobe + j];
348  if (list_no < 0) continue;
349  scanner->set_list (list_no, q_dis[i * nprobe + j]);
350 
351  // here we get the inverted lists from the InvertedLists
352  // object but they could come from anywhere
353 
354  scanner->scan_codes (
355  il->list_size (list_no),
356  InvertedLists::ScopedCodes(il, list_no).get(),
357  InvertedLists::ScopedIds(il, list_no).get(),
358  D.data(), I.data(), k);
359 
360  if (j == 0) {
361  // all results so far come from list_no, so let's check if
362  // the distance function works
363  for (int jj = 0; jj < k; jj++) {
364  int vno = I[jj];
365  if (vno < 0) break; // heap is not full yet
366 
367  // we have the codes from the addition test
368  float computed_D = scanner->distance_to_code (
369  xb.data() + vno * il->code_size);
370 
371  EXPECT_EQ (computed_D, D[jj]);
372  }
373  }
374  }
375 
376  printf("new before reroder: [");
377  for (int j = 0; j < k; j++)
378  printf("%ld,%d ", I[j], D[j]);
379  printf("]\n");
380 
381  // re-order heap
382  heap_reorder<CMax<int32_t, int64_t> > (k, D.data(), I.data());
383 
384  printf("ref: [");
385  for (int j = 0; j < k; j++)
386  printf("%ld,%d ", I_ref[j], D_ref[j]);
387  printf("]\nnew: [");
388  for (int j = 0; j < k; j++)
389  printf("%ld,%d ", I[j], D[j]);
390  printf("]\n");
391 
392  // check that we have the same results as the reference search
393  for (int j = 0; j < k; j++) {
394  // here the order is not guaranteed to be the same
395  // so we scan through ref results
396  // EXPECT_EQ (I[j], I_ref[i * k + j]);
397  EXPECT_LE (D[j], D_ref[i * k + k - 1]);
398  if (D[j] < D_ref[i * k + k - 1]) {
399  int j2 = 0;
400  while (j2 < k) {
401  if (I[j] == I_ref[i * k + j2]) break;
402  j2++;
403  }
404  EXPECT_LT(j2, k); // it was found
405  if (j2 < k) {
406  EXPECT_EQ(D[j], D_ref[i * k + j2]);
407  }
408  }
409 
410  }
411 
412  }
413 
414 
415 }
416 
417 } // anonymous namespace
418 
419 
420 TEST(TestLowLevelIVF, IVFBinary) {
421  test_lowlevel_access_binary ("BIVF32");
422 }
423 
424 
425 namespace {
426 
427 void test_threaded_search (const char *index_key, MetricType metric) {
428  std::unique_ptr<Index> index = make_trained_index(index_key, metric);
429 
430  auto xb = make_data (nb);
431  index->add(nb, xb.data());
432 
433  /** handle the case if we have a preprocessor */
434 
435  const IndexPreTransform *index_pt =
436  dynamic_cast<const IndexPreTransform*> (index.get());
437 
438  int dt = index->d;
439  const float * xbt = xb.data();
440  std::unique_ptr<float []> del_xbt;
441 
442  if (index_pt) {
443  dt = index_pt->index->d;
444  xbt = index_pt->apply_chain (nb, xb.data());
445  if (xbt != xb.data()) {
446  del_xbt.reset((float*)xbt);
447  }
448  }
449 
450  IndexIVF * index_ivf = ivflib::extract_index_ivf (index.get());
451 
452  /** Test independent search
453  *
454  * Manually scans through inverted lists, computing distances and
455  * ordering results organized in a heap.
456  */
457 
458  // sample some example queries and get reference search results.
459  auto xq = make_data (nq);
460  auto ref_I = search_index (index.get(), xq.data());
461 
462  // handle preprocessing
463  const float * xqt = xq.data();
464  std::unique_ptr<float []> del_xqt;
465 
466  if (index_pt) {
467  xqt = index_pt->apply_chain (nq, xq.data());
468  if (xqt != xq.data()) {
469  del_xqt.reset((float*)xqt);
470  }
471  }
472 
473  // quantize the queries to get the inverted list ids to visit.
474  int nprobe = index_ivf->nprobe;
475 
476  std::vector<idx_t> q_lists (nq * nprobe);
477  std::vector<float> q_dis (nq * nprobe);
478 
479  index_ivf->quantizer->search (nq, xqt, nprobe,
480  q_dis.data(), q_lists.data());
481 
482  // now run search in this many threads
483  int nproc = 3;
484 
485 
486  for (int i = 0; i < nq; i++) {
487 
488  // one result table per thread
489  std::vector<idx_t> I (k * nproc, -1);
490  float default_dis = metric == METRIC_L2 ? HUGE_VAL : -HUGE_VAL;
491  std::vector<float> D (k * nproc, default_dis);
492 
493  auto search_function = [index_ivf, &I, &D, dt, i, nproc,
494  xqt, nprobe, &q_dis, &q_lists]
495  (int rank) {
496  const InvertedLists *il = index_ivf->invlists;
497 
498  // object that does the scanning and distance computations.
499  std::unique_ptr<InvertedListScanner> scanner (
500  index_ivf->get_InvertedListScanner());
501 
502  idx_t *local_I = I.data() + rank * k;
503  float *local_D = D.data() + rank * k;
504 
505  scanner->set_query (xqt + i * dt);
506 
507  for (int j = rank; j < nprobe; j += nproc) {
508  int list_no = q_lists[i * nprobe + j];
509  if (list_no < 0) continue;
510  scanner->set_list (list_no, q_dis[i * nprobe + j]);
511 
512  scanner->scan_codes (
513  il->list_size (list_no),
514  InvertedLists::ScopedCodes(il, list_no).get(),
515  InvertedLists::ScopedIds(il, list_no).get(),
516  local_D, local_I, k);
517  }
518  };
519 
520  // start the threads. Threads are numbered rank=0..nproc-1 (a la MPI)
521  // thread rank takes care of inverted lists
522  // rank, rank+nproc, rank+2*nproc,...
523  std::vector<std::thread> threads;
524  for (int rank = 0; rank < nproc; rank++) {
525  threads.emplace_back(search_function, rank);
526  }
527 
528  // join threads, merge heaps
529  for (int rank = 0; rank < nproc; rank++) {
530  threads[rank].join();
531  if (rank == 0) continue; // nothing to merge
532  // merge into first result
533  if (metric == METRIC_L2) {
534  maxheap_addn (k, D.data(), I.data(),
535  D.data() + rank * k,
536  I.data() + rank * k, k);
537  } else {
538  minheap_addn (k, D.data(), I.data(),
539  D.data() + rank * k,
540  I.data() + rank * k, k);
541  }
542  }
543 
544  // re-order heap
545  if (metric == METRIC_L2) {
546  maxheap_reorder (k, D.data(), I.data());
547  } else {
548  minheap_reorder (k, D.data(), I.data());
549  }
550 
551  // check that we have the same results as the reference search
552  for (int j = 0; j < k; j++) {
553  EXPECT_EQ (I[j], ref_I[i * k + j]);
554  }
555  }
556 
557 
558 }
559 
560 } // anonymous namespace
561 
562 
563 TEST(TestLowLevelIVF, ThreadedSearch) {
564  test_threaded_search ("IVF32,Flat", METRIC_L2);
565 }
virtual void encode_vectors(idx_t n, const float *x, const idx_t *list_nos, uint8_t *codes) const =0
void train(idx_t n, const float *x) override
virtual void search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels) const =0
size_t nprobe
number of probes at query time
Index * index
! chain of transforms
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:98
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:35
virtual size_t list_size(size_t list_no) const =0
get the size of a list
IndexBinary * quantizer
quantizer that maps vectors to inverted lists
int code_size
number of bytes per vector ( = d / 8 )
Definition: IndexBinary.h:41
const float * apply_chain(idx_t n, const float *x) const
int d
vector dimension
Definition: Index.h:66
size_t code_size
code size per vector in bytes
Definition: InvertedLists.h:36
void assign(idx_t n, const uint8_t *x, idx_t *labels, idx_t k=1)
Definition: IndexBinary.cpp:29
long idx_t
all indices are this type
Definition: Index.h:64
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
virtual void set_index_parameter(Index *index, const std::string &name, double val) const
set one of the parameters
Definition: AutoTune.cpp:452
InvertedLists * invlists
Access to the actual data.
Definition: IndexIVF.h:93
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:33
virtual InvertedListScanner * get_InvertedListScanner(bool store_pairs=false) const
get a scanner for this index (store_pairs means ignore labels)
Definition: IndexIVF.h:167
Index * index_factory(int d, const char *description_in, MetricType metric)
Definition: AutoTune.cpp:722
size_t nlist
number of possible key values
Definition: IndexIVF.h:34
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:96
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:45
InvertedLists * invlists
Access to the actual data.