Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_imi_flat.cpp
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved
11 
12 
13 #include <cmath>
14 #include <cstdio>
15 #include <cstdlib>
16 
17 #include <sys/time.h>
18 
19 
20 #include <faiss/IndexPQ.h>
21 #include <faiss/IndexIVF.h>
22 #include <faiss/IndexFlat.h>
23 #include <faiss/index_io.h>
24 
/**
 * Current wall-clock time as reported by gettimeofday(), in seconds.
 *
 * The whole-second and microsecond fields are combined into a single
 * double; callers subtract two readings to obtain a duration.
 */
double elapsed ()
{
    timeval now;
    gettimeofday (&now, nullptr);
    const double fractional_seconds = now.tv_usec * 1e-6;
    return now.tv_sec + fractional_seconds;
}
31 
32 
33 int main ()
34 {
35  double t0 = elapsed();
36 
37  // dimension of the vectors to index
38  int d = 128;
39 
40  // size of the database we plan to index
41  size_t nb = 1000 * 1000;
42 
43  // make a set of nt training vectors in the unit cube
44  // (could be the database)
45  size_t nt = 100 * 1000;
46 
47  //---------------------------------------------------------------
48  // Define the core quantizer
49  // We choose a multiple inverted index for faster training with less data
50  // and because it usually offers best accuracy/speed trade-offs
51  //
52  // We here assume that its lifespan of this coarse quantizer will cover the
53  // lifespan of the inverted-file quantizer IndexIVFFlat below
54  // With dynamic allocation, one may give the responsability to free the
55  // quantizer to the inverted-file index (with attribute do_delete_quantizer)
56  //
57  // Note: a regular clustering algorithm would be defined as:
58  // faiss::IndexFlatL2 coarse_quantizer (d);
59  //
60  // Use nhash=2 subquantizers used to define the product coarse quantizer
61  // Number of bits: we will have 2^nbits_coarse centroids per subquantizer
62  // meaning (2^12)^nhash distinct inverted lists
63  size_t nhash = 2;
64  size_t nbits_subq = int (log2 (nb+1) / 2); // good choice in general
65  size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
66 
67  faiss::MultiIndexQuantizer coarse_quantizer (d, nhash, nbits_subq);
68 
69  printf ("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
70  nhash, nbits_subq, ncentroids, nb);
71 
72  // the coarse quantizer should not be dealloced before the index
73  // 4 = nb of bytes per code (d must be a multiple of this)
74  // 8 = nb of bits per sub-code (almost always 8)
75  faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
76  faiss::IndexIVFFlat index (&coarse_quantizer, d, ncentroids, metric);
77  index.quantizer_trains_alone = true;
78 
79  // define the number of probes. 2048 is for high-dim, overkilled in practice
80  // Use 4-1024 depending on the trade-off speed accuracy that you want
81  index.nprobe = 2048;
82 
83 
84  { // training
85  printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
86  elapsed() - t0, nt, d);
87 
88  std::vector <float> trainvecs (nt * d);
89  for (size_t i = 0; i < nt * d; i++) {
90  trainvecs[i] = drand48();
91  }
92 
93  printf ("[%.3f s] Training the index\n", elapsed() - t0);
94  index.verbose = true;
95  index.train (nt, trainvecs.data());
96  }
97 
98  size_t nq;
99  std::vector<float> queries;
100 
101  { // populating the database
102  printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
103  elapsed() - t0, nb);
104 
105  std::vector <float> database (nb * d);
106  for (size_t i = 0; i < nb * d; i++) {
107  database[i] = drand48();
108  }
109 
110  printf ("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
111 
112  index.add (nb, database.data());
113 
114  // remember a few elements from the database as queries
115  int i0 = 1234;
116  int i1 = 1244;
117 
118  nq = i1 - i0;
119  queries.resize (nq * d);
120  for (int i = i0; i < i1; i++) {
121  for (int j = 0; j < d; j++) {
122  queries [(i - i0) * d + j] = database [i * d + j];
123  }
124  }
125  }
126 
127  { // searching the database
128  int k = 5;
129  printf ("[%.3f s] Searching the %d nearest neighbors "
130  "of %ld vectors in the index\n",
131  elapsed() - t0, k, nq);
132 
133  std::vector<faiss::Index::idx_t> nns (k * nq);
134  std::vector<float> dis (k * nq);
135 
136  index.search (nq, queries.data(), k, dis.data(), nns.data());
137 
138  printf ("[%.3f s] Query results (vector ids, then distances):\n",
139  elapsed() - t0);
140 
141  for (int i = 0; i < nq; i++) {
142  printf ("query %2d: ", i);
143  for (int j = 0; j < k; j++) {
144  printf ("%7ld ", nns[j + i * k]);
145  }
146  printf ("\n dis: ");
147  for (int j = 0; j < k; j++) {
148  printf ("%7g ", dis[j + i * k]);
149  }
150  printf ("\n");
151  }
152  }
153  return 0;
154 }
MetricType
Some algorithms support both an inner product version and an L2 search version.
Definition: Index.h:44