Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_imi_pq.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 
10 
11 #include <cmath>
12 #include <cstdio>
13 #include <cstdlib>
14 
15 #include <sys/time.h>
16 
17 
18 #include <faiss/IndexPQ.h>
19 #include <faiss/IndexIVFPQ.h>
20 #include <faiss/IndexFlat.h>
21 #include <faiss/index_io.h>
22 
/**
 * Current wall-clock time, as reported by gettimeofday().
 *
 * @return seconds (tv_sec) plus the microsecond part (tv_usec) scaled to
 *         seconds, as a double. Only differences between two calls are
 *         used by this demo, to print elapsed time.
 */
double elapsed ()
{
    struct timeval tv;
    gettimeofday (&tv, nullptr);
    // make the integer -> double conversions explicit
    return static_cast<double> (tv.tv_sec) +
           static_cast<double> (tv.tv_usec) * 1e-6;
}
29 
30 
31 int main ()
32 {
33  double t0 = elapsed();
34 
35  // dimension of the vectors to index
36  int d = 64;
37 
38  // size of the database we plan to index
39  size_t nb = 1000 * 1000;
40  size_t add_bs = 10000; // # size of the blocks to add
41 
42  // make a set of nt training vectors in the unit cube
43  // (could be the database)
44  size_t nt = 100 * 1000;
45 
46  //---------------------------------------------------------------
47  // Define the core quantizer
48  // We choose a multiple inverted index for faster training with less data
49  // and because it usually offers best accuracy/speed trade-offs
50  //
51  // We here assume that its lifespan of this coarse quantizer will cover the
52  // lifespan of the inverted-file quantizer IndexIVFFlat below
53  // With dynamic allocation, one may give the responsability to free the
54  // quantizer to the inverted-file index (with attribute do_delete_quantizer)
55  //
56  // Note: a regular clustering algorithm would be defined as:
57  // faiss::IndexFlatL2 coarse_quantizer (d);
58  //
59  // Use nhash=2 subquantizers used to define the product coarse quantizer
60  // Number of bits: we will have 2^nbits_coarse centroids per subquantizer
61  // meaning (2^12)^nhash distinct inverted lists
62  //
63  // The parameter bytes_per_code is determined by the memory
64  // constraint, the dataset will use nb * (bytes_per_code + 8)
65  // bytes.
66  //
67  // The parameter nbits_subq is determined by the size of the dataset to index.
68  //
69  size_t nhash = 2;
70  size_t nbits_subq = 9;
71  size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
72  int bytes_per_code = 16;
73 
74  faiss::MultiIndexQuantizer coarse_quantizer (d, nhash, nbits_subq);
75 
76  printf ("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
77  nhash, nbits_subq, ncentroids, nb);
78 
79  // the coarse quantizer should not be dealloced before the index
80  // 4 = nb of bytes per code (d must be a multiple of this)
81  // 8 = nb of bits per sub-code (almost always 8)
82  faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
83  faiss::IndexIVFPQ index (&coarse_quantizer, d, ncentroids, bytes_per_code, 8);
84  index.quantizer_trains_alone = true;
85 
86  // define the number of probes. 2048 is for high-dim, overkill in practice
87  // Use 4-1024 depending on the trade-off speed accuracy that you want
88  index.nprobe = 2048;
89 
90 
91  { // training.
92 
93  // The distribution of the training vectors should be the same
94  // as the database vectors. It could be a sub-sample of the
95  // database vectors, if sampling is not biased. Here we just
96  // randomly generate the vectors.
97 
98  printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
99  elapsed() - t0, nt, d);
100 
101  std::vector <float> trainvecs (nt * d);
102  for (size_t i = 0; i < nt; i++) {
103  for (size_t j = 0; j < d; j++) {
104  trainvecs[i * d + j] = drand48();
105  }
106  }
107 
108  printf ("[%.3f s] Training the index\n", elapsed() - t0);
109  index.verbose = true;
110  index.train (nt, trainvecs.data());
111  }
112 
113  // the index can be re-loaded later with
114  // faiss::Index * idx = faiss::read_index("/tmp/trained_index.faissindex");
115  faiss::write_index(&index, "/tmp/trained_index.faissindex");
116 
117  size_t nq;
118  std::vector<float> queries;
119 
120  { // populating the database
121  printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
122  elapsed() - t0, nb);
123 
124  std::vector <float> database (nb * d);
125  std::vector <long> ids (nb);
126  for (size_t i = 0; i < nb; i++) {
127  for (size_t j = 0; j < d; j++) {
128  database[i * d + j] = drand48();
129  }
130  ids[i] = 8760000000L + i;
131  }
132 
133  printf ("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
134 
135  for (size_t begin = 0; begin < nb; begin += add_bs) {
136  size_t end = std::min (begin + add_bs, nb);
137  index.add_with_ids (end - begin,
138  database.data() + d * begin,
139  ids.data() + begin);
140  }
141 
142  // remember a few elements from the database as queries
143  int i0 = 1234;
144  int i1 = 1244;
145 
146  nq = i1 - i0;
147  queries.resize (nq * d);
148  for (int i = i0; i < i1; i++) {
149  for (int j = 0; j < d; j++) {
150  queries [(i - i0) * d + j] = database [i * d + j];
151  }
152  }
153  }
154 
155  // A few notes on the internal format of the index:
156  //
157  // - the positing lists for PQ codes are index.codes, which is a
158  // std::vector < std::vector<uint8_t> >
159  // if n is the length of posting list #i, codes[i] has length bytes_per_code * n
160  //
161  // - the corresponding ids are stored in index.ids
162  //
163  // - given a vector float *x, finding which k centroids are
164  // closest to it (ie to find the nearest neighbors) can be done with
165  //
166  // long *centroid_ids = new long[k];
167  // float *distances = new float[k];
168  // index.quantizer->search (1, x, k, dis, centroids_ids);
169  //
170 
171  faiss::write_index(&index, "/tmp/populated_index.faissindex");
172 
173  { // searching the database
174  int k = 5;
175  printf ("[%.3f s] Searching the %d nearest neighbors "
176  "of %ld vectors in the index\n",
177  elapsed() - t0, k, nq);
178 
179  std::vector<faiss::Index::idx_t> nns (k * nq);
180  std::vector<float> dis (k * nq);
181 
182  index.search (nq, queries.data(), k, dis.data(), nns.data());
183 
184  printf ("[%.3f s] Query results (vector ids, then distances):\n",
185  elapsed() - t0);
186 
187  for (int i = 0; i < nq; i++) {
188  printf ("query %2d: ", i);
189  for (int j = 0; j < k; j++) {
190  printf ("%7ld ", nns[j + i * k]);
191  }
192  printf ("\n dis: ");
193  for (int j = 0; j < k; j++) {
194  printf ("%7g ", dis[j + i * k]);
195  }
196  printf ("\n");
197  }
198  }
199  return 0;
200 }
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:45