Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_imi_pq.cpp
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved
11 
12 
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <sys/time.h>

#include <faiss/IndexPQ.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexFlat.h>
#include <faiss/index_io.h>
24 
/// Returns a timestamp in seconds, read from a monotonic clock.
///
/// The absolute value has no meaning on its own; subtract two results to
/// measure an interval. A steady clock is used so that the measured interval
/// is not affected by adjustments of the system (wall-clock) time.
double elapsed ()
{
    using steady = std::chrono::steady_clock;
    const std::chrono::duration<double> since_epoch =
        steady::now().time_since_epoch();
    return since_epoch.count();
}
31 
32 
33 int main ()
34 {
35  double t0 = elapsed();
36 
37  // dimension of the vectors to index
38  int d = 64;
39 
40  // size of the database we plan to index
41  size_t nb = 1000 * 1000;
42  size_t add_bs = 10000; // # size of the blocks to add
43 
44  // make a set of nt training vectors in the unit cube
45  // (could be the database)
46  size_t nt = 100 * 1000;
47 
48  //---------------------------------------------------------------
49  // Define the core quantizer
50  // We choose a multiple inverted index for faster training with less data
51  // and because it usually offers best accuracy/speed trade-offs
52  //
53  // We here assume that its lifespan of this coarse quantizer will cover the
54  // lifespan of the inverted-file quantizer IndexIVFFlat below
55  // With dynamic allocation, one may give the responsability to free the
56  // quantizer to the inverted-file index (with attribute do_delete_quantizer)
57  //
58  // Note: a regular clustering algorithm would be defined as:
59  // faiss::IndexFlatL2 coarse_quantizer (d);
60  //
61  // Use nhash=2 subquantizers used to define the product coarse quantizer
62  // Number of bits: we will have 2^nbits_coarse centroids per subquantizer
63  // meaning (2^12)^nhash distinct inverted lists
64  //
65  // The parameter bytes_per_code is determined by the memory
66  // constraint, the dataset will use nb * (bytes_per_code + 8)
67  // bytes.
68  //
69  // The parameter nbits_subq is determined by the size of the dataset to index.
70  //
71  size_t nhash = 2;
72  size_t nbits_subq = 9;
73  size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
74  int bytes_per_code = 16;
75 
76  faiss::MultiIndexQuantizer coarse_quantizer (d, nhash, nbits_subq);
77 
78  printf ("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
79  nhash, nbits_subq, ncentroids, nb);
80 
81  // the coarse quantizer should not be dealloced before the index
82  // 4 = nb of bytes per code (d must be a multiple of this)
83  // 8 = nb of bits per sub-code (almost always 8)
84  faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
85  faiss::IndexIVFPQ index (&coarse_quantizer, d, ncentroids, bytes_per_code, 8);
86  index.quantizer_trains_alone = true;
87 
88  // define the number of probes. 2048 is for high-dim, overkill in practice
89  // Use 4-1024 depending on the trade-off speed accuracy that you want
90  index.nprobe = 2048;
91 
92 
93  { // training.
94 
95  // The distribution of the training vectors should be the same
96  // as the database vectors. It could be a sub-sample of the
97  // database vectors, if sampling is not biased. Here we just
98  // randomly generate the vectors.
99 
100  printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
101  elapsed() - t0, nt, d);
102 
103  std::vector <float> trainvecs (nt * d);
104  for (size_t i = 0; i < nt; i++) {
105  for (size_t j = 0; j < d; j++) {
106  trainvecs[i * d + j] = drand48();
107  }
108  }
109 
110  printf ("[%.3f s] Training the index\n", elapsed() - t0);
111  index.verbose = true;
112  index.train (nt, trainvecs.data());
113  }
114 
115  // the index can be re-loaded later with
116  // faiss::Index * idx = faiss::read_index("/tmp/trained_index.faissindex");
117  faiss::write_index(&index, "/tmp/trained_index.faissindex");
118 
119  size_t nq;
120  std::vector<float> queries;
121 
122  { // populating the database
123  printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
124  elapsed() - t0, nb);
125 
126  std::vector <float> database (nb * d);
127  std::vector <long> ids (nb);
128  for (size_t i = 0; i < nb; i++) {
129  for (size_t j = 0; j < d; j++) {
130  database[i * d + j] = drand48();
131  }
132  ids[i] = 8760000000L + i;
133  }
134 
135  printf ("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
136 
137  for (size_t begin = 0; begin < nb; begin += add_bs) {
138  size_t end = std::min (begin + add_bs, nb);
139  index.add_with_ids (end - begin,
140  database.data() + d * begin,
141  ids.data() + begin);
142  }
143 
144  // remember a few elements from the database as queries
145  int i0 = 1234;
146  int i1 = 1244;
147 
148  nq = i1 - i0;
149  queries.resize (nq * d);
150  for (int i = i0; i < i1; i++) {
151  for (int j = 0; j < d; j++) {
152  queries [(i - i0) * d + j] = database [i * d + j];
153  }
154  }
155  }
156 
157  // A few notes on the internal format of the index:
158  //
159  // - the positing lists for PQ codes are index.codes, which is a
160  // std::vector < std::vector<uint8_t> >
161  // if n is the length of posting list #i, codes[i] has length bytes_per_code * n
162  //
163  // - the corresponding ids are stored in index.ids
164  //
165  // - given a vector float *x, finding which k centroids are
166  // closest to it (ie to find the nearest neighbors) can be done with
167  //
168  // long *centroid_ids = new long[k];
169  // float *distances = new float[k];
170  // index.quantizer->search (1, x, k, dis, centroids_ids);
171  //
172 
173  faiss::write_index(&index, "/tmp/populated_index.faissindex");
174 
175  { // searching the database
176  int k = 5;
177  printf ("[%.3f s] Searching the %d nearest neighbors "
178  "of %ld vectors in the index\n",
179  elapsed() - t0, k, nq);
180 
181  std::vector<faiss::Index::idx_t> nns (k * nq);
182  std::vector<float> dis (k * nq);
183 
184  index.search (nq, queries.data(), k, dis.data(), nns.data());
185 
186  printf ("[%.3f s] Query results (vector ids, then distances):\n",
187  elapsed() - t0);
188 
189  for (int i = 0; i < nq; i++) {
190  printf ("query %2d: ", i);
191  for (int j = 0; j < k; j++) {
192  printf ("%7ld ", nns[j + i * k]);
193  }
194  printf ("\n dis: ");
195  for (int j = 0; j < k; j++) {
196  printf ("%7g ", dis[j + i * k]);
197  }
198  printf ("\n");
199  }
200  }
201  return 0;
202 }
MetricType
Some algorithms support both an inner product version and an L2 search version.
Definition: Index.h:44