Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_ivfpq_indexing.cpp
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */
8 
9 
10 
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <sys/time.h>

#include "../IndexIVFPQ.h"
#include "../IndexFlat.h"
#include "../index_io.h"
21 
/**
 * Current wall-clock time in seconds, as reported by gettimeofday().
 *
 * Callers in this file only use differences between two calls to
 * timestamp their progress messages.
 *
 * @return tv_sec plus tv_usec scaled to seconds, as a double.
 */
double elapsed ()
{
    struct timeval tv;
    gettimeofday (&tv, nullptr);
    return tv.tv_sec + tv.tv_usec * 1e-6;
}
28 
29 
30 int main ()
31 {
32 
33  double t0 = elapsed();
34 
35  // dimension of the vectors to index
36  int d = 128;
37 
38  // size of the database we plan to index
39  size_t nb = 200 * 1000;
40 
41  // make a set of nt training vectors in the unit cube
42  // (could be the database)
43  size_t nt = 100 * 1000;
44 
45  // make the index object and train it
46  faiss::IndexFlatL2 coarse_quantizer (d);
47 
48  // a reasonable number of centroids to index nb vectors
49  int ncentroids = int (4 * sqrt (nb));
50 
51  // the coarse quantizer should not be dealloced before the index
52  // 4 = nb of bytes per code (d must be a multiple of this)
53  // 8 = nb of bits per sub-code (almost always 8)
54  faiss::IndexIVFPQ index (&coarse_quantizer, d,
55  ncentroids, 4, 8);
56 
57 
58  { // training
59  printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
60  elapsed() - t0, nt, d);
61 
62  std::vector <float> trainvecs (nt * d);
63  for (size_t i = 0; i < nt * d; i++) {
64  trainvecs[i] = drand48();
65  }
66 
67  printf ("[%.3f s] Training the index\n",
68  elapsed() - t0);
69  index.verbose = true;
70 
71  index.train (nt, trainvecs.data());
72  }
73 
74  { // I/O demo
75  const char *outfilename = "/tmp/index_trained.faissindex";
76  printf ("[%.3f s] storing the pre-trained index to %s\n",
77  elapsed() - t0, outfilename);
78 
79  write_index (&index, outfilename);
80  }
81 
82  size_t nq;
83  std::vector<float> queries;
84 
85  { // populating the database
86  printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
87  elapsed() - t0, nb);
88 
89  std::vector <float> database (nb * d);
90  for (size_t i = 0; i < nb * d; i++) {
91  database[i] = drand48();
92  }
93 
94  printf ("[%.3f s] Adding the vectors to the index\n",
95  elapsed() - t0);
96 
97  index.add (nb, database.data());
98 
99  printf ("[%.3f s] imbalance factor: %g\n",
100  elapsed() - t0, index.imbalance_factor ());
101 
102  // remember a few elements from the database as queries
103  int i0 = 1234;
104  int i1 = 1243;
105 
106  nq = i1 - i0;
107  queries.resize (nq * d);
108  for (int i = i0; i < i1; i++) {
109  for (int j = 0; j < d; j++) {
110  queries [(i - i0) * d + j] = database [i * d + j];
111  }
112  }
113 
114  }
115 
116  { // searching the database
117  int k = 5;
118  printf ("[%.3f s] Searching the %d nearest neighbors "
119  "of %ld vectors in the index\n",
120  elapsed() - t0, k, nq);
121 
122  std::vector<faiss::Index::idx_t> nns (k * nq);
123  std::vector<float> dis (k * nq);
124 
125  index.search (nq, queries.data(), k, dis.data(), nns.data());
126 
127  printf ("[%.3f s] Query results (vector ids, then distances):\n",
128  elapsed() - t0);
129 
130  for (int i = 0; i < nq; i++) {
131  printf ("query %2d: ", i);
132  for (int j = 0; j < k; j++) {
133  printf ("%7ld ", nns[j + i * k]);
134  }
135  printf ("\n dis: ");
136  for (int j = 0; j < k; j++) {
137  printf ("%7g ", dis[j + i * k]);
138  }
139  printf ("\n");
140  }
141 
142  printf ("note that the nearest neighbor is not at "
143  "distance 0 due to quantization errors\n");
144  }
145 
146  return 0;
147 }