Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_ivfpq_indexing.cpp
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved
11 
12 
13 #include <cmath>
14 #include <cstdio>
15 #include <cstdlib>
16 
17 #include <sys/time.h>
18 
19 
20 #include "../IndexIVFPQ.h"
21 #include "../IndexFlat.h"
22 #include "../index_io.h"
23 
/**
 * Current wall-clock time in seconds, with microsecond resolution.
 *
 * The value is whatever gettimeofday() reports (seconds plus
 * microseconds folded into a double). It is only used in this demo to
 * compute differences against an earlier reading.
 *
 * @return tv_sec + tv_usec * 1e-6 as reported by gettimeofday()
 */
double elapsed ()
{
    struct timeval now;
    // second argument (timezone) is unused
    gettimeofday (&now, nullptr);
    return now.tv_sec + now.tv_usec * 1e-6;
}
30 
31 
32 int main ()
33 {
34 
35  double t0 = elapsed();
36 
37  // dimension of the vectors to index
38  int d = 128;
39 
40  // size of the database we plan to index
41  size_t nb = 200 * 1000;
42 
43  // make a set of nt training vectors in the unit cube
44  // (could be the database)
45  size_t nt = 100 * 1000;
46 
47  // make the index object and train it
48  faiss::IndexFlatL2 coarse_quantizer (d);
49 
50  // a reasonable number of centroids to index nb vectors
51  int ncentroids = int (4 * sqrt (nb));
52 
53  // the coarse quantizer should not be dealloced before the index
54  // 4 = nb of bytes per code (d must be a multiple of this)
55  // 8 = nb of bits per sub-code (almost always 8)
56  faiss::IndexIVFPQ index (&coarse_quantizer, d,
57  ncentroids, 4, 8);
58 
59 
60  { // training
61  printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
62  elapsed() - t0, nt, d);
63 
64  std::vector <float> trainvecs (nt * d);
65  for (size_t i = 0; i < nt * d; i++) {
66  trainvecs[i] = drand48();
67  }
68 
69  printf ("[%.3f s] Training the index\n",
70  elapsed() - t0);
71  index.verbose = true;
72 
73  index.train (nt, trainvecs.data());
74  }
75 
76  { // I/O demo
77  const char *outfilename = "/tmp/index_trained.faissindex";
78  printf ("[%.3f s] storing the pre-trained index to %s\n",
79  elapsed() - t0, outfilename);
80 
81  write_index (&index, outfilename);
82  }
83 
84  size_t nq;
85  std::vector<float> queries;
86 
87  { // populating the database
88  printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
89  elapsed() - t0, nb);
90 
91  std::vector <float> database (nb * d);
92  for (size_t i = 0; i < nb * d; i++) {
93  database[i] = drand48();
94  }
95 
96  printf ("[%.3f s] Adding the vectors to the index\n",
97  elapsed() - t0);
98 
99  index.add (nb, database.data());
100 
101  printf ("[%.3f s] imbalance factor: %g\n",
102  elapsed() - t0, index.imbalance_factor ());
103 
104  // remember a few elements from the database as queries
105  int i0 = 1234;
106  int i1 = 1243;
107 
108  nq = i1 - i0;
109  queries.resize (nq * d);
110  for (int i = i0; i < i1; i++) {
111  for (int j = 0; j < d; j++) {
112  queries [(i - i0) * d + j] = database [i * d + j];
113  }
114  }
115 
116  }
117 
118  { // searching the database
119  int k = 5;
120  printf ("[%.3f s] Searching the %d nearest neighbors "
121  "of %ld vectors in the index\n",
122  elapsed() - t0, k, nq);
123 
124  std::vector<faiss::Index::idx_t> nns (k * nq);
125  std::vector<float> dis (k * nq);
126 
127  index.search (nq, queries.data(), k, dis.data(), nns.data());
128 
129  printf ("[%.3f s] Query results (vector ids, then distances):\n",
130  elapsed() - t0);
131 
132  for (int i = 0; i < nq; i++) {
133  printf ("query %2d: ", i);
134  for (int j = 0; j < k; j++) {
135  printf ("%7ld ", nns[j + i * k]);
136  }
137  printf ("\n dis: ");
138  for (int j = 0; j < k; j++) {
139  printf ("%7g ", dis[j + i * k]);
140  }
141  printf ("\n");
142  }
143 
144  printf ("note that the nearest neighbor is not at "
145  "distance 0 due to quantization errors\n");
146  }
147 
148  return 0;
149 }