Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_ivfpq_indexing.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved
10 
11 
12 #include <cmath>
13 #include <cstdio>
14 #include <cstdlib>
15 
16 #include <sys/time.h>
17 
18 
19 #include "../IndexIVFPQ.h"
20 #include "../IndexFlat.h"
21 #include "../index_io.h"
22 
/// Current wall-clock time in seconds, with the microsecond part as a
/// fraction, as reported by gettimeofday(). Callers subtract two readings
/// to obtain a duration.
double elapsed ()
{
    struct timeval now;
    gettimeofday (&now, nullptr);

    const double whole_seconds = now.tv_sec;
    const double fraction = now.tv_usec * 1e-6;
    return whole_seconds + fraction;
}
29 
30 
31 int main ()
32 {
33 
34  double t0 = elapsed();
35 
36  // dimension of the vectors to index
37  int d = 128;
38 
39  // size of the database we plan to index
40  size_t nb = 200 * 1000;
41 
42  // make a set of nt training vectors in the unit cube
43  // (could be the database)
44  size_t nt = 100 * 1000;
45 
46  // make the index object and train it
47  faiss::IndexFlatL2 coarse_quantizer (d);
48 
49  // a reasonable number of centroids to index nb vectors
50  int ncentroids = int (4 * sqrt (nb));
51 
52  // the coarse quantizer should not be dealloced before the index
53  // 4 = nb of bytes per code (d must be a multiple of this)
54  // 8 = nb of bits per sub-code (almost always 8)
55  faiss::IndexIVFPQ index (&coarse_quantizer, d,
56  ncentroids, 4, 8);
57 
58 
59  { // training
60  printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
61  elapsed() - t0, nt, d);
62 
63  std::vector <float> trainvecs (nt * d);
64  for (size_t i = 0; i < nt * d; i++) {
65  trainvecs[i] = drand48();
66  }
67 
68  printf ("[%.3f s] Training the index\n",
69  elapsed() - t0);
70  index.verbose = true;
71 
72  index.train (nt, trainvecs.data());
73  }
74 
75  { // I/O demo
76  const char *outfilename = "/tmp/index_trained.faissindex";
77  printf ("[%.3f s] storing the pre-trained index to %s\n",
78  elapsed() - t0, outfilename);
79 
80  write_index (&index, outfilename);
81  }
82 
83  size_t nq;
84  std::vector<float> queries;
85 
86  { // populating the database
87  printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
88  elapsed() - t0, nb);
89 
90  std::vector <float> database (nb * d);
91  for (size_t i = 0; i < nb * d; i++) {
92  database[i] = drand48();
93  }
94 
95  printf ("[%.3f s] Adding the vectors to the index\n",
96  elapsed() - t0);
97 
98  index.add (nb, database.data());
99 
100  printf ("[%.3f s] imbalance factor: %g\n",
101  elapsed() - t0, index.imbalance_factor ());
102 
103  // remember a few elements from the database as queries
104  int i0 = 1234;
105  int i1 = 1243;
106 
107  nq = i1 - i0;
108  queries.resize (nq * d);
109  for (int i = i0; i < i1; i++) {
110  for (int j = 0; j < d; j++) {
111  queries [(i - i0) * d + j] = database [i * d + j];
112  }
113  }
114 
115  }
116 
117  { // searching the database
118  int k = 5;
119  printf ("[%.3f s] Searching the %d nearest neighbors "
120  "of %ld vectors in the index\n",
121  elapsed() - t0, k, nq);
122 
123  std::vector<faiss::Index::idx_t> nns (k * nq);
124  std::vector<float> dis (k * nq);
125 
126  index.search (nq, queries.data(), k, dis.data(), nns.data());
127 
128  printf ("[%.3f s] Query results (vector ids, then distances):\n",
129  elapsed() - t0);
130 
131  for (int i = 0; i < nq; i++) {
132  printf ("query %2d: ", i);
133  for (int j = 0; j < k; j++) {
134  printf ("%7ld ", nns[j + i * k]);
135  }
136  printf ("\n dis: ");
137  for (int j = 0; j < k; j++) {
138  printf ("%7g ", dis[j + i * k]);
139  }
140  printf ("\n");
141  }
142 
143  printf ("note that the nearest neighbor is not at "
144  "distance 0 due to quantization errors\n");
145  }
146 
147  return 0;
148 }