Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_imi_flat.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 
10 
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <sys/time.h>


#include <faiss/IndexPQ.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexFlat.h>
#include <faiss/index_io.h>
22 
/// Current wall-clock time in seconds, with microsecond resolution, as
/// reported by gettimeofday(). Callers subtract two readings to obtain a
/// duration.
double elapsed ()
{
    timeval now;
    gettimeofday (&now, nullptr);

    const double whole_seconds = static_cast<double> (now.tv_sec);
    const double fractional_seconds = static_cast<double> (now.tv_usec) * 1e-6;
    return whole_seconds + fractional_seconds;
}
29 
30 
31 int main ()
32 {
33  double t0 = elapsed();
34 
35  // dimension of the vectors to index
36  int d = 128;
37 
38  // size of the database we plan to index
39  size_t nb = 1000 * 1000;
40 
41  // make a set of nt training vectors in the unit cube
42  // (could be the database)
43  size_t nt = 100 * 1000;
44 
45  //---------------------------------------------------------------
46  // Define the core quantizer
47  // We choose a multiple inverted index for faster training with less data
48  // and because it usually offers best accuracy/speed trade-offs
49  //
50  // We here assume that its lifespan of this coarse quantizer will cover the
51  // lifespan of the inverted-file quantizer IndexIVFFlat below
52  // With dynamic allocation, one may give the responsability to free the
53  // quantizer to the inverted-file index (with attribute do_delete_quantizer)
54  //
55  // Note: a regular clustering algorithm would be defined as:
56  // faiss::IndexFlatL2 coarse_quantizer (d);
57  //
58  // Use nhash=2 subquantizers used to define the product coarse quantizer
59  // Number of bits: we will have 2^nbits_coarse centroids per subquantizer
60  // meaning (2^12)^nhash distinct inverted lists
61  size_t nhash = 2;
62  size_t nbits_subq = int (log2 (nb+1) / 2); // good choice in general
63  size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
64 
65  faiss::MultiIndexQuantizer coarse_quantizer (d, nhash, nbits_subq);
66 
67  printf ("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
68  nhash, nbits_subq, ncentroids, nb);
69 
70  // the coarse quantizer should not be dealloced before the index
71  // 4 = nb of bytes per code (d must be a multiple of this)
72  // 8 = nb of bits per sub-code (almost always 8)
73  faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
74  faiss::IndexIVFFlat index (&coarse_quantizer, d, ncentroids, metric);
75  index.quantizer_trains_alone = true;
76 
77  // define the number of probes. 2048 is for high-dim, overkilled in practice
78  // Use 4-1024 depending on the trade-off speed accuracy that you want
79  index.nprobe = 2048;
80 
81 
82  { // training
83  printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
84  elapsed() - t0, nt, d);
85 
86  std::vector <float> trainvecs (nt * d);
87  for (size_t i = 0; i < nt * d; i++) {
88  trainvecs[i] = drand48();
89  }
90 
91  printf ("[%.3f s] Training the index\n", elapsed() - t0);
92  index.verbose = true;
93  index.train (nt, trainvecs.data());
94  }
95 
96  size_t nq;
97  std::vector<float> queries;
98 
99  { // populating the database
100  printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
101  elapsed() - t0, nb);
102 
103  std::vector <float> database (nb * d);
104  for (size_t i = 0; i < nb * d; i++) {
105  database[i] = drand48();
106  }
107 
108  printf ("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
109 
110  index.add (nb, database.data());
111 
112  // remember a few elements from the database as queries
113  int i0 = 1234;
114  int i1 = 1244;
115 
116  nq = i1 - i0;
117  queries.resize (nq * d);
118  for (int i = i0; i < i1; i++) {
119  for (int j = 0; j < d; j++) {
120  queries [(i - i0) * d + j] = database [i * d + j];
121  }
122  }
123  }
124 
125  { // searching the database
126  int k = 5;
127  printf ("[%.3f s] Searching the %d nearest neighbors "
128  "of %ld vectors in the index\n",
129  elapsed() - t0, k, nq);
130 
131  std::vector<faiss::Index::idx_t> nns (k * nq);
132  std::vector<float> dis (k * nq);
133 
134  index.search (nq, queries.data(), k, dis.data(), nns.data());
135 
136  printf ("[%.3f s] Query results (vector ids, then distances):\n",
137  elapsed() - t0);
138 
139  for (int i = 0; i < nq; i++) {
140  printf ("query %2d: ", i);
141  for (int j = 0; j < k; j++) {
142  printf ("%7ld ", nns[j + i * k]);
143  }
144  printf ("\n dis: ");
145  for (int j = 0; j < k; j++) {
146  printf ("%7g ", dis[j + i * k]);
147  }
148  printf ("\n");
149  }
150  }
151  return 0;
152 }
MetricType
Some algorithms support both an inner product version and an L2 search version.
Definition: Index.h:45