Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_imi_flat.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved
10 
11 
12 #include <cmath>
13 #include <cstdio>
14 #include <cstdlib>
15 
16 #include <sys/time.h>
17 
18 
19 #include <faiss/IndexPQ.h>
20 #include <faiss/IndexIVF.h>
21 #include <faiss/IndexFlat.h>
22 #include <faiss/index_io.h>
23 
/**
 * Current wall-clock time, in seconds, as reported by gettimeofday().
 *
 * The value combines the whole-second and microsecond fields of the
 * timeval into a single double. Callers in this file only ever use the
 * difference between two readings to timestamp progress messages.
 *
 * @return seconds (with a fractional microsecond part) since the Epoch
 */
double elapsed ()
{
    timeval now;
    gettimeofday (&now, nullptr);

    const double whole_seconds = static_cast<double> (now.tv_sec);
    const double fractional_seconds = static_cast<double> (now.tv_usec) * 1e-6;
    return whole_seconds + fractional_seconds;
}
30 
31 
32 int main ()
33 {
34  double t0 = elapsed();
35 
36  // dimension of the vectors to index
37  int d = 128;
38 
39  // size of the database we plan to index
40  size_t nb = 1000 * 1000;
41 
42  // make a set of nt training vectors in the unit cube
43  // (could be the database)
44  size_t nt = 100 * 1000;
45 
46  //---------------------------------------------------------------
47  // Define the core quantizer
48  // We choose a multiple inverted index for faster training with less data
49  // and because it usually offers best accuracy/speed trade-offs
50  //
51  // We here assume that its lifespan of this coarse quantizer will cover the
52  // lifespan of the inverted-file quantizer IndexIVFFlat below
53  // With dynamic allocation, one may give the responsability to free the
54  // quantizer to the inverted-file index (with attribute do_delete_quantizer)
55  //
56  // Note: a regular clustering algorithm would be defined as:
57  // faiss::IndexFlatL2 coarse_quantizer (d);
58  //
59  // Use nhash=2 subquantizers used to define the product coarse quantizer
60  // Number of bits: we will have 2^nbits_coarse centroids per subquantizer
61  // meaning (2^12)^nhash distinct inverted lists
62  size_t nhash = 2;
63  size_t nbits_subq = int (log2 (nb+1) / 2); // good choice in general
64  size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
65 
66  faiss::MultiIndexQuantizer coarse_quantizer (d, nhash, nbits_subq);
67 
68  printf ("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
69  nhash, nbits_subq, ncentroids, nb);
70 
71  // the coarse quantizer should not be dealloced before the index
72  // 4 = nb of bytes per code (d must be a multiple of this)
73  // 8 = nb of bits per sub-code (almost always 8)
74  faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
75  faiss::IndexIVFFlat index (&coarse_quantizer, d, ncentroids, metric);
76  index.quantizer_trains_alone = true;
77 
78  // define the number of probes. 2048 is for high-dim, overkilled in practice
79  // Use 4-1024 depending on the trade-off speed accuracy that you want
80  index.nprobe = 2048;
81 
82 
83  { // training
84  printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
85  elapsed() - t0, nt, d);
86 
87  std::vector <float> trainvecs (nt * d);
88  for (size_t i = 0; i < nt * d; i++) {
89  trainvecs[i] = drand48();
90  }
91 
92  printf ("[%.3f s] Training the index\n", elapsed() - t0);
93  index.verbose = true;
94  index.train (nt, trainvecs.data());
95  }
96 
97  size_t nq;
98  std::vector<float> queries;
99 
100  { // populating the database
101  printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
102  elapsed() - t0, nb);
103 
104  std::vector <float> database (nb * d);
105  for (size_t i = 0; i < nb * d; i++) {
106  database[i] = drand48();
107  }
108 
109  printf ("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
110 
111  index.add (nb, database.data());
112 
113  // remember a few elements from the database as queries
114  int i0 = 1234;
115  int i1 = 1244;
116 
117  nq = i1 - i0;
118  queries.resize (nq * d);
119  for (int i = i0; i < i1; i++) {
120  for (int j = 0; j < d; j++) {
121  queries [(i - i0) * d + j] = database [i * d + j];
122  }
123  }
124  }
125 
126  { // searching the database
127  int k = 5;
128  printf ("[%.3f s] Searching the %d nearest neighbors "
129  "of %ld vectors in the index\n",
130  elapsed() - t0, k, nq);
131 
132  std::vector<faiss::Index::idx_t> nns (k * nq);
133  std::vector<float> dis (k * nq);
134 
135  index.search (nq, queries.data(), k, dis.data(), nns.data());
136 
137  printf ("[%.3f s] Query results (vector ids, then distances):\n",
138  elapsed() - t0);
139 
140  for (int i = 0; i < nq; i++) {
141  printf ("query %2d: ", i);
142  for (int j = 0; j < k; j++) {
143  printf ("%7ld ", nns[j + i * k]);
144  }
145  printf ("\n dis: ");
146  for (int j = 0; j < k; j++) {
147  printf ("%7g ", dis[j + i * k]);
148  }
149  printf ("\n");
150  }
151  }
152  return 0;
153 }
MetricType
Some algorithms support both an inner product version and an L2 search version.
Definition: Index.h:43