Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_sift1M.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 
10 
11 #include <cmath>
12 #include <cstdio>
13 #include <cstdlib>
14 #include <cassert>
15 #include <cstring>
16 
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <unistd.h>
20 
21 #include <sys/time.h>
22 
23 #include "../AutoTune.h"
24 
25 
26 /**
27  * To run this demo, please download the ANN_SIFT1M dataset from
28  *
29  * http://corpus-texmex.irisa.fr/
30  *
31  * and unzip it to the subdirectory sift1M.
32  **/
33 
34 /*****************************************************
35  * I/O functions for fvecs and ivecs
36  *****************************************************/
37 
38 
/**
 * Load an entire .fvecs file into memory.
 *
 * As read here, the file is a sequence of fixed-size records: a 4-byte int
 * holding the dimension d, followed by d 4-byte components. The dimension of
 * the first record is used for the whole file.
 *
 * @param fname  path of the file to read
 * @param d_out  receives the vector dimension
 * @param n_out  receives the number of vectors in the file
 * @return       array allocated with new[] whose first n * d entries hold the
 *               vectors in row-major order; the caller frees it with delete[]
 *
 * Every failure (cannot open, short read, implausible dimension, file size
 * that is not a multiple of the record size) prints a message to stderr and
 * calls abort(). These checks are always active: they do not rely on assert,
 * so they are not compiled out when NDEBUG is defined.
 */
float * fvecs_read (const char *fname,
                    size_t *d_out, size_t *n_out)
{
    // the record layout below is expressed in 4-byte units
    static_assert(sizeof(int) == 4 && sizeof(float) == 4,
                  "fvecs_read expects 4-byte int and float");

    // binary mode: the payload must never go through text translation
    FILE *f = fopen(fname, "rb");
    if(!f) {
        fprintf(stderr, "could not open %s\n", fname);
        perror("");
        abort();
    }

    auto fail = [fname](const char *what) {
        fprintf(stderr, "%s: %s\n", fname, what);
        abort();
    };

    // the first record header gives the dimension for the whole file
    int d = 0;
    if (fread(&d, sizeof(int), 1, f) != 1)
        fail("could not read dimension header");
    if (!(d > 0 && d < 1000000))
        fail("unreasonable dimension");
    if (fseek(f, 0, SEEK_SET) != 0)
        fail("could not rewind file");

    struct stat st;
    if (fstat(fileno(f), &st) != 0)
        fail("could not stat file");

    // sizes are computed in size_t so the products cannot overflow int
    const size_t sz = st.st_size;
    const size_t row = size_t(d) + 1;       // header + d components
    const size_t row_bytes = row * 4;
    if (sz % row_bytes != 0)
        fail("weird file size");
    const size_t n = sz / row_bytes;

    *d_out = d; *n_out = n;
    float *x = new float[n * row];
    if (fread(x, sizeof(float), n * row, f) != n * row)
        fail("could not read whole file");

    // shift array to remove row headers; source and destination overlap,
    // hence memmove rather than memcpy
    for(size_t i = 0; i < n; i++)
        memmove(x + i * d, x + 1 + i * row, d * sizeof(*x));

    fclose(f);
    return x;
}
70 
71 // not very clean, but works as long as sizeof(int) == sizeof(float)
72 int *ivecs_read(const char *fname, size_t *d_out, size_t *n_out)
73 {
74  return (int*)fvecs_read(fname, d_out, n_out);
75 }
76 
/// Current time of day from gettimeofday(), in seconds, with the microsecond
/// field folded in as the fractional part.
double elapsed ()
{
    timeval now;
    gettimeofday (&now, nullptr);

    const double whole_seconds = now.tv_sec;
    const double fraction = now.tv_usec * 1e-6;
    return whole_seconds + fraction;
}
83 
84 
85 
86 int main()
87 {
88  double t0 = elapsed();
89 
90  // this is typically the fastest one.
91  const char *index_key = "IVF4096,Flat";
92 
93  // these ones have better memory usage
94  // const char *index_key = "Flat";
95  // const char *index_key = "PQ32";
96  // const char *index_key = "PCA80,Flat";
97  // const char *index_key = "IVF4096,PQ8+16";
98  // const char *index_key = "IVF4096,PQ32";
99  // const char *index_key = "IMI2x8,PQ32";
100  // const char *index_key = "IMI2x8,PQ8+16";
101  // const char *index_key = "OPQ16_64,IMI2x8,PQ8+16";
102 
103  faiss::Index * index;
104 
105  size_t d;
106 
107  {
108  printf ("[%.3f s] Loading train set\n", elapsed() - t0);
109 
110  size_t nt;
111  float *xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
112 
113  printf ("[%.3f s] Preparing index \"%s\" d=%ld\n",
114  elapsed() - t0, index_key, d);
115  index = faiss::index_factory(d, index_key);
116 
117  printf ("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt);
118 
119  index->train(nt, xt);
120  delete [] xt;
121  }
122 
123 
124  {
125  printf ("[%.3f s] Loading database\n", elapsed() - t0);
126 
127  size_t nb, d2;
128  float *xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb);
129  assert(d == d2 || !"dataset does not have same dimension as train set");
130 
131  printf ("[%.3f s] Indexing database, size %ld*%ld\n",
132  elapsed() - t0, nb, d);
133 
134  index->add(nb, xb);
135 
136  delete [] xb;
137  }
138 
139  size_t nq;
140  float *xq;
141 
142  {
143  printf ("[%.3f s] Loading queries\n", elapsed() - t0);
144 
145  size_t d2;
146  xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq);
147  assert(d == d2 || !"query does not have same dimension as train set");
148 
149  }
150 
151  size_t k; // nb of results per query in the GT
152  faiss::Index::idx_t *gt; // nq * k matrix of ground-truth nearest-neighbors
153 
154  {
155  printf ("[%.3f s] Loading ground truth for %ld queries\n",
156  elapsed() - t0, nq);
157 
158  // load ground-truth and convert int to long
159  size_t nq2;
160  int *gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2);
161  assert(nq2 == nq || !"incorrect nb of ground truth entries");
162 
163  gt = new faiss::Index::idx_t[k * nq];
164  for(int i = 0; i < k * nq; i++) {
165  gt[i] = gt_int[i];
166  }
167  delete [] gt_int;
168  }
169 
170  // Result of the auto-tuning
171  std::string selected_params;
172 
173  { // run auto-tuning
174 
175  printf ("[%.3f s] Preparing auto-tune criterion 1-recall at 1 "
176  "criterion, with k=%ld nq=%ld\n", elapsed() - t0, k, nq);
177 
178  faiss::OneRecallAtRCriterion crit(nq, 1);
179  crit.set_groundtruth (k, nullptr, gt);
180  crit.nnn = k; // by default, the criterion will request only 1 NN
181 
182  printf ("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0);
183 
184  faiss::ParameterSpace params;
185  params.initialize(index);
186 
187  printf ("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n",
188  elapsed() - t0, params.parameter_ranges.size(),
189  params.n_combinations());
190 
192  params.explore (index, nq, xq, crit, &ops);
193 
194  printf ("[%.3f s] Found the following operating points: \n",
195  elapsed() - t0);
196 
197  ops.display ();
198 
199  // keep the first parameter that obtains > 0.5 1-recall@1
200  for (int i = 0; i < ops.optimal_pts.size(); i++) {
201  if (ops.optimal_pts[i].perf > 0.5) {
202  selected_params = ops.optimal_pts[i].key;
203  break;
204  }
205  }
206  assert (selected_params.size() >= 0 ||
207  !"could not find good enough op point");
208  }
209 
210 
211  { // Use the found configuration to perform a search
212 
213  faiss::ParameterSpace params;
214 
215  printf ("[%.3f s] Setting parameter configuration \"%s\" on index\n",
216  elapsed() - t0, selected_params.c_str());
217 
218  params.set_index_parameters (index, selected_params.c_str());
219 
220  printf ("[%.3f s] Perform a search on %ld queries\n",
221  elapsed() - t0, nq);
222 
223  // output buffers
224  faiss::Index::idx_t *I = new faiss::Index::idx_t[nq * k];
225  float *D = new float[nq * k];
226 
227  index->search(nq, xq, k, D, I);
228 
229  printf ("[%.3f s] Compute recalls\n", elapsed() - t0);
230 
231  // evaluate result by hand.
232  int n_1 = 0, n_10 = 0, n_100 = 0;
233  for(int i = 0; i < nq; i++) {
234  int gt_nn = gt[i * k];
235  for(int j = 0; j < k; j++) {
236  if (I[i * k + j] == gt_nn) {
237  if(j < 1) n_1++;
238  if(j < 10) n_10++;
239  if(j < 100) n_100++;
240  }
241  }
242  }
243  printf("R@1 = %.4f\n", n_1 / float(nq));
244  printf("R@10 = %.4f\n", n_10 / float(nq));
245  printf("R@100 = %.4f\n", n_100 / float(nq));
246 
247  }
248 
249  delete [] xq;
250  delete [] gt;
251  delete index;
252  return 0;
253 }
void explore(Index *index, size_t nq, const float *xq, const AutoTuneCriterion &crit, OperatingPoints *ops) const
Definition: AutoTune.cpp:578
std::vector< ParameterRange > parameter_ranges
all tunable parameters
Definition: AutoTune.h:134
virtual void train(idx_t n, const float *x)
Definition: Index.cpp:24
virtual void add(idx_t n, const float *x)=0
long idx_t
all indices are this type
Definition: Index.h:64
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
virtual void initialize(const Index *index)
initialize with reasonable parameters for the index
Definition: AutoTune.cpp:346
size_t n_combinations() const
nb of combinations, = product of values sizes
Definition: AutoTune.cpp:278
void set_index_parameters(Index *index, size_t cno) const
set a combination of parameters on an index
Definition: AutoTune.cpp:419
Index * index_factory(int d, const char *description_in, MetricType metric)
Definition: AutoTune.cpp:722