Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_sift1M.cpp
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10  // Copyright 2004-present Facebook. All Rights Reserved
11 
12 
13 #include <cmath>
14 #include <cstdio>
15 #include <cstdlib>
16 #include <cassert>
17 #include <cstring>
18 
19 #include <sys/types.h>
20 #include <sys/stat.h>
21 #include <unistd.h>
22 
23 #include <sys/time.h>
24 
25 #include "../AutoTune.h"
26 
27 
28 /**
29  * To run this demo, please download the ANN_SIFT1M dataset from
30  *
31  * http://corpus-texmex.irisa.fr/
32  *
33  * and unzip it to the subdirectory sift1M.
34  **/
35 
36 /*****************************************************
37  * I/O functions for fvecs and ivecs
38  *****************************************************/
39 
40 
/**
 * Load a whole .fvecs file into memory.
 *
 * The file is treated as n rows of (d + 1) 4-byte words: one header word
 * followed by d floats. The dimension d is taken from the header of the
 * first row, and n is deduced from the file size.
 *
 * @param fname  path of the file to read
 * @param d_out  receives the dimension d
 * @param n_out  receives the number of rows n
 * @return       array allocated with new[]; its first n * d floats are the
 *               rows packed back to back (headers removed). The caller
 *               releases it with delete [].
 *
 * Every failure (cannot open, short read, implausible dimension, file size
 * that is not a multiple of the row size) prints a message on stderr and
 * aborts. The checks are explicit rather than assert()-based so that they
 * are still performed when the file is compiled with NDEBUG.
 */
float * fvecs_read (const char *fname,
                    size_t *d_out, size_t *n_out)
{
    static_assert(sizeof(int) == 4 && sizeof(float) == 4,
                  "fvecs rows are made of 4-byte words");

    // "rb": the content is binary, never let the runtime translate it
    FILE *f = fopen(fname, "rb");
    if(!f) {
        fprintf(stderr, "could not open %s\n", fname);
        perror("");
        abort();
    }

    // common exit path for all validation failures below
    auto die = [f, fname](const char *what) {
        fprintf(stderr, "%s: %s\n", fname, what);
        fclose(f);
        abort();
    };

    // the header of the first row gives the dimension
    int d = 0;
    if (fread(&d, sizeof(d), 1, f) != 1)
        die("could not read dimension");
    if (!(d > 0 && d < 1000000))
        die("unreasonable dimension");

    // rewind: the whole file, headers included, is read in one go below
    if (fseek(f, 0, SEEK_SET) != 0)
        die("could not rewind file");

    struct stat st;
    if (fstat(fileno(f), &st) != 0)
        die("could not stat file");

    const size_t sz = st.st_size;
    const size_t row_words = size_t(d) + 1;      // header + d components
    const size_t row_bytes = row_words * sizeof(float);
    if (sz % row_bytes != 0)
        die("weird file size");
    const size_t n = sz / row_bytes;

    *d_out = d; *n_out = n;
    float *x = new float[n * row_words];
    const size_t nr = fread(x, sizeof(float), n * row_words, f);
    if (nr != n * row_words) {
        delete [] x;
        die("could not read whole file");
    }

    // shift array to remove row headers (ranges may overlap, hence memmove)
    for(size_t i = 0; i < n; i++)
        memmove(x + i * d, x + 1 + i * row_words, d * sizeof(*x));

    fclose(f);
    return x;
}
72 
73 // not very clean, but works as long as sizeof(int) == sizeof(float)
74 int *ivecs_read(const char *fname, size_t *d_out, size_t *n_out)
75 {
76  return (int*)fvecs_read(fname, d_out, n_out);
77 }
78 
/// Current time of day from gettimeofday(), expressed in seconds as a
/// double (whole seconds plus the microsecond part scaled by 1e-6).
double elapsed ()
{
    struct timeval now;
    gettimeofday (&now, nullptr);

    const double whole_seconds = now.tv_sec;
    const double fraction = now.tv_usec * 1e-6;
    return whole_seconds + fraction;
}
85 
86 
87 
88 int main()
89 {
90  double t0 = elapsed();
91 
92  // this is typically the fastest one.
93  const char *index_key = "IVF4096,Flat";
94 
95  // these ones have better memory usage
96  // const char *index_key = "Flat";
97  // const char *index_key = "PQ32";
98  // const char *index_key = "PCA80,Flat";
99  // const char *index_key = "IVF4096,PQ8+16";
100  // const char *index_key = "IVF4096,PQ32";
101  // const char *index_key = "IMI2x8,PQ32";
102  // const char *index_key = "IMI2x8,PQ8+16";
103  // const char *index_key = "OPQ16_64,IMI2x8,PQ8+16";
104 
105  faiss::Index * index;
106 
107  size_t d;
108 
109  {
110  printf ("[%.3f s] Loading train set\n", elapsed() - t0);
111 
112  size_t nt;
113  float *xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
114 
115  printf ("[%.3f s] Preparing index \"%s\" d=%ld\n",
116  elapsed() - t0, index_key, d);
117  index = faiss::index_factory(d, index_key);
118 
119  printf ("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt);
120 
121  index->train(nt, xt);
122  delete [] xt;
123  }
124 
125 
126  {
127  printf ("[%.3f s] Loading database\n", elapsed() - t0);
128 
129  size_t nb, d2;
130  float *xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb);
131  assert(d == d2 || !"dataset does not have same dimension as train set");
132 
133  printf ("[%.3f s] Indexing database, size %ld*%ld\n",
134  elapsed() - t0, nb, d);
135 
136  index->add(nb, xb);
137 
138  delete [] xb;
139  }
140 
141  size_t nq;
142  float *xq;
143 
144  {
145  printf ("[%.3f s] Loading queries\n", elapsed() - t0);
146 
147  size_t d2;
148  xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq);
149  assert(d == d2 || !"query does not have same dimension as train set");
150 
151  }
152 
153  size_t k; // nb of results per query in the GT
154  faiss::Index::idx_t *gt; // nq * k matrix of ground-truth nearest-neighbors
155 
156  {
157  printf ("[%.3f s] Loading ground truth for %ld queries\n",
158  elapsed() - t0, nq);
159 
160  // load ground-truth and convert int to long
161  size_t nq2;
162  int *gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2);
163  assert(nq2 == nq || !"incorrect nb of ground truth entries");
164 
165  gt = new faiss::Index::idx_t[k * nq];
166  for(int i = 0; i < k * nq; i++) {
167  gt[i] = gt_int[i];
168  }
169  delete [] gt_int;
170  }
171 
172  // Result of the auto-tuning
173  std::string selected_params;
174 
175  { // run auto-tuning
176 
177  printf ("[%.3f s] Preparing auto-tune criterion 1-recall at 1 "
178  "criterion, with k=%ld nq=%ld\n", elapsed() - t0, k, nq);
179 
180  faiss::OneRecallAtRCriterion crit(nq, 1);
181  crit.set_groundtruth (k, nullptr, gt);
182  crit.nnn = k; // by default, the criterion will request only 1 NN
183 
184  printf ("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0);
185 
186  faiss::ParameterSpace params;
187  params.initialize(index);
188 
189  printf ("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n",
190  elapsed() - t0, params.parameter_ranges.size(),
191  params.n_combinations());
192 
194  params.explore (index, nq, xq, crit, &ops);
195 
196  printf ("[%.3f s] Found the following operating points: \n",
197  elapsed() - t0);
198 
199  ops.display ();
200 
201  // keep the first parameter that obtains > 0.5 1-recall@1
202  for (int i = 0; i < ops.optimal_pts.size(); i++) {
203  if (ops.optimal_pts[i].perf > 0.5) {
204  selected_params = ops.optimal_pts[i].key;
205  break;
206  }
207  }
208  assert (selected_params.size() >= 0 ||
209  !"could not find good enough op point");
210  }
211 
212 
213  { // Use the found configuration to perform a search
214 
215  faiss::ParameterSpace params;
216 
217  printf ("[%.3f s] Setting parameter configuration \"%s\" on index\n",
218  elapsed() - t0, selected_params.c_str());
219 
220  params.set_index_parameters (index, selected_params.c_str());
221 
222  printf ("[%.3f s] Perform a search on %ld queries\n",
223  elapsed() - t0, nq);
224 
225  // output buffers
226  faiss::Index::idx_t *I = new faiss::Index::idx_t[nq * k];
227  float *D = new float[nq * k];
228 
229  index->search(nq, xq, k, D, I);
230 
231  printf ("[%.3f s] Compute recalls\n", elapsed() - t0);
232 
233  // evaluate result by hand.
234  int n_1 = 0, n_10 = 0, n_100 = 0;
235  for(int i = 0; i < nq; i++) {
236  int gt_nn = gt[i * k];
237  for(int j = 0; j < k; j++) {
238  if (I[i * k + j] == gt_nn) {
239  if(j < 1) n_1++;
240  if(j < 10) n_10++;
241  if(j < 100) n_100++;
242  }
243  }
244  }
245  printf("R@1 = %.4f\n", n_1 / float(nq));
246  printf("R@10 = %.4f\n", n_10 / float(nq));
247  printf("R@100 = %.4f\n", n_100 / float(nq));
248 
249  }
250 
251  delete [] xq;
252  delete [] gt;
253  delete index;
254  return 0;
255 }
void explore(Index *index, size_t nq, const float *xq, const AutoTuneCriterion &crit, OperatingPoints *ops) const
Definition: AutoTune.cpp:512
std::vector< ParameterRange > parameter_ranges
all tunable parameters
Definition: AutoTune.h:135
virtual void add(idx_t n, const float *x)=0
long idx_t
all indices are this type
Definition: Index.h:64
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
virtual void initialize(const Index *index)
initialize with reasonable parameters for the index
Definition: AutoTune.cpp:330
size_t n_combinations() const
nb of combinations, = product of values sizes
Definition: AutoTune.cpp:267
void set_index_parameters(Index *index, size_t cno) const
set a combination of parameters on an index
Definition: AutoTune.cpp:388
virtual void train(idx_t n, const float *x)
Definition: Index.h:92
Index * index_factory(int d, const char *description_in, MetricType metric)
Definition: AutoTune.cpp:623