Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
demo_sift1M.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9  // Copyright 2004-present Facebook. All Rights Reserved
10 
11 
12 #include <cmath>
13 #include <cstdio>
14 #include <cstdlib>
15 #include <cassert>
16 #include <cstring>
17 
18 #include <sys/types.h>
19 #include <sys/stat.h>
20 #include <unistd.h>
21 
22 #include <sys/time.h>
23 
24 #include "../AutoTune.h"
25 
26 
27 /**
28  * To run this demo, please download the ANN_SIFT1M dataset from
29  *
30  * http://corpus-texmex.irisa.fr/
31  *
32  * and unzip it to the subdirectory sift1M.
33  **/
34 
35 /*****************************************************
36  * I/O functions for fvecs and ivecs
37  *****************************************************/
38 
39 
/**
 * Load a whole .fvecs file into one contiguous float array.
 *
 * As read by this function, the file is a sequence of rows, each made of a
 * 4-byte int header holding the dimension followed by that many 4-byte
 * floats. The dimension is taken from the first row and the number of rows
 * is derived from the file size.
 *
 * @param fname  path of the file to read
 * @param d_out  receives the dimension of the vectors
 * @param n_out  receives the number of vectors
 * @return       array of n * d floats allocated with new[]; the caller
 *               releases it with delete[]
 *
 * Any failure (cannot open, short read, implausible dimension, file size
 * that is not a whole number of rows) prints a message to stderr and
 * aborts. These checks are always active, also when NDEBUG is defined.
 */
float * fvecs_read (const char *fname,
                    size_t *d_out, size_t *n_out)
{
    // the row header (an int) is read into the float buffer below
    static_assert(sizeof(int) == 4 && sizeof(float) == 4,
                  "fvecs_read expects 4-byte int and float");

    // binary mode: the content is raw data, not text
    FILE *f = fopen(fname, "rb");
    if(!f) {
        fprintf(stderr, "could not open %s\n", fname);
        perror("");
        abort();
    }

    // peek at the first row header to learn the dimension
    int d;
    if(fread(&d, sizeof(int), 1, f) != 1) {
        fprintf(stderr, "%s: could not read dimension header\n", fname);
        abort();
    }
    if(!(d > 0 && d < 1000000)) {
        fprintf(stderr, "%s: unreasonable dimension %d\n", fname, d);
        abort();
    }
    fseek(f, 0, SEEK_SET);

    struct stat st;
    if(fstat(fileno(f), &st) != 0) {
        fprintf(stderr, "could not stat %s\n", fname);
        perror("");
        abort();
    }
    const size_t sz = static_cast<size_t>(st.st_size);

    // one row = header word + d values, all 4 bytes wide (size_t arithmetic)
    const size_t row_len = static_cast<size_t>(d) + 1;
    const size_t row_bytes = row_len * sizeof(float);
    if(sz % row_bytes != 0) {
        fprintf(stderr, "%s: weird file size\n", fname);
        abort();
    }
    const size_t n = sz / row_bytes;

    *d_out = d; *n_out = n;
    float *x = new float[n * row_len];
    const size_t nr = fread(x, sizeof(float), n * row_len, f);
    if(nr != n * row_len) {
        fprintf(stderr, "%s: could not read whole file\n", fname);
        abort();
    }

    // shift array to remove row headers; source and destination overlap,
    // hence memmove rather than memcpy
    for(size_t i = 0; i < n; i++)
        memmove(x + i * d, x + 1 + i * row_len, d * sizeof(*x));

    fclose(f);
    return x;
}
71 
72 // not very clean, but works as long as sizeof(int) == sizeof(float)
73 int *ivecs_read(const char *fname, size_t *d_out, size_t *n_out)
74 {
75  return (int*)fvecs_read(fname, d_out, n_out);
76 }
77 
/// Current time of day from gettimeofday, expressed in seconds
/// (whole seconds plus the microsecond part as a fraction).
double elapsed ()
{
    struct timeval now;
    gettimeofday (&now, nullptr);
    const double whole_seconds = now.tv_sec;
    const double fraction = now.tv_usec * 1e-6;
    return whole_seconds + fraction;
}
84 
85 
86 
87 int main()
88 {
89  double t0 = elapsed();
90 
91  // this is typically the fastest one.
92  const char *index_key = "IVF4096,Flat";
93 
94  // these ones have better memory usage
95  // const char *index_key = "Flat";
96  // const char *index_key = "PQ32";
97  // const char *index_key = "PCA80,Flat";
98  // const char *index_key = "IVF4096,PQ8+16";
99  // const char *index_key = "IVF4096,PQ32";
100  // const char *index_key = "IMI2x8,PQ32";
101  // const char *index_key = "IMI2x8,PQ8+16";
102  // const char *index_key = "OPQ16_64,IMI2x8,PQ8+16";
103 
104  faiss::Index * index;
105 
106  size_t d;
107 
108  {
109  printf ("[%.3f s] Loading train set\n", elapsed() - t0);
110 
111  size_t nt;
112  float *xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
113 
114  printf ("[%.3f s] Preparing index \"%s\" d=%ld\n",
115  elapsed() - t0, index_key, d);
116  index = faiss::index_factory(d, index_key);
117 
118  printf ("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt);
119 
120  index->train(nt, xt);
121  delete [] xt;
122  }
123 
124 
125  {
126  printf ("[%.3f s] Loading database\n", elapsed() - t0);
127 
128  size_t nb, d2;
129  float *xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb);
130  assert(d == d2 || !"dataset does not have same dimension as train set");
131 
132  printf ("[%.3f s] Indexing database, size %ld*%ld\n",
133  elapsed() - t0, nb, d);
134 
135  index->add(nb, xb);
136 
137  delete [] xb;
138  }
139 
140  size_t nq;
141  float *xq;
142 
143  {
144  printf ("[%.3f s] Loading queries\n", elapsed() - t0);
145 
146  size_t d2;
147  xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq);
148  assert(d == d2 || !"query does not have same dimension as train set");
149 
150  }
151 
152  size_t k; // nb of results per query in the GT
153  faiss::Index::idx_t *gt; // nq * k matrix of ground-truth nearest-neighbors
154 
155  {
156  printf ("[%.3f s] Loading ground truth for %ld queries\n",
157  elapsed() - t0, nq);
158 
159  // load ground-truth and convert int to long
160  size_t nq2;
161  int *gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2);
162  assert(nq2 == nq || !"incorrect nb of ground truth entries");
163 
164  gt = new faiss::Index::idx_t[k * nq];
165  for(int i = 0; i < k * nq; i++) {
166  gt[i] = gt_int[i];
167  }
168  delete [] gt_int;
169  }
170 
171  // Result of the auto-tuning
172  std::string selected_params;
173 
174  { // run auto-tuning
175 
176  printf ("[%.3f s] Preparing auto-tune criterion 1-recall at 1 "
177  "criterion, with k=%ld nq=%ld\n", elapsed() - t0, k, nq);
178 
179  faiss::OneRecallAtRCriterion crit(nq, 1);
180  crit.set_groundtruth (k, nullptr, gt);
181  crit.nnn = k; // by default, the criterion will request only 1 NN
182 
183  printf ("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0);
184 
185  faiss::ParameterSpace params;
186  params.initialize(index);
187 
188  printf ("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n",
189  elapsed() - t0, params.parameter_ranges.size(),
190  params.n_combinations());
191 
193  params.explore (index, nq, xq, crit, &ops);
194 
195  printf ("[%.3f s] Found the following operating points: \n",
196  elapsed() - t0);
197 
198  ops.display ();
199 
200  // keep the first parameter that obtains > 0.5 1-recall@1
201  for (int i = 0; i < ops.optimal_pts.size(); i++) {
202  if (ops.optimal_pts[i].perf > 0.5) {
203  selected_params = ops.optimal_pts[i].key;
204  break;
205  }
206  }
207  assert (selected_params.size() >= 0 ||
208  !"could not find good enough op point");
209  }
210 
211 
212  { // Use the found configuration to perform a search
213 
214  faiss::ParameterSpace params;
215 
216  printf ("[%.3f s] Setting parameter configuration \"%s\" on index\n",
217  elapsed() - t0, selected_params.c_str());
218 
219  params.set_index_parameters (index, selected_params.c_str());
220 
221  printf ("[%.3f s] Perform a search on %ld queries\n",
222  elapsed() - t0, nq);
223 
224  // output buffers
225  faiss::Index::idx_t *I = new faiss::Index::idx_t[nq * k];
226  float *D = new float[nq * k];
227 
228  index->search(nq, xq, k, D, I);
229 
230  printf ("[%.3f s] Compute recalls\n", elapsed() - t0);
231 
232  // evaluate result by hand.
233  int n_1 = 0, n_10 = 0, n_100 = 0;
234  for(int i = 0; i < nq; i++) {
235  int gt_nn = gt[i * k];
236  for(int j = 0; j < k; j++) {
237  if (I[i * k + j] == gt_nn) {
238  if(j < 1) n_1++;
239  if(j < 10) n_10++;
240  if(j < 100) n_100++;
241  }
242  }
243  }
244  printf("R@1 = %.4f\n", n_1 / float(nq));
245  printf("R@10 = %.4f\n", n_10 / float(nq));
246  printf("R@100 = %.4f\n", n_100 / float(nq));
247 
248  }
249 
250  delete [] xq;
251  delete [] gt;
252  delete index;
253  return 0;
254 }
void explore(Index *index, size_t nq, const float *xq, const AutoTuneCriterion &crit, OperatingPoints *ops) const
Definition: AutoTune.cpp:515
std::vector< ParameterRange > parameter_ranges
all tunable parameters
Definition: AutoTune.h:134
virtual void train(idx_t, const float *)
Definition: Index.h:89
virtual void add(idx_t n, const float *x)=0
long idx_t
all indices are this type
Definition: Index.h:62
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
virtual void initialize(const Index *index)
initialize with reasonable parameters for the index
Definition: AutoTune.cpp:331
size_t n_combinations() const
nb of combinations, = product of values sizes
Definition: AutoTune.cpp:268
void set_index_parameters(Index *index, size_t cno) const
set a combination of parameters on an index
Definition: AutoTune.cpp:389
Index * index_factory(int d, const char *description_in, MetricType metric)
Definition: AutoTune.cpp:639