Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/index_io.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #include "index_io.h"
12 
13 #include <cstdio>
14 #include <cstdlib>
15 
16 #include <sys/mman.h>
17 
18 #include "FaissAssert.h"
19 
20 #include "IndexFlat.h"
21 #include "VectorTransform.h"
22 #include "IndexLSH.h"
23 #include "IndexPQ.h"
24 #include "IndexIVF.h"
25 #include "IndexIVFPQ.h"
26 #include "MetaIndexes.h"
27 #include "IndexScalarQuantizer.h"
28 #include "IndexHNSW.h"
29 
30 /*************************************************************
31  * The I/O format is the content of the class. For objects that are
32  * inherited, like Index, a 4-character-code (fourcc) indicates which
33  * child class this is an instance of.
34  *
35  * In this case, the fields of the parent class are written first,
36  * then the ones for the child classes. Note that this requires
37  * classes to be serialized to have a constructor without parameters,
38  * so that the fields can be filled in later. The default constructor
39  * should set reasonable defaults for all fields.
40  *
41  * The fourccs are assigned arbitrarily. When the class changed (added
42  * or deprecated fields), the fourcc can be replaced. New code should
43  * be able to read the old fourcc and fill in new classes.
44  *
45  * TODO: serialization to strings for use in Python pickle or Torch
46  * serialization.
47  *
48  * TODO: in this file, the read functions that encounter errors may
49  * leak memory.
50  **************************************************************/
51 
52 
53 
54 namespace faiss {
55 
/// Pack a 4-character code into a 32-bit tag, with the first character
/// in the least-significant byte.
///
/// @param sx  pointer to at least 4 readable characters (must not be null)
/// @return    sx[0] | sx[1] << 8 | sx[2] << 16 | sx[3] << 24
///
/// Each byte is widened to uint32_t before shifting so that a byte with
/// its high bit set is never shifted into the sign bit of an int.
static uint32_t fourcc (const char sx[4]) {
    const unsigned char *x = reinterpret_cast<const unsigned char *> (sx);
    return  static_cast<uint32_t> (x[0])        |
           (static_cast<uint32_t> (x[1]) << 8)  |
           (static_cast<uint32_t> (x[2]) << 16) |
           (static_cast<uint32_t> (x[3]) << 24);
}
60 
61 /*************************************************************
62  * I/O macros
63  *
64  * we use macros so that we have a line number to report in
65  * abort (). This makes debugging a lot easier.
66  **************************************************************/
67 
68 
// Write `n` items starting at `ptr` to the FILE* named `f` in the
// enclosing scope. The item size is taken from the pointee type of `ptr`.
// Reports "write error" through FAISS_THROW_IF_NOT_MSG when fwrite
// returns fewer than `n` items.
//
// Wrapped in do { } while (0) so that the expansion together with the
// caller's trailing ';' is exactly one statement (safe in an unbraced
// if/else).
#define WRITEANDCHECK(ptr, n) do {                                      \
        size_t ret = fwrite (ptr, sizeof (* (ptr)), n, f);              \
        FAISS_THROW_IF_NOT_MSG (ret == (n), "write error");             \
    } while (0)
73 
// Read `n` items into the buffer at `ptr` from the FILE* named `f` in
// the enclosing scope. The item size is taken from the pointee type of
// `ptr`. Reports "read error" through FAISS_THROW_IF_NOT_MSG when fread
// returns fewer than `n` items (short file or I/O error).
//
// Wrapped in do { } while (0) so that the expansion together with the
// caller's trailing ';' is exactly one statement (safe in an unbraced
// if/else).
#define READANDCHECK(ptr, n) do {                                       \
        size_t ret = fread (ptr, sizeof (* (ptr)), n, f);               \
        FAISS_THROW_IF_NOT_MSG (ret == (n), "read error");              \
    } while (0)
78 
// Single-object shorthands: write / read exactly one item, taking the
// address of the lvalue `x` (so `x` must be addressable).
79 #define WRITE1(x) WRITEANDCHECK(&(x), 1)
80 #define READ1(x) READANDCHECK(&(x), 1)
81 
// Serialize a contiguous container: first its element count as a
// size_t, then the raw elements obtained from (vec).data().
//
// Wrapped in do { } while (0) so that the expansion together with the
// caller's trailing ';' is exactly one statement (e.g. as the body of an
// unbraced for loop or if/else).
#define WRITEVECTOR(vec) do {                                           \
        size_t size = (vec).size ();                                    \
        WRITEANDCHECK (&size, 1);                                       \
        WRITEANDCHECK ((vec).data (), size);                            \
    } while (0)
87 
// Deserialize a container written by WRITEVECTOR: read the element
// count, sanity-check it, resize `vec` and read the raw elements into
// (vec).data().
//
// The count is read as size_t, the same type WRITEVECTOR writes, rather
// than as long. Counts of 2^40 or more are rejected before resizing so
// that a corrupt header cannot trigger a huge allocation; values that
// were negative when interpreted as long are huge as size_t and are
// rejected by the same test.
//
// Wrapped in do { } while (0) so that the expansion together with the
// caller's trailing ';' is exactly one statement.
#define READVECTOR(vec) do {                                            \
        size_t size;                                                    \
        READANDCHECK (&size, 1);                                        \
        FAISS_THROW_IF_NOT (static_cast<uint64_t> (size) <              \
                            (static_cast<uint64_t> (1) << 40));         \
        (vec).resize (size);                                            \
        READANDCHECK ((vec).data (), size);                             \
    } while (0)
95 
/// Scope guard that fclose()s a FILE* when it goes out of scope, so the
/// stream is released on every exit path, including exceptions.
///
/// A null pointer is tolerated (nothing is closed). The guard is
/// non-copyable: a copy would close the same stream twice.
struct ScopeFileCloser {
    FILE *f;
    ScopeFileCloser (FILE *f): f (f) {}
    ~ScopeFileCloser () { if (f) fclose (f); }

    ScopeFileCloser (const ScopeFileCloser &) = delete;
    ScopeFileCloser & operator= (const ScopeFileCloser &) = delete;
};
101 
102 // Macros for read/write arrays aligned to 16 bytes in the
103 // file. Useful when mmapped.
104 
// Write a raw array of `size_in` elements so that its payload starts on
// a 16-byte file offset. Layout: the element count (size_t), one byte
// holding the number of padding bytes that follow, the zero padding
// bytes themselves, then the array.
//
// The position reported by ftell is checked before use: a failed ftell
// returns -1, which would otherwise produce a negative index into
// `padding`.
//
// Wrapped in do { } while (0) so that the expansion together with the
// caller's trailing ';' is exactly one statement.
#define WRITETABPAD16(tab, size_in) do {                                \
        size_t size = (size_in);                                        \
        WRITEANDCHECK (&size, 1);                                       \
        uint8_t padding[16] = {0};                                      \
        long pos = ftell (f);                                           \
        FAISS_THROW_IF_NOT_MSG (pos >= 0, "ftell failed");              \
        int idx = pos % 16;                                             \
        padding [idx] = 15 - idx;                                       \
        WRITEANDCHECK (padding + idx, 16 - idx);                        \
        WRITEANDCHECK ((tab), size);                                    \
    } while (0)
114 
// Read an array written by WRITETABPAD16 into a freshly allocated
// `new basetype[size]` buffer that is stored in `tab`.
//
// The stored element count must equal `expected_size`, and the stored
// padding count must be below 16; the padding bytes are read and
// discarded.
//
// NOTE: if the final read fails, the buffer has already been assigned to
// `tab`; releasing it is left to whoever owns `tab`.
//
// Wrapped in do { } while (0) so that the expansion together with the
// caller's trailing ';' is exactly one statement.
#define READTABPAD16(tab, basetype, expected_size) do {                 \
        size_t size;                                                    \
        READANDCHECK (&size, 1);                                        \
        FAISS_THROW_IF_NOT ((expected_size) == size);                   \
        uint8_t padding[16], npad;                                      \
        READ1(npad);                                                    \
        FAISS_THROW_IF_NOT (npad < 16);                                 \
        READANDCHECK (padding, npad);                                   \
        (tab) = new basetype [size];                                    \
        READANDCHECK ((tab), size);                                     \
    } while (0)
126 
// Read only the header of an array written by WRITETABPAD16 (element
// count, padding count, padding bytes), store the file offset of the
// payload in `taboffset`, and seek past the payload without reading it.
//
// The results of ftell and fseek are checked, so that a stream that
// cannot report or change its position raises an error instead of
// yielding a bogus offset.
//
// Wrapped in do { } while (0) so that the expansion together with the
// caller's trailing ';' is exactly one statement.
#define TABOFFSETPAD16(taboffset, basetype, expected_size) do {         \
        size_t size;                                                    \
        READANDCHECK (&size, 1);                                        \
        FAISS_THROW_IF_NOT ((expected_size) == size);                   \
        uint8_t padding[16], npad;                                      \
        READ1(npad);                                                    \
        FAISS_THROW_IF_NOT (npad < 16);                                 \
        READANDCHECK (padding, npad);                                   \
        taboffset = ftell(f);                                           \
        FAISS_THROW_IF_NOT_MSG ((taboffset) >= 0, "ftell failed");      \
        int seek_ret = fseek (f, sizeof(basetype) * size, SEEK_CUR);    \
        FAISS_THROW_IF_NOT_MSG (seek_ret == 0, "fseek failed");         \
    } while (0)
139 
140 
141 
142 
143 /*************************************************************
144  * Write
145  **************************************************************/
146 
147 static void write_index_header (const Index *idx, FILE *f) {
148  WRITE1 (idx->d);
149  WRITE1 (idx->ntotal);
150  Index::idx_t dummy = 1 << 20;
151  WRITE1 (dummy);
152  WRITE1 (dummy);
153  WRITE1 (idx->is_trained);
154  WRITE1 (idx->metric_type);
155 }
156 
157 void write_VectorTransform (const VectorTransform *vt, FILE *f) {
158  if (const LinearTransform * lt =
159  dynamic_cast < const LinearTransform *> (vt)) {
160  if (dynamic_cast<const RandomRotationMatrix *>(lt)) {
161  uint32_t h = fourcc ("rrot");
162  WRITE1 (h);
163  } else if (const PCAMatrix * pca =
164  dynamic_cast<const PCAMatrix *>(lt)) {
165  uint32_t h = fourcc ("PcAm");
166  WRITE1 (h);
167  WRITE1 (pca->eigen_power);
168  WRITE1 (pca->random_rotation);
169  WRITE1 (pca->balanced_bins);
170  WRITEVECTOR (pca->mean);
171  WRITEVECTOR (pca->eigenvalues);
172  WRITEVECTOR (pca->PCAMat);
173  } else {
174  // generic LinearTransform (includes OPQ)
175  uint32_t h = fourcc ("LTra");
176  WRITE1 (h);
177  }
178  WRITE1 (lt->have_bias);
179  WRITEVECTOR (lt->A);
180  WRITEVECTOR (lt->b);
181  } else if (const RemapDimensionsTransform *rdt =
182  dynamic_cast<const RemapDimensionsTransform *>(vt)) {
183  uint32_t h = fourcc ("RmDT");
184  WRITE1 (h);
185  WRITEVECTOR (rdt->map);
186  } else if (const NormalizationTransform *nt =
187  dynamic_cast<const NormalizationTransform *>(vt)) {
188  uint32_t h = fourcc ("VNrm");
189  WRITE1 (h);
190  WRITE1 (nt->norm);
191  } else {
192  FAISS_THROW_MSG ("cannot serialize this");
193  }
194  // common fields
195  WRITE1 (vt->d_in);
196  WRITE1 (vt->d_out);
197  WRITE1 (vt->is_trained);
198 }
199 
200 static void write_ProductQuantizer (const ProductQuantizer *pq, FILE *f) {
201  WRITE1 (pq->d);
202  WRITE1 (pq->M);
203  WRITE1 (pq->nbits);
204  WRITEVECTOR (pq->centroids);
205 }
206 
207 static void write_ScalarQuantizer (const ScalarQuantizer *ivsc, FILE *f) {
208  WRITE1 (ivsc->qtype);
209  WRITE1 (ivsc->rangestat);
210  WRITE1 (ivsc->rangestat_arg);
211  WRITE1 (ivsc->d);
212  WRITE1 (ivsc->code_size);
213  WRITEVECTOR (ivsc->trained);
214 }
215 
216 void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
217  FILE *f = fopen (fname, "w");
218  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
219  ScopeFileCloser closer(f);
220  write_ProductQuantizer (pq, f);
221 }
222 
223 static void write_HNSW (const HNSW *hnsw, FILE *f) {
224 
225  WRITEVECTOR (hnsw->assign_probas);
226  WRITEVECTOR (hnsw->cum_nneighbor_per_level);
227  WRITEVECTOR (hnsw->levels);
228  WRITEVECTOR (hnsw->offsets);
229  WRITEVECTOR (hnsw->neighbors);
230 
231  WRITE1 (hnsw->entry_point);
232  WRITE1 (hnsw->max_level);
233  WRITE1 (hnsw->efConstruction);
234  WRITE1 (hnsw->efSearch);
235  WRITE1 (hnsw->upper_beam);
236 
237 }
238 
239 static void write_ivf_header (const IndexIVF * ivf, FILE *f,
240  bool include_ids = true) {
241  write_index_header (ivf, f);
242  WRITE1 (ivf->nlist);
243  WRITE1 (ivf->nprobe);
244  write_index (ivf->quantizer, f);
245  if (include_ids) {
246  for (size_t i = 0; i < ivf->nlist; i++)
247  WRITEVECTOR (ivf->ids[i]);
248  }
249  WRITE1 (ivf->maintain_direct_map);
250  WRITEVECTOR (ivf->direct_map);
251 }
252 
253 void write_index (const Index *idx, FILE *f) {
254  if (const IndexFlat * idxf = dynamic_cast<const IndexFlat *> (idx)) {
255  uint32_t h = fourcc (
256  idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI" :
257  idxf->metric_type == METRIC_L2 ? "IxF2" : nullptr);
258  WRITE1 (h);
259  write_index_header (idx, f);
260  WRITEVECTOR (idxf->xb);
261  } else if(const IndexLSH * idxl = dynamic_cast<const IndexLSH *> (idx)) {
262  uint32_t h = fourcc ("IxHe");
263  WRITE1 (h);
264  write_index_header (idx, f);
265  WRITE1 (idxl->nbits);
266  WRITE1 (idxl->rotate_data);
267  WRITE1 (idxl->train_thresholds);
268  WRITEVECTOR (idxl->thresholds);
269  WRITE1 (idxl->bytes_per_vec);
270  write_VectorTransform (&idxl->rrot, f);
271  WRITEVECTOR (idxl->codes);
272  } else if(const IndexPQ * idxp = dynamic_cast<const IndexPQ *> (idx)) {
273  uint32_t h = fourcc ("IxPq");
274  WRITE1 (h);
275  write_index_header (idx, f);
276  write_ProductQuantizer (&idxp->pq, f);
277  WRITEVECTOR (idxp->codes);
278  // search params -- maybe not useful to store?
279  WRITE1 (idxp->search_type);
280  WRITE1 (idxp->encode_signs);
281  WRITE1 (idxp->polysemous_ht);
282  } else if(const Index2Layer * idxp =
283  dynamic_cast<const Index2Layer *> (idx)) {
284  uint32_t h = fourcc ("Ix2L");
285  WRITE1 (h);
286  write_index_header (idx, f);
287  write_index (idxp->q1.quantizer, f);
288  WRITE1 (idxp->q1.nlist);
289  WRITE1 (idxp->q1.quantizer_trains_alone);
290  write_ProductQuantizer (&idxp->pq, f);
291  WRITE1 (idxp->code_size_1);
292  WRITE1 (idxp->code_size_2);
293  WRITE1 (idxp->code_size);
294  WRITEVECTOR (idxp->codes);
295  } else if(const IndexScalarQuantizer * idxs =
296  dynamic_cast<const IndexScalarQuantizer *> (idx)) {
297  uint32_t h = fourcc ("IxSQ");
298  WRITE1 (h);
299  write_index_header (idx, f);
300  write_ScalarQuantizer (&idxs->sq, f);
301  WRITEVECTOR (idxs->codes);
302  } else if(const IndexIVFFlat * ivfl =
303  dynamic_cast<const IndexIVFFlat *> (idx)) {
304  uint32_t h = fourcc ("IvFL");
305  WRITE1 (h);
306  write_ivf_header (ivfl, f);
307  for(int i = 0; i < ivfl->nlist; i++)
308  WRITEVECTOR (ivfl->codes[i]);
309  } else if(const IndexIVFScalarQuantizer * ivsc =
310  dynamic_cast<const IndexIVFScalarQuantizer *> (idx)) {
311  uint32_t h = fourcc ("IvSQ");
312  WRITE1 (h);
313  write_ivf_header (ivsc, f);
314  write_ScalarQuantizer (&ivsc->sq, f);
315  WRITE1 (ivsc->code_size);
316  for(int i = 0; i < ivsc->nlist; i++)
317  WRITEVECTOR (ivsc->codes[i]);
318  } else if(const IndexIVFPQ * ivpq =
319  dynamic_cast<const IndexIVFPQ *> (idx)) {
320  const IndexIVFPQR * ivfpqr = dynamic_cast<const IndexIVFPQR *> (idx);
321  const IndexIVFPQCompact * ivfpqc =
322  dynamic_cast<const IndexIVFPQCompact *> (idx);
323  uint32_t h = fourcc (ivfpqr ? "IvQR" : ivfpqc ? "IvPC" : "IvPQ");
324  WRITE1 (h);
325  write_ivf_header (ivpq, f, !ivfpqc);
326  WRITE1 (ivpq->by_residual);
327  WRITE1 (ivpq->code_size);
328  write_ProductQuantizer (&ivpq->pq, f);
329  if (!ivfpqc) {
330  for(int i = 0; i < ivpq->codes.size(); i++)
331  WRITEVECTOR (ivpq->codes[i]);
332  }
333  if (ivfpqr) {
334  write_ProductQuantizer (&ivfpqr->refine_pq, f);
335  WRITEVECTOR (ivfpqr->refine_codes);
336  WRITE1 (ivfpqr->k_factor);
337  }
338  if (ivfpqc) {
339  WRITETABPAD16 (ivfpqc->limits, ivfpqc->nlist + 1);
340  WRITETABPAD16 (ivfpqc->compact_ids, ivfpqc->ntotal);
341  WRITETABPAD16 (ivfpqc->compact_codes,
342  ivfpqc->ntotal * ivfpqc->code_size);
343  }
344  } else if(const IndexPreTransform * ixpt =
345  dynamic_cast<const IndexPreTransform *> (idx)) {
346  uint32_t h = fourcc ("IxPT");
347  WRITE1 (h);
348  write_index_header (ixpt, f);
349  int nt = ixpt->chain.size();
350  WRITE1 (nt);
351  for (int i = 0; i < nt; i++)
352  write_VectorTransform (ixpt->chain[i], f);
353  write_index (ixpt->index, f);
354  } else if(const MultiIndexQuantizer * imiq =
355  dynamic_cast<const MultiIndexQuantizer *> (idx)) {
356  uint32_t h = fourcc ("Imiq");
357  WRITE1 (h);
358  write_index_header (imiq, f);
359  write_ProductQuantizer (&imiq->pq, f);
360  } else if(const IndexRefineFlat * idxrf =
361  dynamic_cast<const IndexRefineFlat *> (idx)) {
362  uint32_t h = fourcc ("IxRF");
363  WRITE1 (h);
364  write_index_header (idxrf, f);
365  write_index (idxrf->base_index, f);
366  write_index (&idxrf->refine_index, f);
367  WRITE1 (idxrf->k_factor);
368  } else if(const IndexIDMap * idxmap =
369  dynamic_cast<const IndexIDMap *> (idx)) {
370  uint32_t h =
371  dynamic_cast<const IndexIDMap2 *> (idx) ? fourcc ("IxM2") :
372  fourcc ("IxMp");
373  // no need to store additional info for IndexIDMap2
374  WRITE1 (h);
375  write_index_header (idxmap, f);
376  write_index (idxmap->index, f);
377  WRITEVECTOR (idxmap->id_map);
378  } else if(const IndexHNSW * idxhnsw =
379  dynamic_cast<const IndexHNSW *> (idx)) {
380  uint32_t h =
381  dynamic_cast<const IndexHNSWFlat*>(idx) ? fourcc("IHNf") :
382  dynamic_cast<const IndexHNSWPQ*>(idx) ? fourcc("IHNp") :
383  dynamic_cast<const IndexHNSWSQ*>(idx) ? fourcc("IHNs") :
384  dynamic_cast<const IndexHNSW2Level*>(idx) ? fourcc("IHN2") :
385  0;
386  FAISS_THROW_IF_NOT (h != 0);
387  WRITE1 (h);
388  write_index_header (idxhnsw, f);
389  write_HNSW (&idxhnsw->hnsw, f);
390  write_index (idxhnsw->storage, f);
391  } else {
392  FAISS_THROW_MSG ("don't know how to serialize this type of index");
393  }
394 }
395 
396 void write_index (const Index *idx, const char *fname) {
397  FILE *f = fopen (fname, "w");
398  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
399  ScopeFileCloser closer(f);
400  write_index (idx, f);
401 }
402 
403 void write_VectorTransform (const VectorTransform *vt, const char *fname) {
404  FILE *f = fopen (fname, "w");
405  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
406  ScopeFileCloser closer(f);
407  write_VectorTransform (vt, f);
408 }
409 
410 /*************************************************************
411  * Read
412  **************************************************************/
413 
414 static void read_index_header (Index *idx, FILE *f) {
415  READ1 (idx->d);
416  READ1 (idx->ntotal);
417  Index::idx_t dummy;
418  READ1 (dummy);
419  READ1 (dummy);
420  READ1 (idx->is_trained);
421  READ1 (idx->metric_type);
422  idx->verbose = false;
423 }
424 
425 VectorTransform* read_VectorTransform (FILE *f) {
426  uint32_t h;
427  READ1 (h);
428  VectorTransform *vt = nullptr;
429 
430  if (h == fourcc ("rrot") || h == fourcc ("PCAm") ||
431  h == fourcc ("LTra") || h == fourcc ("PcAm")) {
432  LinearTransform *lt = nullptr;
433  if (h == fourcc ("rrot")) {
434  lt = new RandomRotationMatrix ();
435  } else if (h == fourcc ("PCAm") ||
436  h == fourcc ("PcAm")) {
437  PCAMatrix * pca = new PCAMatrix ();
438  READ1 (pca->eigen_power);
439  READ1 (pca->random_rotation);
440  if (h == fourcc ("PcAm"))
441  READ1 (pca->balanced_bins);
442  READVECTOR (pca->mean);
443  READVECTOR (pca->eigenvalues);
444  READVECTOR (pca->PCAMat);
445  lt = pca;
446  } else if (h == fourcc ("LTra")) {
447  lt = new LinearTransform ();
448  }
449  READ1 (lt->have_bias);
450  READVECTOR (lt->A);
451  READVECTOR (lt->b);
452  FAISS_THROW_IF_NOT (lt->A.size() >= lt->d_in * lt->d_out);
453  FAISS_THROW_IF_NOT (!lt->have_bias || lt->b.size() >= lt->d_out);
454  lt->set_is_orthonormal();
455  vt = lt;
456  } else if (h == fourcc ("RmDT")) {
457  RemapDimensionsTransform *rdt = new RemapDimensionsTransform ();
458  READVECTOR (rdt->map);
459  vt = rdt;
460  } else if (h == fourcc ("VNrm")) {
461  NormalizationTransform *nt = new NormalizationTransform ();
462  READ1 (nt->norm);
463  vt = nt;
464  } else {
465  FAISS_THROW_MSG("fourcc not recognized");
466  }
467  READ1 (vt->d_in);
468  READ1 (vt->d_out);
469  READ1 (vt->is_trained);
470  return vt;
471 }
472 
473 static void read_ProductQuantizer (ProductQuantizer *pq, FILE *f) {
474  READ1 (pq->d);
475  READ1 (pq->M);
476  READ1 (pq->nbits);
477  pq->set_derived_values ();
478  READVECTOR (pq->centroids);
479 }
480 
481 static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
482  READ1 (ivsc->qtype);
483  READ1 (ivsc->rangestat);
484  READ1 (ivsc->rangestat_arg);
485  READ1 (ivsc->d);
486  READ1 (ivsc->code_size);
487  READVECTOR (ivsc->trained);
488 }
489 
490 static void read_HNSW (HNSW *hnsw, FILE *f) {
491  READVECTOR (hnsw->assign_probas);
492  READVECTOR (hnsw->cum_nneighbor_per_level);
493  READVECTOR (hnsw->levels);
494  READVECTOR (hnsw->offsets);
495  READVECTOR (hnsw->neighbors);
496 
497  READ1 (hnsw->entry_point);
498  READ1 (hnsw->max_level);
499  READ1 (hnsw->efConstruction);
500  READ1 (hnsw->efSearch);
501  READ1 (hnsw->upper_beam);
502 }
503 
504 ProductQuantizer * read_ProductQuantizer (const char*fname) {
505  FILE *f = fopen (fname, "r");
506  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
507  ScopeFileCloser closer(f);
508  ProductQuantizer *pq = new ProductQuantizer();
509  ScopeDeleter1<ProductQuantizer> del (pq);
510  read_ProductQuantizer(pq, f);
511  del.release ();
512  return pq;
513 }
514 
515 static void read_ivf_header (IndexIVF * ivf, FILE *f,
516  bool include_ids = true) {
517  read_index_header (ivf, f);
518  READ1 (ivf->nlist);
519  READ1 (ivf->nprobe);
520  ivf->quantizer = read_index (f);
521  ivf->own_fields = true;
522  if (include_ids) {
523  ivf->ids.resize (ivf->nlist);
524  for (size_t i = 0; i < ivf->nlist; i++)
525  READVECTOR (ivf->ids[i]);
526  }
527  READ1 (ivf->maintain_direct_map);
528  READVECTOR (ivf->direct_map);
529 }
530 
531 static IndexIVFPQ *read_ivfpq (FILE *f, uint32_t h, bool try_mmap)
532 {
533 
534  IndexIVFPQR *ivfpqr =
535  h == fourcc ("IvQR") ? new IndexIVFPQR () : nullptr;
536  IndexIVFPQCompact *ivfpqc =
537  h == fourcc ("IvPC") ? new IndexIVFPQCompact () : nullptr;
538  IndexIVFPQ * ivpq = ivfpqr ? ivfpqr : ivfpqc ? ivfpqc : new IndexIVFPQ ();
539  read_ivf_header (ivpq, f, !ivfpqc);
540  READ1 (ivpq->by_residual);
541  READ1 (ivpq->code_size);
542  read_ProductQuantizer (&ivpq->pq, f);
543  if (!ivfpqc) {
544  ivpq->codes.resize (ivpq->nlist);
545  for (size_t i = 0; i < ivpq->nlist; i++)
546  READVECTOR (ivpq->codes[i]);
547  }
548  // precomputed table not stored. It is cheaper to recompute it
549  ivpq->use_precomputed_table = 0;
550  if (ivpq->by_residual)
551  ivpq->precompute_table ();
552  if (ivfpqr) {
553  read_ProductQuantizer (&ivfpqr->refine_pq, f);
554  READVECTOR (ivfpqr->refine_codes);
555  READ1 (ivfpqr->k_factor);
556  }
557  if (ivfpqc) {
558  if (!try_mmap) {
559  READTABPAD16 (ivfpqc->limits, uint32_t, ivfpqc->nlist + 1);
560  READTABPAD16 (ivfpqc->compact_ids, uint32_t, ivfpqc->ntotal);
561  READTABPAD16 (ivfpqc->compact_codes, uint8_t,
562  ivfpqc->ntotal * ivfpqc->code_size);
563  } else {
564  long offset_limits, offset_compact_ids, offset_compact_codes;
565  TABOFFSETPAD16 (offset_limits, uint32_t, ivfpqc->nlist + 1);
566  TABOFFSETPAD16 (offset_compact_ids, uint32_t, ivfpqc->ntotal);
567  TABOFFSETPAD16 (offset_compact_codes, uint8_t,
568  ivfpqc->ntotal * ivfpqc->code_size);
569  ivfpqc->mmap_length = ftell (f);
570  // mmap the whole file
571  ivfpqc->mmap_buffer = (char*)mmap (
572  nullptr, ivfpqc->mmap_length,
573  PROT_READ, MAP_SHARED, fileno (f), 0);
574  if (!ivfpqc->mmap_buffer) {
575  perror ("mmap failed");
576  abort ();
577  }
578  // at this point the file can be closed, it does not
579  // invalidate the mapping
580  ivfpqc->limits = (uint32_t*)(ivfpqc->mmap_buffer + offset_limits);
581  ivfpqc->compact_ids = (uint32_t*)(ivfpqc->mmap_buffer +
582  offset_compact_ids);
583  ivfpqc->compact_codes = (uint8_t*)(ivfpqc->mmap_buffer +
584  offset_compact_codes);
585  }
586  }
587  return ivpq;
588 }
589 
// Global format-compatibility switch consulted by read_index(). When it
// is set to 2, the "IxPT" branch does not read a transform count from
// the file and assumes exactly one transform. No other value has a
// visible effect in this file.
590 int read_old_fmt_hack = 0;
591 
592 Index *read_index (FILE * f, bool try_mmap) {
593  Index * idx = nullptr;
594  uint32_t h;
595  READ1 (h);
596  if (h == fourcc ("IxFI") || h == fourcc ("IxF2")) {
597  IndexFlat *idxf;
598  if (h == fourcc ("IxFI")) idxf = new IndexFlatIP ();
599  else idxf = new IndexFlatL2 ();
600  read_index_header (idxf, f);
601  READVECTOR (idxf->xb);
602  FAISS_THROW_IF_NOT (idxf->xb.size() == idxf->ntotal * idxf->d);
603  // leak!
604  idx = idxf;
605  } else if (h == fourcc("IxHE") || h == fourcc("IxHe")) {
606  IndexLSH * idxl = new IndexLSH ();
607  read_index_header (idxl, f);
608  READ1 (idxl->nbits);
609  READ1 (idxl->rotate_data);
610  READ1 (idxl->train_thresholds);
611  READVECTOR (idxl->thresholds);
612  READ1 (idxl->bytes_per_vec);
613  if (h == fourcc("IxHE")) {
614  FAISS_THROW_IF_NOT_FMT (idxl->nbits % 64 == 0,
615  "can only read old format IndexLSH with "
616  "nbits multiple of 64 (got %d)",
617  (int) idxl->nbits);
618  // leak
619  idxl->bytes_per_vec *= 8;
620  }
621  {
622  RandomRotationMatrix *rrot = dynamic_cast<RandomRotationMatrix *>
623  (read_VectorTransform (f));
624  FAISS_THROW_IF_NOT_MSG(rrot, "expected a random rotation");
625  idxl->rrot = *rrot;
626  delete rrot;
627  }
628  READVECTOR (idxl->codes);
629  FAISS_THROW_IF_NOT (idxl->rrot.d_in == idxl->d &&
630  idxl->rrot.d_out == idxl->nbits);
631  FAISS_THROW_IF_NOT (
632  idxl->codes.size() == idxl->ntotal * idxl->bytes_per_vec);
633  idx = idxl;
634  } else if (h == fourcc ("IxPQ") || h == fourcc ("IxPo") ||
635  h == fourcc ("IxPq")) {
636  // IxPQ and IxPo were merged into the same IndexPQ object
637  IndexPQ * idxp =new IndexPQ ();
638  read_index_header (idxp, f);
639  read_ProductQuantizer (&idxp->pq, f);
640  READVECTOR (idxp->codes);
641  if (h == fourcc ("IxPo") || h == fourcc ("IxPq")) {
642  READ1 (idxp->search_type);
643  READ1 (idxp->encode_signs);
644  READ1 (idxp->polysemous_ht);
645  }
646  // Old versoins of PQ all had metric_type set to INNER_PRODUCT
647  // when they were in fact using L2. Therefore, we force metric type
648  // to L2 when the old format is detected
649  if (h == fourcc ("IxPQ") || h == fourcc ("IxPo")) {
650  idxp->metric_type = METRIC_L2;
651  }
652  idx = idxp;
653  } else if (h == fourcc ("IvFl") || h == fourcc("IvFL")) {
654  IndexIVFFlat * ivfl = new IndexIVFFlat ();
655  read_ivf_header (ivfl, f);
656  ivfl->code_size = ivfl->d * sizeof(float);
657  ivfl->codes.resize (ivfl->nlist);
658  if (h == fourcc ("IvFL")) {
659  for (size_t i = 0; i < ivfl->nlist; i++) {
660  READVECTOR (ivfl->codes[i]);
661  }
662  } else { // old format
663  for (size_t i = 0; i < ivfl->nlist; i++) {
664  std::vector<float> vec;
665  READVECTOR (vec);
666  ivfl->codes[i].resize(vec.size() * sizeof(float));
667  memcpy(ivfl->codes[i].data(), vec.data(),
668  ivfl->codes[i].size());
669  }
670  }
671  idx = ivfl;
672  } else if (h == fourcc ("IxSQ")) {
674  read_index_header (idxs, f);
675  read_ScalarQuantizer (&idxs->sq, f);
676  READVECTOR (idxs->codes);
677  idxs->code_size = idxs->sq.code_size;
678  idx = idxs;
679  } else if(h == fourcc ("IvSQ")) {
681  read_ivf_header (ivsc, f);
682  ivsc->codes.resize(ivsc->nlist);
683  read_ScalarQuantizer (&ivsc->sq, f);
684  READ1 (ivsc->code_size);
685  for(int i = 0; i < ivsc->nlist; i++)
686  READVECTOR (ivsc->codes[i]);
687  idx = ivsc;
688  } else if(h == fourcc ("IvPQ") || h == fourcc ("IvQR") ||
689  h == fourcc ("IvPC")) {
690 
691  idx = read_ivfpq (f, h, try_mmap);
692 
693  } else if(h == fourcc ("IxPT")) {
694  IndexPreTransform * ixpt = new IndexPreTransform();
695  ixpt->own_fields = true;
696  read_index_header (ixpt, f);
697  int nt;
698  if (read_old_fmt_hack == 2) {
699  nt = 1;
700  } else {
701  READ1 (nt);
702  }
703  for (int i = 0; i < nt; i++) {
704  ixpt->chain.push_back (read_VectorTransform (f));
705  }
706  ixpt->index = read_index (f);
707  idx = ixpt;
708  } else if(h == fourcc ("Imiq")) {
710  read_index_header (imiq, f);
711  read_ProductQuantizer (&imiq->pq, f);
712  idx = imiq;
713  } else if(h == fourcc ("IxRF")) {
714  IndexRefineFlat *idxrf = new IndexRefineFlat ();
715  read_index_header (idxrf, f);
716  idxrf->base_index = read_index(f);
717  idxrf->own_fields = true;
718  IndexFlat *rf = dynamic_cast<IndexFlat*> (read_index (f));
719  std::swap (*rf, idxrf->refine_index);
720  delete rf;
721  READ1 (idxrf->k_factor);
722  idx = idxrf;
723  } else if(h == fourcc ("IxMp") || h == fourcc ("IxM2")) {
724  bool is_map2 = h == fourcc ("IxM2");
725  IndexIDMap * idxmap = is_map2 ? new IndexIDMap2 () : new IndexIDMap ();
726  read_index_header (idxmap, f);
727  idxmap->index = read_index (f);
728  idxmap->own_fields = true;
729  READVECTOR (idxmap->id_map);
730  if (is_map2) {
731  static_cast<IndexIDMap2*>(idxmap)->construct_rev_map ();
732  }
733  idx = idxmap;
734  } else if (h == fourcc ("Ix2L")) {
735  Index2Layer * idxp = new Index2Layer ();
736  read_index_header (idxp, f);
737  idxp->q1.quantizer = read_index (f);
738  READ1 (idxp->q1.nlist);
739  READ1 (idxp->q1.quantizer_trains_alone);
740  read_ProductQuantizer (&idxp->pq, f);
741  READ1 (idxp->code_size_1);
742  READ1 (idxp->code_size_2);
743  READ1 (idxp->code_size);
744  READVECTOR (idxp->codes);
745  idx = idxp;
746  } else if(h == fourcc("IHNf") || h == fourcc("IHNp") ||
747  h == fourcc("IHNs") || h == fourcc("IHN2")) {
748  IndexHNSW *idxhnsw = nullptr;
749  if (h == fourcc("IHNf")) idxhnsw = new IndexHNSWFlat ();
750  if (h == fourcc("IHNp")) idxhnsw = new IndexHNSWPQ ();
751  if (h == fourcc("IHNs")) idxhnsw = new IndexHNSWSQ ();
752  if (h == fourcc("IHN2")) idxhnsw = new IndexHNSW2Level ();
753  read_index_header (idxhnsw, f);
754  read_HNSW (&idxhnsw->hnsw, f);
755  idxhnsw->storage = read_index (f);
756  idxhnsw->own_fields = true;
757  if (h == fourcc("IHNp")) {
758  dynamic_cast<IndexPQ*>(idxhnsw->storage)->pq.compute_sdc_table ();
759  }
760  idx = idxhnsw;
761  } else {
762  FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
763  idx = nullptr;
764  }
765  return idx;
766 }
767 
768 
769 
770 Index *read_index (const char *fname, bool try_mmap) {
771  FILE *f = fopen (fname, "r");
772  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for reading:", fname);
773  Index *idx = read_index (f, try_mmap);
774  fclose (f);
775  return idx;
776 }
777 
778 VectorTransform *read_VectorTransform (const char *fname) {
779  FILE *f = fopen (fname, "r");
780  if (!f) {
781  fprintf (stderr, "cannot open %s for reading:", fname);
782  perror ("");
783  abort ();
784  }
785  VectorTransform *vt = read_VectorTransform (f);
786  fclose (f);
787  return vt;
788 }
789 
790 /*************************************************************
791  * cloning functions
792  **************************************************************/
793 
794 
795 
796 Index * clone_index (const Index *index)
797 {
798  Cloner cl;
799  return cl.clone_Index (index);
800 }
801 
// If `obj` is dynamically a `classname`, return a heap-allocated copy
// made with classname's copy constructor; otherwise fall through to
// whatever follows the macro (it ends with a dangling `else`, so
// invocations chain and the last one must be followed by a statement).
// Always list the most specific classes first, since a subclass also
// matches its base.
#define TRYCLONE(classname, obj)                                        \
    if (const classname *typed_src =                                    \
            dynamic_cast<const classname *> (obj)) {                    \
        return new classname (*typed_src);                              \
    } else
808 
809 VectorTransform *Cloner::clone_VectorTransform (const VectorTransform *vt)
810 {
811  TRYCLONE (RemapDimensionsTransform, vt)
812  TRYCLONE (OPQMatrix, vt)
813  TRYCLONE (PCAMatrix, vt)
814  TRYCLONE (RandomRotationMatrix, vt)
815  TRYCLONE (LinearTransform, vt)
816  {
817  FAISS_THROW_MSG("clone not supported for this type of VectorTransform");
818  }
819  return nullptr;
820 }
821 
822 IndexIVF * Cloner::clone_IndexIVF (const IndexIVF *ivf)
823 {
824  TRYCLONE (IndexIVFPQR, ivf)
825  TRYCLONE (IndexIVFPQ, ivf)
826  TRYCLONE (IndexIVFFlat, ivf)
827  TRYCLONE (IndexIVFScalarQuantizer, ivf)
828  {
829  FAISS_THROW_MSG("clone not supported for this type of IndexIVF");
830  }
831  return nullptr;
832 }
833 
834 Index *Cloner::clone_Index (const Index *index)
835 {
836  TRYCLONE (IndexPQ, index)
837  TRYCLONE (IndexLSH, index)
838  TRYCLONE (IndexFlatL2, index)
839  TRYCLONE (IndexFlatIP, index)
840  TRYCLONE (IndexFlat, index)
841  TRYCLONE (IndexScalarQuantizer, index)
842  TRYCLONE (MultiIndexQuantizer, index)
843  if (const IndexIVF * ivf = dynamic_cast<const IndexIVF*>(index)) {
844  IndexIVF *res = clone_IndexIVF (ivf);
845  res->own_fields = true;
846  res->quantizer = clone_Index (ivf->quantizer);
847  return res;
848  } else if (const IndexPreTransform * ipt =
849  dynamic_cast<const IndexPreTransform*> (index)) {
850  IndexPreTransform *res = new IndexPreTransform ();
851  res->d = ipt->d;
852  res->index = clone_Index (ipt->index);
853  for (int i = 0; i < ipt->chain.size(); i++)
854  res->chain.push_back (clone_VectorTransform (ipt->chain[i]));
855  res->own_fields = true;
856  return res;
857  } else if (const IndexIDMap *idmap =
858  dynamic_cast<const IndexIDMap*> (index)) {
859  IndexIDMap *res = new IndexIDMap (*idmap);
860  res->own_fields = true;
861  res->index = clone_Index (idmap->index);
862  return res;
863  } else {
864  FAISS_THROW_MSG( "clone not supported for this type of Index");
865  }
866  return nullptr;
867 }
868 
869 
870 } // namespace faiss
std::vector< uint8_t > codes
Codes. Size ntotal * pq.code_size.
Definition: IndexPQ.h:34
size_t code_size
bytes per vector
size_t code_size_2
size of the code for the second level
Definition: IndexIVFPQ.h:253
Index * index
! chain of tranforms
Randomly rotate a set of vectors.
size_t code_size
code_size_1 + code_size_2
Definition: IndexIVFPQ.h:256
Index * read_index(FILE *f, bool try_mmap)
Definition: index_io.cpp:592
int bytes_per_vec
nb of 8-bits per encoded vector
Definition: IndexLSH.h:28
std::vector< float > thresholds
thresholds to compare with
Definition: IndexLSH.h:34
bool train_thresholds
whether we train thresholds or use 0
Definition: IndexLSH.h:30
Level1Quantizer q1
first level quantizer
Definition: IndexIVFPQ.h:241
std::vector< uint8_t > codes
Codes. Size ntotal * code_size.
Definition: IndexIVFPQ.h:247
Index * base_index
faster index to pre-select the vectors that should be filtered
Definition: IndexFlat.h:109
IndexFlat refine_index
storage for full vectors
Definition: IndexFlat.h:106
bool own_fields
should the base index be deallocated?
Definition: IndexFlat.h:110
int d
vector dimension
Definition: Index.h:64
std::vector< long > id_map
! whether pointers are deleted in destructo
Definition: MetaIndexes.h:29
std::vector< uint8_t > codes
Codes. Size ntotal * pq.code_size.
RandomRotationMatrix rrot
optional random rotation
Definition: IndexLSH.h:32
ScalarQuantizer sq
Used to encode the vectors.
long idx_t
all indices are this type
Definition: Index.h:62
ProductQuantizer pq
The product quantizer used to encode the vectors.
Definition: IndexPQ.h:31
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:65
bool own_fields
! the sub-index
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:72
int d_out
! input dimension
size_t code_size_1
size of the code for the first level (ceil(log8(q1.nlist)))
Definition: IndexIVFPQ.h:250
int nbits
nb of bits per vector
Definition: IndexLSH.h:27
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:33
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:69
ProductQuantizer pq
second level quantizer is always a PQ
Definition: IndexIVFPQ.h:244
std::vector< float > xb
database vectors, size ntotal * d
Definition: IndexFlat.h:25
int polysemous_ht
Hamming threshold used for polysemy.
Definition: IndexPQ.h:91
size_t nlist
number of possible key values
Definition: IndexIVF.h:34
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:83
bool rotate_data
whether to apply a random rotation to input
Definition: IndexLSH.h:29
std::vector< uint8_t > codes
encoded dataset
Definition: IndexLSH.h:37
bool own_fields
! the sub-index
Definition: MetaIndexes.h:28