Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/index_io.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #include "index_io.h"
12 
13 #include <cstdio>
14 #include <cstdlib>
15 
16 #include <sys/mman.h>
17 
18 #include "FaissAssert.h"
19 
20 #include "IndexFlat.h"
21 #include "VectorTransform.h"
22 #include "IndexLSH.h"
23 #include "IndexPQ.h"
24 #include "IndexIVF.h"
25 #include "IndexIVFPQ.h"
26 #include "MetaIndexes.h"
27 #include "IndexScalarQuantizer.h"
28 
29 /*************************************************************
30  * The I/O format is the content of the class. For objects that are
31  * inherited, like Index, a 4-character-code (fourcc) indicates which
32  * child class this is an instance of.
33  *
34  * In this case, the fields of the parent class are written first,
35  * then the ones for the child classes. Note that this requires
36  * classes to be serialized to have a constructor without parameters,
37  * so that the fields can be filled in later. The default constructor
38  * should set reasonable defaults for all fields.
39  *
40  * The fourccs are assigned arbitrarily. When the class changed (added
41  * or deprecated fields), the fourcc can be replaced. New code should
42  * be able to read the old fourcc and fill in new classes.
43  *
44  * TODO: serialization to strings for use in Python pickle or Torch
45  * serialization.
46  *
47  * TODO: in this file, the read functions that encounter errors may
48  * leak memory.
49  **************************************************************/
50 
51 
52 
53 namespace faiss {
54 
/// Pack a 4-character code into a 32-bit tag. The first character lands
/// in the least-significant byte, the fourth in the most-significant one.
static uint32_t fourcc (const char sx[4]) {
    uint32_t code = 0;
    // walk from the last character down so that sx[0] ends up lowest
    for (int i = 3; i >= 0; i--) {
        code = (code << 8) | static_cast<unsigned char> (sx[i]);
    }
    return code;
}
59 
60 /*************************************************************
61  * I/O macros
62  *
63  * we use macros so that we have a line number to report in
64  * abort (). This makes debugging a lot easier.
65  **************************************************************/
66 
67 
// All macros below expect a `FILE *f` in the enclosing scope and are
// wrapped in do { } while (0) so that they behave as single statements.

// Write n items starting at ptr; throws if fewer than n were written.
#define WRITEANDCHECK(ptr, n) do {                                      \
        size_t ret = fwrite (ptr, sizeof (* (ptr)), n, f);              \
        FAISS_THROW_IF_NOT_MSG (ret == (n), "write error");             \
    } while (0)

// Read n items into ptr; throws if fewer than n were read.
#define READANDCHECK(ptr, n) do {                                       \
        size_t ret = fread (ptr, sizeof (* (ptr)), n, f);               \
        FAISS_THROW_IF_NOT_MSG (ret == (n), "read error");              \
    } while (0)

// Write / read a single object.
#define WRITE1(x) WRITEANDCHECK(&(x), 1)
#define READ1(x)  READANDCHECK(&(x), 1)

// Write a vector as a size_t element count followed by the raw elements.
#define WRITEVECTOR(vec) do {                                           \
        size_t size = (vec).size ();                                    \
        WRITEANDCHECK (&size, 1);                                       \
        WRITEANDCHECK ((vec).data (), size);                            \
    } while (0)

// Read a vector written by WRITEVECTOR. The count is read with the same
// type it was written with (size_t) and rejected when implausibly large,
// which also catches corrupted / negative values.
#define READVECTOR(vec) do {                                            \
        size_t size;                                                    \
        READANDCHECK (&size, 1);                                        \
        FAISS_THROW_IF_NOT (uint64_t (size) < (uint64_t (1) << 40));    \
        (vec).resize (size);                                            \
        READANDCHECK ((vec).data (), size);                             \
    } while (0)
94 
/// Closes the wrapped FILE* when the enclosing scope exits, including
/// when an exception propagates out of it.
struct ScopeFileCloser {
    FILE *f;
    ScopeFileCloser (FILE *f): f (f) {}
    // the handle has a single owner: a copy would fclose() it twice
    ScopeFileCloser (const ScopeFileCloser &) = delete;
    ScopeFileCloser & operator= (const ScopeFileCloser &) = delete;
    ~ScopeFileCloser () { if (f) fclose (f); }
};
100 
// Macros for read/write arrays aligned to 16 bytes in the
// file. Useful when mmapped.

// Layout: element count (size_t), one byte holding the number of padding
// bytes that follow, the zero padding itself, then the array, which
// therefore starts at a file offset that is a multiple of 16.
#define WRITETABPAD16(tab, size_in) do {                                \
        size_t size = (size_in);                                        \
        WRITEANDCHECK (&size, 1);                                       \
        uint8_t padding[16] = {0};                                      \
        long pos = ftell (f);                                           \
        /* a failed ftell returns -1, which would index before padding */ \
        FAISS_THROW_IF_NOT_MSG (pos >= 0, "ftell error");               \
        int idx = pos % 16;                                             \
        padding [idx] = 15 - idx;                                       \
        WRITEANDCHECK (padding + idx, 16 - idx);                        \
        WRITEANDCHECK ((tab), size);                                    \
    } while (0)
113 
// Read an array written by WRITETABPAD16 into a freshly new[]-allocated
// buffer assigned to tab. The stored element count must equal
// expected_size and the padding count must be below 16.
#define READTABPAD16(tab, basetype, expected_size) {    \
    size_t n_items;                                     \
    READANDCHECK (&n_items, 1);                         \
    FAISS_THROW_IF_NOT ((expected_size) == n_items);    \
    uint8_t n_pad;                                      \
    uint8_t skipped[16];                                \
    READ1(n_pad);                                       \
    FAISS_THROW_IF_NOT (n_pad < 16);                    \
    READANDCHECK (skipped, n_pad);                      \
    (tab) = new basetype [n_items];                     \
    READANDCHECK ((tab), n_items);                      \
}
125 
// Read only the array header, store the file offset of the array data in
// taboffset and skip over the data. Throws when the stored element count
// differs from expected_size or when the stream cannot be positioned.
#define TABOFFSETPAD16(taboffset, basetype, expected_size) do {         \
        size_t size;                                                    \
        READANDCHECK (&size, 1);                                        \
        FAISS_THROW_IF_NOT ((expected_size) == size);                   \
        uint8_t padding[16], npad;                                      \
        READ1(npad);                                                    \
        FAISS_THROW_IF_NOT (npad < 16);                                 \
        READANDCHECK (padding, npad);                                   \
        long pos = ftell (f);                                           \
        FAISS_THROW_IF_NOT_MSG (pos >= 0, "ftell error");               \
        taboffset = pos;                                                \
        int seek_ret = fseek (f, sizeof(basetype) * size, SEEK_CUR);    \
        FAISS_THROW_IF_NOT_MSG (seek_ret == 0, "fseek error");          \
    } while (0)
138 
139 
140 
141 
142 /*************************************************************
143  * Write
144  **************************************************************/
145 
146 static void write_index_header (const Index *idx, FILE *f) {
147  WRITE1 (idx->d);
148  WRITE1 (idx->ntotal);
149  Index::idx_t dummy = 1 << 20;
150  WRITE1 (dummy);
151  WRITE1 (dummy);
152  WRITE1 (idx->is_trained);
153  WRITE1 (idx->metric_type);
154 }
155 
156 
157 
158 void write_VectorTransform (const VectorTransform *vt, FILE *f) {
159  if (const LinearTransform * lt =
160  dynamic_cast < const LinearTransform *> (vt)) {
161  if (dynamic_cast<const RandomRotationMatrix *>(lt)) {
162  uint32_t h = fourcc ("rrot");
163  WRITE1 (h);
164  } else if (const PCAMatrix * pca =
165  dynamic_cast<const PCAMatrix *>(lt)) {
166  uint32_t h = fourcc ("PcAm");
167  WRITE1 (h);
168  WRITE1 (pca->eigen_power);
169  WRITE1 (pca->random_rotation);
170  WRITE1 (pca->balanced_bins);
171  WRITEVECTOR (pca->mean);
172  WRITEVECTOR (pca->eigenvalues);
173  WRITEVECTOR (pca->PCAMat);
174  } else {
175  // generic LinearTransform (includes OPQ)
176  uint32_t h = fourcc ("LTra");
177  WRITE1 (h);
178  }
179  WRITE1 (lt->have_bias);
180  WRITEVECTOR (lt->A);
181  WRITEVECTOR (lt->b);
182  } else if (const RemapDimensionsTransform *rdt =
183  dynamic_cast<const RemapDimensionsTransform *>(vt)) {
184  uint32_t h = fourcc ("RmDT");
185  WRITE1 (h);
186  WRITEVECTOR (rdt->map);
187  } else if (const NormalizationTransform *nt =
188  dynamic_cast<const NormalizationTransform *>(vt)) {
189  uint32_t h = fourcc ("VNrm");
190  WRITE1 (h);
191  WRITE1 (nt->norm);
192  } else {
193  FAISS_THROW_MSG ("cannot serialize this");
194  }
195  // common fields
196  WRITE1 (vt->d_in);
197  WRITE1 (vt->d_out);
198  WRITE1 (vt->is_trained);
199 }
200 
201 static void write_ProductQuantizer (const ProductQuantizer *pq, FILE *f) {
202  WRITE1 (pq->d);
203  WRITE1 (pq->M);
204  WRITE1 (pq->nbits);
205  WRITEVECTOR (pq->centroids);
206 }
207 
208 static void write_ScalarQuantizer (const ScalarQuantizer *ivsc, FILE *f) {
209  WRITE1 (ivsc->qtype);
210  WRITE1 (ivsc->rangestat);
211  WRITE1 (ivsc->rangestat_arg);
212  WRITE1 (ivsc->d);
213  WRITE1 (ivsc->code_size);
214  WRITEVECTOR (ivsc->trained);
215 }
216 
217 void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
218  FILE *f = fopen (fname, "w");
219  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
220  ScopeFileCloser closer(f);
221  write_ProductQuantizer (pq, f);
222 }
223 
224 
225 static void write_ivf_header (const IndexIVF * ivf, FILE *f,
226  bool include_ids = true) {
227  write_index_header (ivf, f);
228  WRITE1 (ivf->nlist);
229  WRITE1 (ivf->nprobe);
230  write_index (ivf->quantizer, f);
231  if (include_ids) {
232  for (size_t i = 0; i < ivf->nlist; i++)
233  WRITEVECTOR (ivf->ids[i]);
234  }
235  WRITE1 (ivf->maintain_direct_map);
236  WRITEVECTOR (ivf->direct_map);
237 }
238 
239 void write_index (const Index *idx, FILE *f) {
240  if (const IndexFlat * idxf = dynamic_cast<const IndexFlat *> (idx)) {
241  uint32_t h = fourcc (
242  idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI" :
243  idxf->metric_type == METRIC_L2 ? "IxF2" : nullptr);
244  WRITE1 (h);
245  write_index_header (idx, f);
246  WRITEVECTOR (idxf->xb);
247  } else if(const IndexLSH * idxl = dynamic_cast<const IndexLSH *> (idx)) {
248  uint32_t h = fourcc ("IxHe");
249  WRITE1 (h);
250  write_index_header (idx, f);
251  WRITE1 (idxl->nbits);
252  WRITE1 (idxl->rotate_data);
253  WRITE1 (idxl->train_thresholds);
254  WRITEVECTOR (idxl->thresholds);
255  WRITE1 (idxl->bytes_per_vec);
256  write_VectorTransform (&idxl->rrot, f);
257  WRITEVECTOR (idxl->codes);
258  } else if(const IndexPQ * idxp = dynamic_cast<const IndexPQ *> (idx)) {
259  uint32_t h = fourcc ("IxPq");
260  WRITE1 (h);
261  write_index_header (idx, f);
262  write_ProductQuantizer (&idxp->pq, f);
263  WRITEVECTOR (idxp->codes);
264  // search params -- maybe not useful to store?
265  WRITE1 (idxp->search_type);
266  WRITE1 (idxp->encode_signs);
267  WRITE1 (idxp->polysemous_ht);
268  } else if(const IndexScalarQuantizer * idxs =
269  dynamic_cast<const IndexScalarQuantizer *> (idx)) {
270  uint32_t h = fourcc ("IxSQ");
271  WRITE1 (h);
272  write_index_header (idx, f);
273  write_ScalarQuantizer (&idxs->sq, f);
274  WRITEVECTOR (idxs->codes);
275  } else if(const IndexIVFFlat * ivfl =
276  dynamic_cast<const IndexIVFFlat *> (idx)) {
277  uint32_t h = fourcc ("IvFL");
278  WRITE1 (h);
279  write_ivf_header (ivfl, f);
280  for(int i = 0; i < ivfl->nlist; i++)
281  WRITEVECTOR (ivfl->codes[i]);
282  } else if(const IndexIVFScalarQuantizer * ivsc =
283  dynamic_cast<const IndexIVFScalarQuantizer *> (idx)) {
284  uint32_t h = fourcc ("IvSQ");
285  WRITE1 (h);
286  write_ivf_header (ivsc, f);
287  write_ScalarQuantizer (&ivsc->sq, f);
288  WRITE1 (ivsc->code_size);
289  for(int i = 0; i < ivsc->nlist; i++)
290  WRITEVECTOR (ivsc->codes[i]);
291  } else if(const IndexIVFPQ * ivpq =
292  dynamic_cast<const IndexIVFPQ *> (idx)) {
293  const IndexIVFPQR * ivfpqr = dynamic_cast<const IndexIVFPQR *> (idx);
294  const IndexIVFPQCompact * ivfpqc =
295  dynamic_cast<const IndexIVFPQCompact *> (idx);
296  uint32_t h = fourcc (ivfpqr ? "IvQR" : ivfpqc ? "IvPC" : "IvPQ");
297  WRITE1 (h);
298  write_ivf_header (ivpq, f, !ivfpqc);
299  WRITE1 (ivpq->by_residual);
300  WRITE1 (ivpq->code_size);
301  write_ProductQuantizer (&ivpq->pq, f);
302  if (!ivfpqc) {
303  for(int i = 0; i < ivpq->codes.size(); i++)
304  WRITEVECTOR (ivpq->codes[i]);
305  }
306  if (ivfpqr) {
307  write_ProductQuantizer (&ivfpqr->refine_pq, f);
308  WRITEVECTOR (ivfpqr->refine_codes);
309  WRITE1 (ivfpqr->k_factor);
310  }
311  if (ivfpqc) {
312  WRITETABPAD16 (ivfpqc->limits, ivfpqc->nlist + 1);
313  WRITETABPAD16 (ivfpqc->compact_ids, ivfpqc->ntotal);
314  WRITETABPAD16 (ivfpqc->compact_codes,
315  ivfpqc->ntotal * ivfpqc->code_size);
316  }
317  } else if(const IndexPreTransform * ixpt =
318  dynamic_cast<const IndexPreTransform *> (idx)) {
319  uint32_t h = fourcc ("IxPT");
320  WRITE1 (h);
321  write_index_header (ixpt, f);
322  int nt = ixpt->chain.size();
323  WRITE1 (nt);
324  for (int i = 0; i < nt; i++)
325  write_VectorTransform (ixpt->chain[i], f);
326  write_index (ixpt->index, f);
327  } else if(const MultiIndexQuantizer * imiq =
328  dynamic_cast<const MultiIndexQuantizer *> (idx)) {
329  uint32_t h = fourcc ("Imiq");
330  WRITE1 (h);
331  write_index_header (imiq, f);
332  write_ProductQuantizer (&imiq->pq, f);
333  } else if(const IndexRefineFlat * idxrf =
334  dynamic_cast<const IndexRefineFlat *> (idx)) {
335  uint32_t h = fourcc ("IxRF");
336  WRITE1 (h);
337  write_index_header (idxrf, f);
338  write_index (idxrf->base_index, f);
339  write_index (&idxrf->refine_index, f);
340  WRITE1 (idxrf->k_factor);
341  } else if(const IndexIDMap * idxmap =
342  dynamic_cast<const IndexIDMap *> (idx)) {
343  uint32_t h =
344  dynamic_cast<const IndexIDMap2 *> (idx) ? fourcc ("IxM2") :
345  fourcc ("IxMp");
346  // no need to store additional info for IndexIDMap2
347  WRITE1 (h);
348  write_index_header (idxmap, f);
349  write_index (idxmap->index, f);
350  WRITEVECTOR (idxmap->id_map);
351  } else {
352  FAISS_THROW_MSG ("don't know how to serialize this type of index");
353  }
354 }
355 
356 void write_index (const Index *idx, const char *fname) {
357  FILE *f = fopen (fname, "w");
358  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
359  ScopeFileCloser closer(f);
360  write_index (idx, f);
361 }
362 
363 void write_VectorTransform (const VectorTransform *vt, const char *fname) {
364  FILE *f = fopen (fname, "w");
365  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
366  ScopeFileCloser closer(f);
367  write_VectorTransform (vt, f);
368 }
369 
370 /*************************************************************
371  * Read
372  **************************************************************/
373 
374 static void read_index_header (Index *idx, FILE *f) {
375  READ1 (idx->d);
376  READ1 (idx->ntotal);
377  Index::idx_t dummy;
378  READ1 (dummy);
379  READ1 (dummy);
380  READ1 (idx->is_trained);
381  READ1 (idx->metric_type);
382  idx->verbose = false;
383 }
384 
385 VectorTransform* read_VectorTransform (FILE *f) {
386  uint32_t h;
387  READ1 (h);
388  VectorTransform *vt = nullptr;
389 
390  if (h == fourcc ("rrot") || h == fourcc ("PCAm") ||
391  h == fourcc ("LTra") || h == fourcc ("PcAm")) {
392  LinearTransform *lt = nullptr;
393  if (h == fourcc ("rrot")) {
394  lt = new RandomRotationMatrix ();
395  } else if (h == fourcc ("PCAm") ||
396  h == fourcc ("PcAm")) {
397  PCAMatrix * pca = new PCAMatrix ();
398  READ1 (pca->eigen_power);
399  READ1 (pca->random_rotation);
400  if (h == fourcc ("PcAm"))
401  READ1 (pca->balanced_bins);
402  READVECTOR (pca->mean);
403  READVECTOR (pca->eigenvalues);
404  READVECTOR (pca->PCAMat);
405  lt = pca;
406  } else if (h == fourcc ("LTra")) {
407  lt = new LinearTransform ();
408  }
409  READ1 (lt->have_bias);
410  READVECTOR (lt->A);
411  READVECTOR (lt->b);
412  vt = lt;
413  } else if (h == fourcc ("RmDT")) {
414  RemapDimensionsTransform *rdt = new RemapDimensionsTransform ();
415  READVECTOR (rdt->map);
416  vt = rdt;
417  } else if (h == fourcc ("VNrm")) {
418  NormalizationTransform *nt = new NormalizationTransform ();
419  READ1 (nt->norm);
420  vt = nt;
421  } else {
422  FAISS_THROW_MSG("fourcc not recognized");
423  }
424  READ1 (vt->d_in);
425  READ1 (vt->d_out);
426  READ1 (vt->is_trained);
427  return vt;
428 }
429 
430 static void read_ProductQuantizer (ProductQuantizer *pq, FILE *f) {
431  READ1 (pq->d);
432  READ1 (pq->M);
433  READ1 (pq->nbits);
434  pq->set_derived_values ();
435  READVECTOR (pq->centroids);
436 }
437 
438 static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
439  READ1 (ivsc->qtype);
440  READ1 (ivsc->rangestat);
441  READ1 (ivsc->rangestat_arg);
442  READ1 (ivsc->d);
443  READ1 (ivsc->code_size);
444  READVECTOR (ivsc->trained);
445 }
446 
447 
448 ProductQuantizer * read_ProductQuantizer (const char*fname) {
449  FILE *f = fopen (fname, "r");
450  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
451  ScopeFileCloser closer(f);
452  ProductQuantizer *pq = new ProductQuantizer();
453  ScopeDeleter1<ProductQuantizer> del (pq);
454  read_ProductQuantizer(pq, f);
455  del.release ();
456  return pq;
457 }
458 
459 static void read_ivf_header (IndexIVF * ivf, FILE *f,
460  bool include_ids = true) {
461  read_index_header (ivf, f);
462  READ1 (ivf->nlist);
463  READ1 (ivf->nprobe);
464  ivf->quantizer = read_index (f);
465  ivf->own_fields = true;
466  if (include_ids) {
467  ivf->ids.resize (ivf->nlist);
468  for (size_t i = 0; i < ivf->nlist; i++)
469  READVECTOR (ivf->ids[i]);
470  }
471  READ1 (ivf->maintain_direct_map);
472  READVECTOR (ivf->direct_map);
473 }
474 
475 static IndexIVFPQ *read_ivfpq (FILE *f, uint32_t h, bool try_mmap)
476 {
477 
478  IndexIVFPQR *ivfpqr =
479  h == fourcc ("IvQR") ? new IndexIVFPQR () : nullptr;
480  IndexIVFPQCompact *ivfpqc =
481  h == fourcc ("IvPC") ? new IndexIVFPQCompact () : nullptr;
482  IndexIVFPQ * ivpq = ivfpqr ? ivfpqr : ivfpqc ? ivfpqc : new IndexIVFPQ ();
483  read_ivf_header (ivpq, f, !ivfpqc);
484  READ1 (ivpq->by_residual);
485  READ1 (ivpq->code_size);
486  read_ProductQuantizer (&ivpq->pq, f);
487  if (!ivfpqc) {
488  ivpq->codes.resize (ivpq->nlist);
489  for (size_t i = 0; i < ivpq->nlist; i++)
490  READVECTOR (ivpq->codes[i]);
491  }
492  // precomputed table not stored. It is cheaper to recompute it
493  ivpq->use_precomputed_table = 0;
494  if (ivpq->by_residual)
495  ivpq->precompute_table ();
496  if (ivfpqr) {
497  read_ProductQuantizer (&ivfpqr->refine_pq, f);
498  READVECTOR (ivfpqr->refine_codes);
499  READ1 (ivfpqr->k_factor);
500  }
501  if (ivfpqc) {
502  if (!try_mmap) {
503  READTABPAD16 (ivfpqc->limits, uint32_t, ivfpqc->nlist + 1);
504  READTABPAD16 (ivfpqc->compact_ids, uint32_t, ivfpqc->ntotal);
505  READTABPAD16 (ivfpqc->compact_codes, uint8_t,
506  ivfpqc->ntotal * ivfpqc->code_size);
507  } else {
508  long offset_limits, offset_compact_ids, offset_compact_codes;
509  TABOFFSETPAD16 (offset_limits, uint32_t, ivfpqc->nlist + 1);
510  TABOFFSETPAD16 (offset_compact_ids, uint32_t, ivfpqc->ntotal);
511  TABOFFSETPAD16 (offset_compact_codes, uint8_t,
512  ivfpqc->ntotal * ivfpqc->code_size);
513  ivfpqc->mmap_length = ftell (f);
514  // mmap the whole file
515  ivfpqc->mmap_buffer = (char*)mmap (
516  nullptr, ivfpqc->mmap_length,
517  PROT_READ, MAP_SHARED, fileno (f), 0);
518  if (!ivfpqc->mmap_buffer) {
519  perror ("mmap failed");
520  abort ();
521  }
522  // at this point the file can be closed, it does not
523  // invalidate the mapping
524  ivfpqc->limits = (uint32_t*)(ivfpqc->mmap_buffer + offset_limits);
525  ivfpqc->compact_ids = (uint32_t*)(ivfpqc->mmap_buffer +
526  offset_compact_ids);
527  ivfpqc->compact_codes = (uint8_t*)(ivfpqc->mmap_buffer +
528  offset_compact_codes);
529  }
530  }
531  return ivpq;
532 }
533 
534 int read_old_fmt_hack = 0;
535 
536 Index *read_index (FILE * f, bool try_mmap) {
537  Index * idx = nullptr;
538  uint32_t h;
539  READ1 (h);
540  if (h == fourcc ("IxFI") || h == fourcc ("IxF2")) {
541  IndexFlat *idxf;
542  if (h == fourcc ("IxFI")) idxf = new IndexFlatIP ();
543  else idxf = new IndexFlatL2 ();
544  read_index_header (idxf, f);
545  READVECTOR (idxf->xb);
546  FAISS_THROW_IF_NOT (idxf->xb.size() == idxf->ntotal * idxf->d);
547  // leak!
548  idx = idxf;
549  } else if (h == fourcc("IxHE") || h == fourcc("IxHe")) {
550  IndexLSH * idxl = new IndexLSH ();
551  read_index_header (idxl, f);
552  READ1 (idxl->nbits);
553  READ1 (idxl->rotate_data);
554  READ1 (idxl->train_thresholds);
555  READVECTOR (idxl->thresholds);
556  READ1 (idxl->bytes_per_vec);
557  if (h == fourcc("IxHE")) {
558  FAISS_THROW_IF_NOT_FMT (idxl->nbits % 64 == 0,
559  "can only read old format IndexLSH with "
560  "nbits multiple of 64 (got %d)",
561  (int) idxl->nbits);
562  // leak
563  idxl->bytes_per_vec *= 8;
564  }
565  {
566  RandomRotationMatrix *rrot = dynamic_cast<RandomRotationMatrix *>
567  (read_VectorTransform (f));
568  FAISS_THROW_IF_NOT_MSG(rrot, "expected a random rotation");
569  idxl->rrot = *rrot;
570  delete rrot;
571  }
572  READVECTOR (idxl->codes);
573  FAISS_THROW_IF_NOT (idxl->rrot.d_in == idxl->d &&
574  idxl->rrot.d_out == idxl->nbits);
575  FAISS_THROW_IF_NOT (
576  idxl->codes.size() == idxl->ntotal * idxl->bytes_per_vec);
577  idx = idxl;
578  } else if (h == fourcc ("IxPQ") || h == fourcc ("IxPo") ||
579  h == fourcc ("IxPq")) {
580  // IxPQ and IxPo were merged into the same IndexPQ object
581  IndexPQ * idxp =new IndexPQ ();
582  read_index_header (idxp, f);
583  read_ProductQuantizer (&idxp->pq, f);
584  READVECTOR (idxp->codes);
585  if (h == fourcc ("IxPo") || h == fourcc ("IxPq")) {
586  READ1 (idxp->search_type);
587  READ1 (idxp->encode_signs);
588  READ1 (idxp->polysemous_ht);
589  }
590  // Old versoins of PQ all had metric_type set to INNER_PRODUCT
591  // when they were in fact using L2. Therefore, we force metric type
592  // to L2 when the old format is detected
593  if (h == fourcc ("IxPQ") || h == fourcc ("IxPo")) {
594  idxp->metric_type = METRIC_L2;
595  }
596  idx = idxp;
597  } else if (h == fourcc ("IvFl") || h == fourcc("IvFL")) {
598  IndexIVFFlat * ivfl = new IndexIVFFlat ();
599  read_ivf_header (ivfl, f);
600  ivfl->code_size = ivfl->d * sizeof(float);
601  ivfl->codes.resize (ivfl->nlist);
602  if (h == fourcc ("IvFL")) {
603  for (size_t i = 0; i < ivfl->nlist; i++) {
604  READVECTOR (ivfl->codes[i]);
605  }
606  } else { // old format
607  for (size_t i = 0; i < ivfl->nlist; i++) {
608  std::vector<float> vec;
609  READVECTOR (vec);
610  ivfl->codes[i].resize(vec.size() * sizeof(float));
611  memcpy(ivfl->codes[i].data(), vec.data(),
612  ivfl->codes[i].size());
613  }
614  }
615  idx = ivfl;
616  } else if (h == fourcc ("IxSQ")) {
618  read_index_header (idxs, f);
619  read_ScalarQuantizer (&idxs->sq, f);
620  READVECTOR (idxs->codes);
621  idxs->code_size = idxs->sq.code_size;
622  idx = idxs;
623  } else if(h == fourcc ("IvSQ")) {
625  read_ivf_header (ivsc, f);
626  ivsc->codes.resize(ivsc->nlist);
627  read_ScalarQuantizer (&ivsc->sq, f);
628  READ1 (ivsc->code_size);
629  for(int i = 0; i < ivsc->nlist; i++)
630  READVECTOR (ivsc->codes[i]);
631  idx = ivsc;
632  } else if(h == fourcc ("IvPQ") || h == fourcc ("IvQR") ||
633  h == fourcc ("IvPC")) {
634 
635  idx = read_ivfpq (f, h, try_mmap);
636 
637  } else if(h == fourcc ("IxPT")) {
638  IndexPreTransform * ixpt = new IndexPreTransform();
639  ixpt->own_fields = true;
640  read_index_header (ixpt, f);
641  int nt;
642  if (read_old_fmt_hack == 2) {
643  nt = 1;
644  } else {
645  READ1 (nt);
646  }
647  for (int i = 0; i < nt; i++) {
648  ixpt->chain.push_back (read_VectorTransform (f));
649  }
650  ixpt->index = read_index (f);
651  idx = ixpt;
652  } else if(h == fourcc ("Imiq")) {
654  read_index_header (imiq, f);
655  read_ProductQuantizer (&imiq->pq, f);
656  idx = imiq;
657  } else if(h == fourcc ("IxRF")) {
658  IndexRefineFlat *idxrf = new IndexRefineFlat ();
659  read_index_header (idxrf, f);
660  idxrf->base_index = read_index(f);
661  idxrf->own_fields = true;
662  IndexFlat *rf = dynamic_cast<IndexFlat*> (read_index (f));
663  std::swap (*rf, idxrf->refine_index);
664  delete rf;
665  READ1 (idxrf->k_factor);
666  idx = idxrf;
667  } else if(h == fourcc ("IxMp") || h == fourcc ("IxM2")) {
668  bool is_map2 = h == fourcc ("IxM2");
669  IndexIDMap * idxmap = is_map2 ? new IndexIDMap2 () : new IndexIDMap ();
670  read_index_header (idxmap, f);
671  idxmap->index = read_index (f);
672  idxmap->own_fields = true;
673  READVECTOR (idxmap->id_map);
674  if (is_map2) {
675  static_cast<IndexIDMap2*>(idxmap)->construct_rev_map ();
676  }
677  idx = idxmap;
678  } else {
679  FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
680  idx = nullptr;
681  }
682  return idx;
683 }
684 
685 
686 
687 Index *read_index (const char *fname, bool try_mmap) {
688  FILE *f = fopen (fname, "r");
689  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for reading:", fname);
690  Index *idx = read_index (f, try_mmap);
691  fclose (f);
692  return idx;
693 }
694 
695 VectorTransform *read_VectorTransform (const char *fname) {
696  FILE *f = fopen (fname, "r");
697  if (!f) {
698  fprintf (stderr, "cannot open %s for reading:", fname);
699  perror ("");
700  abort ();
701  }
702  VectorTransform *vt = read_VectorTransform (f);
703  fclose (f);
704  return vt;
705 }
706 
707 /*************************************************************
708  * cloning functions
709  **************************************************************/
710 
711 
712 
713 Index * clone_index (const Index *index)
714 {
715  Cloner cl;
716  return cl.clone_Index (index);
717 }
718 
// Copy-construct and return obj as a classname when it is one, otherwise
// fall through to the statement that follows (the macro ends on a
// dangling `else`). Relies on classname's copy constructor, so callers
// must list the most specific classes first.
#define TRYCLONE(classname, obj)                                        \
    if (const classname *typed_src =                                    \
            dynamic_cast<const classname *>(obj)) {                     \
        return new classname(*typed_src);                               \
    } else
725 
726 VectorTransform *Cloner::clone_VectorTransform (const VectorTransform *vt)
727 {
728  TRYCLONE (RemapDimensionsTransform, vt)
729  TRYCLONE (OPQMatrix, vt)
730  TRYCLONE (PCAMatrix, vt)
731  TRYCLONE (RandomRotationMatrix, vt)
732  TRYCLONE (LinearTransform, vt)
733  {
734  FAISS_THROW_MSG("clone not supported for this type of VectorTransform");
735  }
736  return nullptr;
737 }
738 
739 IndexIVF * Cloner::clone_IndexIVF (const IndexIVF *ivf)
740 {
741  TRYCLONE (IndexIVFPQR, ivf)
742  TRYCLONE (IndexIVFPQ, ivf)
743  TRYCLONE (IndexIVFFlat, ivf)
744  TRYCLONE (IndexIVFScalarQuantizer, ivf)
745  {
746  FAISS_THROW_MSG("clone not supported for this type of IndexIVF");
747  }
748  return nullptr;
749 }
750 
751 Index *Cloner::clone_Index (const Index *index)
752 {
753  TRYCLONE (IndexPQ, index)
754  TRYCLONE (IndexLSH, index)
755  TRYCLONE (IndexFlatL2, index)
756  TRYCLONE (IndexFlatIP, index)
757  TRYCLONE (IndexFlat, index)
758  TRYCLONE (IndexScalarQuantizer, index)
759  TRYCLONE (MultiIndexQuantizer, index)
760  if (const IndexIVF * ivf = dynamic_cast<const IndexIVF*>(index)) {
761  IndexIVF *res = clone_IndexIVF (ivf);
762  res->own_fields = true;
763  res->quantizer = clone_Index (ivf->quantizer);
764  return res;
765  } else if (const IndexPreTransform * ipt =
766  dynamic_cast<const IndexPreTransform*> (index)) {
767  IndexPreTransform *res = new IndexPreTransform ();
768  res->d = ipt->d;
769  res->index = clone_Index (ipt->index);
770  for (int i = 0; i < ipt->chain.size(); i++)
771  res->chain.push_back (clone_VectorTransform (ipt->chain[i]));
772  res->own_fields = true;
773  return res;
774  } else {
775  FAISS_THROW_MSG( "clone not supported for this type of Index");
776  }
777  return nullptr;
778 }
779 
780 
781 } // namespace faiss
std::vector< uint8_t > codes
Codes. Size ntotal * pq.code_size.
Definition: IndexPQ.h:34
size_t code_size
bytes per vector
Index * index
the sub-index
Randomly rotate a set of vectors.
Index * read_index(FILE *f, bool try_mmap)
Definition: index_io.cpp:536
int bytes_per_vec
nb of 8-bits per encoded vector
Definition: IndexLSH.h:28
std::vector< float > thresholds
thresholds to compare with
Definition: IndexLSH.h:34
bool train_thresholds
whether we train thresholds or use 0
Definition: IndexLSH.h:30
Index * base_index
faster index to pre-select the vectors that should be filtered
Definition: IndexFlat.h:109
IndexFlat refine_index
storage for full vectors
Definition: IndexFlat.h:106
bool own_fields
should the base index be deallocated?
Definition: IndexFlat.h:110
int d
vector dimension
Definition: Index.h:64
std::vector< long > id_map
! whether pointers are deleted in destructo
Definition: MetaIndexes.h:29
std::vector< uint8_t > codes
Codes. Size ntotal * pq.code_size.
RandomRotationMatrix rrot
optional random rotation
Definition: IndexLSH.h:32
ScalarQuantizer sq
Used to encode the vectors.
long idx_t
all indices are this type
Definition: Index.h:62
ProductQuantizer pq
The product quantizer used to encode the vectors.
Definition: IndexPQ.h:31
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:65
bool own_fields
whether pointers are deleted in destructor
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:72
size_t nlist
number of possible key values
Definition: IndexIVF.h:46
int d_out
output dimension
int nbits
nb of bits per vector
Definition: IndexLSH.h:27
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:69
std::vector< float > xb
database vectors, size ntotal * d
Definition: IndexFlat.h:25
int polysemous_ht
Hamming threshold used for polysemy.
Definition: IndexPQ.h:91
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:64
bool rotate_data
whether to apply a random rotation to input
Definition: IndexLSH.h:29
std::vector< uint8_t > codes
encoded dataset
Definition: IndexLSH.h:37
bool own_fields
whether pointers are deleted in destructor
Definition: MetaIndexes.h:28