Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/index_io.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #include "index_io.h"
12 
13 #include <cstdio>
14 #include <cstdlib>
15 
16 #include <sys/mman.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <unistd.h>
20 
21 #include "FaissAssert.h"
22 
23 #include "IndexFlat.h"
24 #include "VectorTransform.h"
25 #include "IndexLSH.h"
26 #include "IndexPQ.h"
27 #include "IndexIVF.h"
28 #include "IndexIVFPQ.h"
29 #include "IndexIVFFlat.h"
30 #include "MetaIndexes.h"
31 #include "IndexScalarQuantizer.h"
32 #include "IndexHNSW.h"
33 #include "OnDiskInvertedLists.h"
34 
35 
36 
37 /*************************************************************
38  * The I/O format is the content of the class. For objects that are
39  * inherited, like Index, a 4-character-code (fourcc) indicates which
40  * child class this is an instance of.
41  *
42  * In this case, the fields of the parent class are written first,
43  * then the ones for the child classes. Note that this requires
44  * classes to be serialized to have a constructor without parameters,
45  * so that the fields can be filled in later. The default constructor
46  * should set reasonable defaults for all fields.
47  *
48  * The fourccs are assigned arbitrarily. When the class changed (added
49  * or deprecated fields), the fourcc can be replaced. New code should
50  * be able to read the old fourcc and fill in new classes.
51  *
52  * TODO: serialization to strings for use in Python pickle or Torch
53  * serialization.
54  *
55  * TODO: in this file, the read functions that encounter errors may
56  * leak memory.
57  **************************************************************/
58 
59 
60 
61 namespace faiss {
62 
/**
 * Pack a 4-character code into a 32-bit tag.
 *
 * The first character ends up in the least significant byte, so the tag
 * written to disk by WRITE1 on a little-endian machine spells the code.
 *
 * @param sx  pointer to at least 4 readable characters (must not be null)
 * @return    sx[0] | sx[1] << 8 | sx[2] << 16 | sx[3] << 24
 */
static uint32_t fourcc (const char sx[4]) {
    const unsigned char *x = reinterpret_cast<const unsigned char *> (sx);
    // widen to uint32_t before shifting: an unsigned char promotes to
    // (signed) int, and shifting a byte >= 0x80 left by 24 overflows it
    return  static_cast<uint32_t> (x[0])        |
           (static_cast<uint32_t> (x[1]) << 8)  |
           (static_cast<uint32_t> (x[2]) << 16) |
           (static_cast<uint32_t> (x[3]) << 24);
}
67 
68 /*************************************************************
69  * I/O macros
70  *
71  * we use macros so that we have a line number to report in
72  * abort (). This makes debugging a lot easier.
73  **************************************************************/
74 
75 
// All macros below use a FILE* variable named `f` that must be visible
// at the point of use. Each one is wrapped in do { } while (0) so that it
// behaves as a single statement (safe inside un-braced if / else / for).

// Write n elements starting at ptr; raises a FAISS error on a short write.
#define WRITEANDCHECK(ptr, n) do {                                  \
        size_t ret = fwrite (ptr, sizeof (* (ptr)), n, f);          \
        FAISS_THROW_IF_NOT_MSG (ret == (n), "write error");         \
    } while (0)

// Read n elements into ptr; raises a FAISS error on a short read.
#define READANDCHECK(ptr, n) do {                                   \
        size_t ret = fread (ptr, sizeof (* (ptr)), n, f);           \
        FAISS_THROW_IF_NOT_MSG (ret == (n), "read error");          \
    } while (0)

// Write / read a single object by its raw bytes.
#define WRITE1(x) WRITEANDCHECK(&(x), 1)
#define READ1(x)  READANDCHECK(&(x), 1)

// Write a vector as: element count (size_t) followed by the raw elements.
#define WRITEVECTOR(vec) do {                                       \
        size_t size = (vec).size ();                                \
        WRITEANDCHECK (&size, 1);                                   \
        WRITEANDCHECK ((vec).data (), size);                        \
    } while (0)

// Read a vector written by WRITEVECTOR. The count is read as size_t to
// mirror the writer, and is rejected when it is 2^40 or more so that a
// corrupted file cannot trigger an absurd allocation.
#define READVECTOR(vec) do {                                        \
        size_t size;                                                \
        READANDCHECK (&size, 1);                                    \
        FAISS_THROW_IF_NOT (                                        \
            static_cast<uint64_t> (size) < (uint64_t (1) << 40));   \
        (vec).resize (size);                                        \
        READANDCHECK ((vec).data (), size);                         \
    } while (0)

102 
/// Scope guard: calls fclose() on the wrapped stream when destroyed.
/// The stream must be non-null (callers check fopen's result first).
struct ScopeFileCloser {
    FILE *f;
    ScopeFileCloser (FILE *f): f (f) {}
    // copying would make two guards close the same stream
    ScopeFileCloser (const ScopeFileCloser &) = delete;
    ScopeFileCloser & operator= (const ScopeFileCloser &) = delete;
    ~ScopeFileCloser () {fclose (f); }
};
108 
109 
110 
111 
112 
113 /*************************************************************
114  * Write
115  **************************************************************/
116 
117 static void write_index_header (const Index *idx, FILE *f) {
118  WRITE1 (idx->d);
119  WRITE1 (idx->ntotal);
120  Index::idx_t dummy = 1 << 20;
121  WRITE1 (dummy);
122  WRITE1 (dummy);
123  WRITE1 (idx->is_trained);
124  WRITE1 (idx->metric_type);
125 }
126 
127 void write_VectorTransform (const VectorTransform *vt, FILE *f) {
128  if (const LinearTransform * lt =
129  dynamic_cast < const LinearTransform *> (vt)) {
130  if (dynamic_cast<const RandomRotationMatrix *>(lt)) {
131  uint32_t h = fourcc ("rrot");
132  WRITE1 (h);
133  } else if (const PCAMatrix * pca =
134  dynamic_cast<const PCAMatrix *>(lt)) {
135  uint32_t h = fourcc ("PcAm");
136  WRITE1 (h);
137  WRITE1 (pca->eigen_power);
138  WRITE1 (pca->random_rotation);
139  WRITE1 (pca->balanced_bins);
140  WRITEVECTOR (pca->mean);
141  WRITEVECTOR (pca->eigenvalues);
142  WRITEVECTOR (pca->PCAMat);
143  } else {
144  // generic LinearTransform (includes OPQ)
145  uint32_t h = fourcc ("LTra");
146  WRITE1 (h);
147  }
148  WRITE1 (lt->have_bias);
149  WRITEVECTOR (lt->A);
150  WRITEVECTOR (lt->b);
151  } else if (const RemapDimensionsTransform *rdt =
152  dynamic_cast<const RemapDimensionsTransform *>(vt)) {
153  uint32_t h = fourcc ("RmDT");
154  WRITE1 (h);
155  WRITEVECTOR (rdt->map);
156  } else if (const NormalizationTransform *nt =
157  dynamic_cast<const NormalizationTransform *>(vt)) {
158  uint32_t h = fourcc ("VNrm");
159  WRITE1 (h);
160  WRITE1 (nt->norm);
161  } else {
162  FAISS_THROW_MSG ("cannot serialize this");
163  }
164  // common fields
165  WRITE1 (vt->d_in);
166  WRITE1 (vt->d_out);
167  WRITE1 (vt->is_trained);
168 }
169 
170 static void write_ProductQuantizer (const ProductQuantizer *pq, FILE *f) {
171  WRITE1 (pq->d);
172  WRITE1 (pq->M);
173  WRITE1 (pq->nbits);
174  WRITEVECTOR (pq->centroids);
175 }
176 
177 static void write_ScalarQuantizer (const ScalarQuantizer *ivsc, FILE *f) {
178  WRITE1 (ivsc->qtype);
179  WRITE1 (ivsc->rangestat);
180  WRITE1 (ivsc->rangestat_arg);
181  WRITE1 (ivsc->d);
182  WRITE1 (ivsc->code_size);
183  WRITEVECTOR (ivsc->trained);
184 }
185 
186 static void write_InvertedLists (const InvertedLists *ils, FILE *f) {
187  if (ils == nullptr) {
188  uint32_t h = fourcc ("il00");
189  WRITE1 (h);
190  } else if (const auto & ails =
191  dynamic_cast<const ArrayInvertedLists *>(ils)) {
192  uint32_t h = fourcc ("ilar");
193  WRITE1 (h);
194  WRITE1 (ails->nlist);
195  WRITE1 (ails->code_size);
196  // here we store either as a full or a sparse data buffer
197  size_t n_non0 = 0;
198  for (size_t i = 0; i < ails->nlist; i++) {
199  if (ails->ids[i].size() > 0)
200  n_non0++;
201  }
202  if (n_non0 > ails->nlist / 2) {
203  uint32_t list_type = fourcc("full");
204  WRITE1 (list_type);
205  std::vector<size_t> sizes;
206  for (size_t i = 0; i < ails->nlist; i++) {
207  sizes.push_back (ails->ids[i].size());
208  }
209  WRITEVECTOR (sizes);
210  } else {
211  int list_type = fourcc("sprs"); // sparse
212  WRITE1 (list_type);
213  std::vector<size_t> sizes;
214  for (size_t i = 0; i < ails->nlist; i++) {
215  size_t n = ails->ids[i].size();
216  if (n > 0) {
217  sizes.push_back (i);
218  sizes.push_back (n);
219  }
220  }
221  WRITEVECTOR (sizes);
222  }
223  // make a single contiguous data buffer (useful for mmapping)
224  for (size_t i = 0; i < ails->nlist; i++) {
225  size_t n = ails->ids[i].size();
226  if (n > 0) {
227  WRITEANDCHECK (ails->codes[i].data(), n * ails->code_size);
228  WRITEANDCHECK (ails->ids[i].data(), n);
229  }
230  }
231  } else if (const auto & od =
232  dynamic_cast<const OnDiskInvertedLists *>(ils)) {
233  uint32_t h = fourcc ("ilod");
234  WRITE1 (h);
235  WRITE1 (ils->nlist);
236  WRITE1 (ils->code_size);
237  // this is a POD object
238  WRITEVECTOR (od->lists);
239 
240  {
241  std::vector<OnDiskInvertedLists::Slot> v(
242  od->slots.begin(), od->slots.end());
243  WRITEVECTOR(v);
244  }
245  {
246  std::vector<char> x(od->filename.begin(), od->filename.end());
247  WRITEVECTOR(x);
248  }
249  WRITE1(od->totsize);
250 
251  } else {
252  FAISS_THROW_MSG ("write_InvertedLists: unsupported invlist type");
253  }
254 }
255 
256 
257 void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
258  FILE *f = fopen (fname, "w");
259  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
260  ScopeFileCloser closer(f);
261  write_ProductQuantizer (pq, f);
262 }
263 
264 static void write_HNSW (const HNSW *hnsw, FILE *f) {
265 
266  WRITEVECTOR (hnsw->assign_probas);
267  WRITEVECTOR (hnsw->cum_nneighbor_per_level);
268  WRITEVECTOR (hnsw->levels);
269  WRITEVECTOR (hnsw->offsets);
270  WRITEVECTOR (hnsw->neighbors);
271 
272  WRITE1 (hnsw->entry_point);
273  WRITE1 (hnsw->max_level);
274  WRITE1 (hnsw->efConstruction);
275  WRITE1 (hnsw->efSearch);
276  WRITE1 (hnsw->upper_beam);
277 
278 }
279 
280 static void write_ivf_header (const IndexIVF * ivf, FILE *f) {
281  write_index_header (ivf, f);
282  WRITE1 (ivf->nlist);
283  WRITE1 (ivf->nprobe);
284  write_index (ivf->quantizer, f);
285  WRITE1 (ivf->maintain_direct_map);
286  WRITEVECTOR (ivf->direct_map);
287 }
288 
289 void write_index (const Index *idx, FILE *f) {
290  if (const IndexFlat * idxf = dynamic_cast<const IndexFlat *> (idx)) {
291  uint32_t h = fourcc (
292  idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI" :
293  idxf->metric_type == METRIC_L2 ? "IxF2" : nullptr);
294  WRITE1 (h);
295  write_index_header (idx, f);
296  WRITEVECTOR (idxf->xb);
297  } else if(const IndexLSH * idxl = dynamic_cast<const IndexLSH *> (idx)) {
298  uint32_t h = fourcc ("IxHe");
299  WRITE1 (h);
300  write_index_header (idx, f);
301  WRITE1 (idxl->nbits);
302  WRITE1 (idxl->rotate_data);
303  WRITE1 (idxl->train_thresholds);
304  WRITEVECTOR (idxl->thresholds);
305  WRITE1 (idxl->bytes_per_vec);
306  write_VectorTransform (&idxl->rrot, f);
307  WRITEVECTOR (idxl->codes);
308  } else if(const IndexPQ * idxp = dynamic_cast<const IndexPQ *> (idx)) {
309  uint32_t h = fourcc ("IxPq");
310  WRITE1 (h);
311  write_index_header (idx, f);
312  write_ProductQuantizer (&idxp->pq, f);
313  WRITEVECTOR (idxp->codes);
314  // search params -- maybe not useful to store?
315  WRITE1 (idxp->search_type);
316  WRITE1 (idxp->encode_signs);
317  WRITE1 (idxp->polysemous_ht);
318  } else if(const Index2Layer * idxp =
319  dynamic_cast<const Index2Layer *> (idx)) {
320  uint32_t h = fourcc ("Ix2L");
321  WRITE1 (h);
322  write_index_header (idx, f);
323  write_index (idxp->q1.quantizer, f);
324  WRITE1 (idxp->q1.nlist);
325  WRITE1 (idxp->q1.quantizer_trains_alone);
326  write_ProductQuantizer (&idxp->pq, f);
327  WRITE1 (idxp->code_size_1);
328  WRITE1 (idxp->code_size_2);
329  WRITE1 (idxp->code_size);
330  WRITEVECTOR (idxp->codes);
331  } else if(const IndexScalarQuantizer * idxs =
332  dynamic_cast<const IndexScalarQuantizer *> (idx)) {
333  uint32_t h = fourcc ("IxSQ");
334  WRITE1 (h);
335  write_index_header (idx, f);
336  write_ScalarQuantizer (&idxs->sq, f);
337  WRITEVECTOR (idxs->codes);
338  } else if(const IndexIVFFlat * ivfl =
339  dynamic_cast<const IndexIVFFlat *> (idx)) {
340  uint32_t h = fourcc ("IwFl");
341  WRITE1 (h);
342  write_ivf_header (ivfl, f);
343  write_InvertedLists (ivfl->invlists, f);
344  } else if(const IndexIVFScalarQuantizer * ivsc =
345  dynamic_cast<const IndexIVFScalarQuantizer *> (idx)) {
346  uint32_t h = fourcc ("IwSQ");
347  WRITE1 (h);
348  write_ivf_header (ivsc, f);
349  write_ScalarQuantizer (&ivsc->sq, f);
350  WRITE1 (ivsc->code_size);
351  write_InvertedLists (ivsc->invlists, f);
352  } else if(const IndexIVFPQ * ivpq =
353  dynamic_cast<const IndexIVFPQ *> (idx)) {
354  const IndexIVFPQR * ivfpqr = dynamic_cast<const IndexIVFPQR *> (idx);
355 
356  uint32_t h = fourcc (ivfpqr ? "IwQR" : "IwPQ");
357  WRITE1 (h);
358  write_ivf_header (ivpq, f);
359  WRITE1 (ivpq->by_residual);
360  WRITE1 (ivpq->code_size);
361  write_ProductQuantizer (&ivpq->pq, f);
362  write_InvertedLists (ivpq->invlists, f);
363  if (ivfpqr) {
364  write_ProductQuantizer (&ivfpqr->refine_pq, f);
365  WRITEVECTOR (ivfpqr->refine_codes);
366  WRITE1 (ivfpqr->k_factor);
367  }
368 
369  } else if(const IndexPreTransform * ixpt =
370  dynamic_cast<const IndexPreTransform *> (idx)) {
371  uint32_t h = fourcc ("IxPT");
372  WRITE1 (h);
373  write_index_header (ixpt, f);
374  int nt = ixpt->chain.size();
375  WRITE1 (nt);
376  for (int i = 0; i < nt; i++)
377  write_VectorTransform (ixpt->chain[i], f);
378  write_index (ixpt->index, f);
379  } else if(const MultiIndexQuantizer * imiq =
380  dynamic_cast<const MultiIndexQuantizer *> (idx)) {
381  uint32_t h = fourcc ("Imiq");
382  WRITE1 (h);
383  write_index_header (imiq, f);
384  write_ProductQuantizer (&imiq->pq, f);
385  } else if(const IndexRefineFlat * idxrf =
386  dynamic_cast<const IndexRefineFlat *> (idx)) {
387  uint32_t h = fourcc ("IxRF");
388  WRITE1 (h);
389  write_index_header (idxrf, f);
390  write_index (idxrf->base_index, f);
391  write_index (&idxrf->refine_index, f);
392  WRITE1 (idxrf->k_factor);
393  } else if(const IndexIDMap * idxmap =
394  dynamic_cast<const IndexIDMap *> (idx)) {
395  uint32_t h =
396  dynamic_cast<const IndexIDMap2 *> (idx) ? fourcc ("IxM2") :
397  fourcc ("IxMp");
398  // no need to store additional info for IndexIDMap2
399  WRITE1 (h);
400  write_index_header (idxmap, f);
401  write_index (idxmap->index, f);
402  WRITEVECTOR (idxmap->id_map);
403  } else if(const IndexHNSW * idxhnsw =
404  dynamic_cast<const IndexHNSW *> (idx)) {
405  uint32_t h =
406  dynamic_cast<const IndexHNSWFlat*>(idx) ? fourcc("IHNf") :
407  dynamic_cast<const IndexHNSWPQ*>(idx) ? fourcc("IHNp") :
408  dynamic_cast<const IndexHNSWSQ*>(idx) ? fourcc("IHNs") :
409  dynamic_cast<const IndexHNSW2Level*>(idx) ? fourcc("IHN2") :
410  0;
411  FAISS_THROW_IF_NOT (h != 0);
412  WRITE1 (h);
413  write_index_header (idxhnsw, f);
414  write_HNSW (&idxhnsw->hnsw, f);
415  write_index (idxhnsw->storage, f);
416  } else {
417  FAISS_THROW_MSG ("don't know how to serialize this type of index");
418  }
419 }
420 
421 void write_index (const Index *idx, const char *fname) {
422  FILE *f = fopen (fname, "w");
423  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
424  ScopeFileCloser closer(f);
425  write_index (idx, f);
426 }
427 
428 void write_VectorTransform (const VectorTransform *vt, const char *fname) {
429  FILE *f = fopen (fname, "w");
430  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
431  ScopeFileCloser closer(f);
432  write_VectorTransform (vt, f);
433 }
434 
435 /*************************************************************
436  * Read
437  **************************************************************/
438 
439 static void read_index_header (Index *idx, FILE *f) {
440  READ1 (idx->d);
441  READ1 (idx->ntotal);
442  Index::idx_t dummy;
443  READ1 (dummy);
444  READ1 (dummy);
445  READ1 (idx->is_trained);
446  READ1 (idx->metric_type);
447  idx->verbose = false;
448 }
449 
450 VectorTransform* read_VectorTransform (FILE *f) {
451  uint32_t h;
452  READ1 (h);
453  VectorTransform *vt = nullptr;
454 
455  if (h == fourcc ("rrot") || h == fourcc ("PCAm") ||
456  h == fourcc ("LTra") || h == fourcc ("PcAm")) {
457  LinearTransform *lt = nullptr;
458  if (h == fourcc ("rrot")) {
459  lt = new RandomRotationMatrix ();
460  } else if (h == fourcc ("PCAm") ||
461  h == fourcc ("PcAm")) {
462  PCAMatrix * pca = new PCAMatrix ();
463  READ1 (pca->eigen_power);
464  READ1 (pca->random_rotation);
465  if (h == fourcc ("PcAm"))
466  READ1 (pca->balanced_bins);
467  READVECTOR (pca->mean);
468  READVECTOR (pca->eigenvalues);
469  READVECTOR (pca->PCAMat);
470  lt = pca;
471  } else if (h == fourcc ("LTra")) {
472  lt = new LinearTransform ();
473  }
474  READ1 (lt->have_bias);
475  READVECTOR (lt->A);
476  READVECTOR (lt->b);
477  FAISS_THROW_IF_NOT (lt->A.size() >= lt->d_in * lt->d_out);
478  FAISS_THROW_IF_NOT (!lt->have_bias || lt->b.size() >= lt->d_out);
479  lt->set_is_orthonormal();
480  vt = lt;
481  } else if (h == fourcc ("RmDT")) {
482  RemapDimensionsTransform *rdt = new RemapDimensionsTransform ();
483  READVECTOR (rdt->map);
484  vt = rdt;
485  } else if (h == fourcc ("VNrm")) {
486  NormalizationTransform *nt = new NormalizationTransform ();
487  READ1 (nt->norm);
488  vt = nt;
489  } else {
490  FAISS_THROW_MSG("fourcc not recognized");
491  }
492  READ1 (vt->d_in);
493  READ1 (vt->d_out);
494  READ1 (vt->is_trained);
495  return vt;
496 }
497 
498 
499 static void read_ArrayInvertedLists_sizes (
500  FILE *f, std::vector<size_t> & sizes)
501 {
502  size_t nlist = sizes.size();
503  uint32_t list_type;
504  READ1(list_type);
505  if (list_type == fourcc("full")) {
506  size_t os = sizes.size();
507  READVECTOR (sizes);
508  FAISS_THROW_IF_NOT (os == sizes.size());
509  } else if (list_type == fourcc("sprs")) {
510  std::vector<size_t> idsizes;
511  READVECTOR (idsizes);
512  for (size_t j = 0; j < idsizes.size(); j += 2) {
513  FAISS_THROW_IF_NOT (idsizes[j] < sizes.size());
514  sizes[idsizes[j]] = idsizes[j + 1];
515  }
516  } else {
517  FAISS_THROW_MSG ("invalid list_type");
518  }
519 }
520 
521 
522 InvertedLists *read_InvertedLists (FILE *f, int io_flags) {
523  uint32_t h;
524  READ1 (h);
525  if (h == fourcc ("il00")) {
526  return nullptr;
527  } else if (h == fourcc ("ilar") && !(io_flags & IO_FLAG_MMAP)) {
528  auto ails = new ArrayInvertedLists (0, 0);
529  READ1 (ails->nlist);
530  READ1 (ails->code_size);
531  ails->ids.resize (ails->nlist);
532  ails->codes.resize (ails->nlist);
533  std::vector<size_t> sizes (ails->nlist);
534  read_ArrayInvertedLists_sizes (f, sizes);
535  for (size_t i = 0; i < ails->nlist; i++) {
536  ails->ids[i].resize (sizes[i]);
537  ails->codes[i].resize (sizes[i] * ails->code_size);
538  }
539  for (size_t i = 0; i < ails->nlist; i++) {
540  size_t n = ails->ids[i].size();
541  if (n > 0) {
542  READANDCHECK (ails->codes[i].data(), n * ails->code_size);
543  READANDCHECK (ails->ids[i].data(), n);
544  }
545  }
546  return ails;
547  } else if (h == fourcc ("ilar") && (io_flags & IO_FLAG_MMAP)) {
548  auto ails = new OnDiskInvertedLists ();
549  READ1 (ails->nlist);
550  READ1 (ails->code_size);
551  ails->read_only = true;
552  ails->lists.resize (ails->nlist);
553  std::vector<size_t> sizes (ails->nlist);
554  read_ArrayInvertedLists_sizes (f, sizes);
555  size_t o0 = ftell (f), o = o0;
556  { // do the mmap
557  struct stat buf;
558  int ret = fstat (fileno(f), &buf);
559  FAISS_THROW_IF_NOT_FMT (ret == 0,
560  "fstat failed: %s", strerror(errno));
561  ails->totsize = buf.st_size;
562  ails->ptr = (uint8_t*)mmap (nullptr, ails->totsize,
563  PROT_READ, MAP_SHARED,
564  fileno (f), 0);
565  FAISS_THROW_IF_NOT_FMT (ails->ptr != MAP_FAILED,
566  "could not mmap: %s",
567  strerror(errno));
568  }
569  for (size_t i = 0; i < ails->nlist; i++) {
570  OnDiskInvertedLists::List & l = ails->lists[i];
571  l.size = l.capacity = sizes[i];
572  l.offset = o;
573  o += l.size * (sizeof(OnDiskInvertedLists::idx_t) +
574  ails->code_size);
575  }
576  // resume normal reading of file
577  fseek (f, o, SEEK_SET);
578  return ails;
579  } else if (h == fourcc ("ilod")) {
580  OnDiskInvertedLists *od = new OnDiskInvertedLists();
581  od->read_only = io_flags & IO_FLAG_READ_ONLY;
582  READ1 (od->nlist);
583  READ1 (od->code_size);
584  // this is a POD object
585  READVECTOR (od->lists);
586  {
587  std::vector<OnDiskInvertedLists::Slot> v;
588  READVECTOR(v);
589  od->slots.assign(v.begin(), v.end());
590  }
591  {
592  std::vector<char> x;
593  READVECTOR(x);
594  od->filename.assign(x.begin(), x.end());
595  }
596  READ1(od->totsize);
597  od->do_mmap();
598  return od;
599  } else {
600  FAISS_THROW_MSG ("read_InvertedLists: unsupported invlist type");
601  }
602 }
603 
604 static void read_InvertedLists (IndexIVF *ivf, FILE *f, int io_flags) {
605  InvertedLists *ils = read_InvertedLists (f, io_flags);
606  FAISS_THROW_IF_NOT (ils->nlist == ivf->nlist &&
607  ils->code_size == ivf->code_size);
608  ivf->invlists = ils;
609  ivf->own_invlists = true;
610 }
611 
612 
613 static void read_ProductQuantizer (ProductQuantizer *pq, FILE *f) {
614  READ1 (pq->d);
615  READ1 (pq->M);
616  READ1 (pq->nbits);
617  pq->set_derived_values ();
618  READVECTOR (pq->centroids);
619 }
620 
621 static void read_ScalarQuantizer (ScalarQuantizer *ivsc, FILE *f) {
622  READ1 (ivsc->qtype);
623  READ1 (ivsc->rangestat);
624  READ1 (ivsc->rangestat_arg);
625  READ1 (ivsc->d);
626  READ1 (ivsc->code_size);
627  READVECTOR (ivsc->trained);
628 }
629 
630 
631 static void read_HNSW (HNSW *hnsw, FILE *f) {
632  READVECTOR (hnsw->assign_probas);
633  READVECTOR (hnsw->cum_nneighbor_per_level);
634  READVECTOR (hnsw->levels);
635  READVECTOR (hnsw->offsets);
636  READVECTOR (hnsw->neighbors);
637 
638  READ1 (hnsw->entry_point);
639  READ1 (hnsw->max_level);
640  READ1 (hnsw->efConstruction);
641  READ1 (hnsw->efSearch);
642  READ1 (hnsw->upper_beam);
643 }
644 
645 ProductQuantizer * read_ProductQuantizer (const char*fname) {
646  FILE *f = fopen (fname, "r");
647  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for writing", fname);
648  ScopeFileCloser closer(f);
649  ProductQuantizer *pq = new ProductQuantizer();
650  ScopeDeleter1<ProductQuantizer> del (pq);
651  read_ProductQuantizer(pq, f);
652  del.release ();
653  return pq;
654 }
655 
656 static void read_ivf_header (
657  IndexIVF * ivf, FILE *f,
658  std::vector<std::vector<Index::idx_t> > *ids = nullptr)
659 {
660  read_index_header (ivf, f);
661  READ1 (ivf->nlist);
662  READ1 (ivf->nprobe);
663  ivf->quantizer = read_index (f);
664  ivf->own_fields = true;
665  if (ids) { // used in legacy "Iv" formats
666  ids->resize (ivf->nlist);
667  for (size_t i = 0; i < ivf->nlist; i++)
668  READVECTOR ((*ids)[i]);
669  }
670  READ1 (ivf->maintain_direct_map);
671  READVECTOR (ivf->direct_map);
672 }
673 
674 // used for legacy formats
675 static ArrayInvertedLists *set_array_invlist(
676  IndexIVF *ivf, std::vector<std::vector<Index::idx_t> > &ids)
677 {
678  ArrayInvertedLists *ail = new ArrayInvertedLists (
679  ivf->nlist, ivf->code_size);
680  std::swap (ail->ids, ids);
681  ivf->invlists = ail;
682  ivf->own_invlists = true;
683  return ail;
684 }
685 
686 static IndexIVFPQ *read_ivfpq (FILE *f, uint32_t h, int io_flags)
687 {
688  bool legacy = h == fourcc ("IvQR") || h == fourcc ("IvPQ");
689 
690  IndexIVFPQR *ivfpqr =
691  h == fourcc ("IvQR") || h == fourcc ("IwQR") ?
692  new IndexIVFPQR () : nullptr;
693  IndexIVFPQ * ivpq = ivfpqr ? ivfpqr : new IndexIVFPQ ();
694 
695  std::vector<std::vector<Index::idx_t> > ids;
696  read_ivf_header (ivpq, f, legacy ? &ids : nullptr);
697  READ1 (ivpq->by_residual);
698  READ1 (ivpq->code_size);
699  read_ProductQuantizer (&ivpq->pq, f);
700 
701  if (legacy) {
702  ArrayInvertedLists *ail = set_array_invlist (ivpq, ids);
703  for (size_t i = 0; i < ail->nlist; i++)
704  READVECTOR (ail->codes[i]);
705  } else {
706  read_InvertedLists (ivpq, f, io_flags);
707  }
708 
709  // precomputed table not stored. It is cheaper to recompute it
710  ivpq->use_precomputed_table = 0;
711  if (ivpq->by_residual)
712  ivpq->precompute_table ();
713  if (ivfpqr) {
714  read_ProductQuantizer (&ivfpqr->refine_pq, f);
715  READVECTOR (ivfpqr->refine_codes);
716  READ1 (ivfpqr->k_factor);
717  }
718  return ivpq;
719 }
720 
721 int read_old_fmt_hack = 0;
722 
723 Index *read_index (FILE * f, int io_flags) {
724  Index * idx = nullptr;
725  uint32_t h;
726  READ1 (h);
727  if (h == fourcc ("IxFI") || h == fourcc ("IxF2")) {
728  IndexFlat *idxf;
729  if (h == fourcc ("IxFI")) idxf = new IndexFlatIP ();
730  else idxf = new IndexFlatL2 ();
731  read_index_header (idxf, f);
732  READVECTOR (idxf->xb);
733  FAISS_THROW_IF_NOT (idxf->xb.size() == idxf->ntotal * idxf->d);
734  // leak!
735  idx = idxf;
736  } else if (h == fourcc("IxHE") || h == fourcc("IxHe")) {
737  IndexLSH * idxl = new IndexLSH ();
738  read_index_header (idxl, f);
739  READ1 (idxl->nbits);
740  READ1 (idxl->rotate_data);
741  READ1 (idxl->train_thresholds);
742  READVECTOR (idxl->thresholds);
743  READ1 (idxl->bytes_per_vec);
744  if (h == fourcc("IxHE")) {
745  FAISS_THROW_IF_NOT_FMT (idxl->nbits % 64 == 0,
746  "can only read old format IndexLSH with "
747  "nbits multiple of 64 (got %d)",
748  (int) idxl->nbits);
749  // leak
750  idxl->bytes_per_vec *= 8;
751  }
752  {
753  RandomRotationMatrix *rrot = dynamic_cast<RandomRotationMatrix *>
754  (read_VectorTransform (f));
755  FAISS_THROW_IF_NOT_MSG(rrot, "expected a random rotation");
756  idxl->rrot = *rrot;
757  delete rrot;
758  }
759  READVECTOR (idxl->codes);
760  FAISS_THROW_IF_NOT (idxl->rrot.d_in == idxl->d &&
761  idxl->rrot.d_out == idxl->nbits);
762  FAISS_THROW_IF_NOT (
763  idxl->codes.size() == idxl->ntotal * idxl->bytes_per_vec);
764  idx = idxl;
765  } else if (h == fourcc ("IxPQ") || h == fourcc ("IxPo") ||
766  h == fourcc ("IxPq")) {
767  // IxPQ and IxPo were merged into the same IndexPQ object
768  IndexPQ * idxp =new IndexPQ ();
769  read_index_header (idxp, f);
770  read_ProductQuantizer (&idxp->pq, f);
771  READVECTOR (idxp->codes);
772  if (h == fourcc ("IxPo") || h == fourcc ("IxPq")) {
773  READ1 (idxp->search_type);
774  READ1 (idxp->encode_signs);
775  READ1 (idxp->polysemous_ht);
776  }
777  // Old versoins of PQ all had metric_type set to INNER_PRODUCT
778  // when they were in fact using L2. Therefore, we force metric type
779  // to L2 when the old format is detected
780  if (h == fourcc ("IxPQ") || h == fourcc ("IxPo")) {
781  idxp->metric_type = METRIC_L2;
782  }
783  idx = idxp;
784  } else if (h == fourcc ("IvFl") || h == fourcc("IvFL")) { // legacy
785  IndexIVFFlat * ivfl = new IndexIVFFlat ();
786  std::vector<std::vector<Index::idx_t> > ids;
787  read_ivf_header (ivfl, f, &ids);
788  ivfl->code_size = ivfl->d * sizeof(float);
789  ArrayInvertedLists *ail = set_array_invlist (ivfl, ids);
790 
791  if (h == fourcc ("IvFL")) {
792  for (size_t i = 0; i < ivfl->nlist; i++) {
793  READVECTOR (ail->codes[i]);
794  }
795  } else { // old format
796  for (size_t i = 0; i < ivfl->nlist; i++) {
797  std::vector<float> vec;
798  READVECTOR (vec);
799  ail->codes[i].resize(vec.size() * sizeof(float));
800  memcpy(ail->codes[i].data(), vec.data(),
801  ail->codes[i].size());
802  }
803  }
804  idx = ivfl;
805  } else if (h == fourcc ("IwFl")) {
806  IndexIVFFlat * ivfl = new IndexIVFFlat ();
807  read_ivf_header (ivfl, f);
808  ivfl->code_size = ivfl->d * sizeof(float);
809  read_InvertedLists (ivfl, f, io_flags);
810  idx = ivfl;
811  } else if (h == fourcc ("IxSQ")) {
812  IndexScalarQuantizer * idxs = new IndexScalarQuantizer ();
813  read_index_header (idxs, f);
814  read_ScalarQuantizer (&idxs->sq, f);
815  READVECTOR (idxs->codes);
816  idxs->code_size = idxs->sq.code_size;
817  idx = idxs;
818  } else if(h == fourcc ("IvSQ")) { // legacy
819  IndexIVFScalarQuantizer * ivsc = new IndexIVFScalarQuantizer();
820  std::vector<std::vector<Index::idx_t> > ids;
821  read_ivf_header (ivsc, f, &ids);
822  read_ScalarQuantizer (&ivsc->sq, f);
823  READ1 (ivsc->code_size);
824  ArrayInvertedLists *ail = set_array_invlist (ivsc, ids);
825  for(int i = 0; i < ivsc->nlist; i++)
826  READVECTOR (ail->codes[i]);
827  idx = ivsc;
828  } else if(h == fourcc ("IwSQ")) {
829  IndexIVFScalarQuantizer * ivsc = new IndexIVFScalarQuantizer();
830  read_ivf_header (ivsc, f);
831  read_ScalarQuantizer (&ivsc->sq, f);
832  READ1 (ivsc->code_size);
833  read_InvertedLists (ivsc, f, io_flags);
834  idx = ivsc;
835  } else if(h == fourcc ("IvPQ") || h == fourcc ("IvQR") ||
836  h == fourcc ("IwPQ") || h == fourcc ("IwQR")) {
837 
838  idx = read_ivfpq (f, h, io_flags);
839 
840  } else if(h == fourcc ("IxPT")) {
841  IndexPreTransform * ixpt = new IndexPreTransform();
842  ixpt->own_fields = true;
843  read_index_header (ixpt, f);
844  int nt;
845  if (read_old_fmt_hack == 2) {
846  nt = 1;
847  } else {
848  READ1 (nt);
849  }
850  for (int i = 0; i < nt; i++) {
851  ixpt->chain.push_back (read_VectorTransform (f));
852  }
853  ixpt->index = read_index (f);
854  idx = ixpt;
855  } else if(h == fourcc ("Imiq")) {
856  MultiIndexQuantizer * imiq = new MultiIndexQuantizer ();
857  read_index_header (imiq, f);
858  read_ProductQuantizer (&imiq->pq, f);
859  idx = imiq;
860  } else if(h == fourcc ("IxRF")) {
861  IndexRefineFlat *idxrf = new IndexRefineFlat ();
862  read_index_header (idxrf, f);
863  idxrf->base_index = read_index(f);
864  idxrf->own_fields = true;
865  IndexFlat *rf = dynamic_cast<IndexFlat*> (read_index (f));
866  std::swap (*rf, idxrf->refine_index);
867  delete rf;
868  READ1 (idxrf->k_factor);
869  idx = idxrf;
870  } else if(h == fourcc ("IxMp") || h == fourcc ("IxM2")) {
871  bool is_map2 = h == fourcc ("IxM2");
872  IndexIDMap * idxmap = is_map2 ? new IndexIDMap2 () : new IndexIDMap ();
873  read_index_header (idxmap, f);
874  idxmap->index = read_index (f);
875  idxmap->own_fields = true;
876  READVECTOR (idxmap->id_map);
877  if (is_map2) {
878  static_cast<IndexIDMap2*>(idxmap)->construct_rev_map ();
879  }
880  idx = idxmap;
881  } else if (h == fourcc ("Ix2L")) {
882  Index2Layer * idxp = new Index2Layer ();
883  read_index_header (idxp, f);
884  idxp->q1.quantizer = read_index (f);
885  READ1 (idxp->q1.nlist);
886  READ1 (idxp->q1.quantizer_trains_alone);
887  read_ProductQuantizer (&idxp->pq, f);
888  READ1 (idxp->code_size_1);
889  READ1 (idxp->code_size_2);
890  READ1 (idxp->code_size);
891  READVECTOR (idxp->codes);
892  idx = idxp;
893  } else if(h == fourcc("IHNf") || h == fourcc("IHNp") ||
894  h == fourcc("IHNs") || h == fourcc("IHN2")) {
895  IndexHNSW *idxhnsw = nullptr;
896  if (h == fourcc("IHNf")) idxhnsw = new IndexHNSWFlat ();
897  if (h == fourcc("IHNp")) idxhnsw = new IndexHNSWPQ ();
898  if (h == fourcc("IHNs")) idxhnsw = new IndexHNSWSQ ();
899  if (h == fourcc("IHN2")) idxhnsw = new IndexHNSW2Level ();
900  read_index_header (idxhnsw, f);
901  read_HNSW (&idxhnsw->hnsw, f);
902  idxhnsw->storage = read_index (f);
903  idxhnsw->own_fields = true;
904  if (h == fourcc("IHNp")) {
905  dynamic_cast<IndexPQ*>(idxhnsw->storage)->pq.compute_sdc_table ();
906  }
907  idx = idxhnsw;
908  } else {
909  FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
910  idx = nullptr;
911  }
912  return idx;
913 }
914 
915 
916 
917 Index *read_index (const char *fname, int io_flags) {
918  FILE *f = fopen (fname, "r");
919  FAISS_THROW_IF_NOT_FMT (f, "cannot open %s for reading:", fname);
920  Index *idx = read_index (f, io_flags);
921  fclose (f);
922  return idx;
923 }
924 
925 VectorTransform *read_VectorTransform (const char *fname) {
926  FILE *f = fopen (fname, "r");
927  if (!f) {
928  fprintf (stderr, "cannot open %s for reading:", fname);
929  perror ("");
930  abort ();
931  }
932  VectorTransform *vt = read_VectorTransform (f);
933  fclose (f);
934  return vt;
935 }
936 
937 /*************************************************************
938  * cloning functions
939  **************************************************************/
940 
941 
942 
943 Index * clone_index (const Index *index)
944 {
945  Cloner cl;
946  return cl.clone_Index (index);
947 }
948 
949 // assumes there is a copy constructor ready. Always try from most
950 // specific to most general
951 #define TRYCLONE(classname, obj) \
952  if (const classname *clo = dynamic_cast<const classname *>(obj)) { \
953  return new classname(*clo); \
954  } else
955 
956 VectorTransform *Cloner::clone_VectorTransform (const VectorTransform *vt)
957 {
958  TRYCLONE (RemapDimensionsTransform, vt)
959  TRYCLONE (OPQMatrix, vt)
960  TRYCLONE (PCAMatrix, vt)
961  TRYCLONE (RandomRotationMatrix, vt)
962  TRYCLONE (LinearTransform, vt)
963  {
964  FAISS_THROW_MSG("clone not supported for this type of VectorTransform");
965  }
966  return nullptr;
967 }
968 
969 IndexIVF * Cloner::clone_IndexIVF (const IndexIVF *ivf)
970 {
971  TRYCLONE (IndexIVFPQR, ivf)
972  TRYCLONE (IndexIVFPQ, ivf)
973  TRYCLONE (IndexIVFFlat, ivf)
974  TRYCLONE (IndexIVFScalarQuantizer, ivf)
975  {
976  FAISS_THROW_MSG("clone not supported for this type of IndexIVF");
977  }
978  return nullptr;
979 }
980 
981 Index *Cloner::clone_Index (const Index *index)
982 {
983  TRYCLONE (IndexPQ, index)
984  TRYCLONE (IndexLSH, index)
985  TRYCLONE (IndexFlatL2, index)
986  TRYCLONE (IndexFlatIP, index)
987  TRYCLONE (IndexFlat, index)
988  TRYCLONE (IndexScalarQuantizer, index)
989  TRYCLONE (MultiIndexQuantizer, index)
990  if (const IndexIVF * ivf = dynamic_cast<const IndexIVF*>(index)) {
991  IndexIVF *res = clone_IndexIVF (ivf);
992  if (ivf->invlists == nullptr) {
993  res->invlists = nullptr;
994  } else if (auto *ails = dynamic_cast<const ArrayInvertedLists*>
995  (ivf->invlists)) {
996  res->invlists = new ArrayInvertedLists(*ails);
997  res->own_invlists = true;
998  } else {
999  FAISS_THROW_MSG( "clone not supported for this type of inverted lists");
1000  }
1001  res->own_fields = true;
1002  res->quantizer = clone_Index (ivf->quantizer);
1003  return res;
1004  } else if (const IndexPreTransform * ipt =
1005  dynamic_cast<const IndexPreTransform*> (index)) {
1006  IndexPreTransform *res = new IndexPreTransform ();
1007  res->d = ipt->d;
1008  res->index = clone_Index (ipt->index);
1009  for (int i = 0; i < ipt->chain.size(); i++)
1010  res->chain.push_back (clone_VectorTransform (ipt->chain[i]));
1011  res->own_fields = true;
1012  return res;
1013  } else if (const IndexIDMap *idmap =
1014  dynamic_cast<const IndexIDMap*> (index)) {
1015  IndexIDMap *res = new IndexIDMap (*idmap);
1016  res->own_fields = true;
1017  res->index = clone_Index (idmap->index);
1018  return res;
1019  } else {
1020  FAISS_THROW_MSG( "clone not supported for this type of Index");
1021  }
1022  return nullptr;
1023 }
1024 
1025 
1026 } // namespace faiss
int d
vector dimension
Definition: Index.h:64
long idx_t
all indices are this type
Definition: Index.h:62
idx_t ntotal
total nb of indexed vectors
Definition: Index.h:65
MetricType metric_type
type of metric this index uses for search
Definition: Index.h:72
bool is_trained
set if the Index does not require training, or if training is done already
Definition: Index.h:69