Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/IndexIVFPQ.h
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved.
11 // -*- c++ -*-
12 
13 #ifndef FAISS_INDEX_IVFPQ_H
14 #define FAISS_INDEX_IVFPQ_H
15 
16 
17 #include <vector>
18 
19 #include "IndexIVF.h"
20 #include "IndexPQ.h"
21 
22 
23 namespace faiss {
24 
25 
26 
27 /** Inverted file with Product Quantizer encoding. Each residual
28  * vector is encoded as a product quantizer code.
29  */
31  bool by_residual; ///< Encode residual or plain vector?
32  int use_precomputed_table; ///< if by_residual, build precompute tables
33  size_t code_size; ///< code size per vector in bytes
34  ProductQuantizer pq; ///< produces the codes
35 
36  bool do_polysemous_training; ///< reorder PQ centroids after training?
37  PolysemousTraining *polysemous_training; ///< if NULL, use default
38 
39  // search-time parameters
40  size_t scan_table_threshold; ///< use table computation or on-the-fly?
41  size_t max_codes; ///< max nb of codes to visit to do a query
42  int polysemous_ht; ///< Hamming thresh for polysemous filtering
43 
44  std::vector < std::vector<uint8_t> > codes; // binary codes, size nlist
45 
46  /// if use_precomputed_table
47  /// size nlist * pq.M * pq.ksub
48  std::vector <float> precomputed_table;
49 
50  IndexIVFPQ (
51  Index * quantizer, size_t d, size_t nlist,
52  size_t M, size_t nbits_per_idx);
53 
54  virtual void set_typename () override;
55 
56  virtual void add_with_ids (
57  idx_t n, const float *x,
58  const long *xids = nullptr) override;
59 
60  /// same as add_core, also:
61  /// - output 2nd level residuals if residuals_2 != NULL
62  /// - use precomputed list numbers if precomputed_idx != NULL
63  void add_core_o (idx_t n, const float *x,
64  const long *xids, float *residuals_2,
65  const long *precomputed_idx = nullptr);
66 
67  virtual void search (
68  idx_t n, const float *x, idx_t k,
69  float *distances, idx_t *labels) const override;
70 
71  virtual void reset () override;
72 
73  virtual long remove_ids (const IDSelector & sel) override;
74 
75  /// trains the product quantizer
76  virtual void train_residual(idx_t n, const float *x) override;
77 
78  /// same as train_residual, also output 2nd level residuals
79  void train_residual_o (idx_t n, const float *x, float *residuals_2);
80 
81 
82  /** Reconstruct a subset of the indexed vectors
83  *
84  * @param i0 first vector to reconstruct
85  * @param ni nb of vectors to reconstruct
86  * @param recons output array of reconstructed vectors, size ni * d
87  */
88  virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons)
89  const override;
90 
91  virtual void reconstruct (idx_t key, float * recons)
92  const override;
93 
94  /** Find exact duplicates in the dataset.
95  *
96  * the duplicates are returned in pre-allocated arrays (see the
97  * max sizes).
98  *
99  * @param lims limits between groups of duplicates
100  * (max size ntotal / 2 + 1)
101  * @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
102  * duplicates (max size ntotal)
103  * @return n number of groups found
104  */
105  size_t find_duplicates (idx_t *ids, size_t *lims) const;
106 
107  // map a vector to a binary code knowing the index
108  void encode (long key, const float * x, uint8_t * code) const;
109 
110  /** Encode multiple vectors
111  *
112  * @param n nb vectors to encode
113  * @param keys posting list ids for those vectors (size n)
114  * @param x vectors (size n * d)
115  * @param codes output codes (size n * code_size)
116  * @param compute_keys if false, assume keys are precomputed,
117  * otherwise compute them
118  */
119  void encode_multiple (size_t n, long *keys,
120  const float * x, uint8_t * codes,
121  bool compute_keys = false) const;
122 
123  /// inverse of encode_multiple
124  void decode_multiple (size_t n, const long *keys,
125  const uint8_t * xcodes, float * x) const;
126 
127  /** search a set of vectors, that are pre-quantized by the IVF
128  * quantizer. Fill in the corresponding heaps with the query
129  * results.
130  *
131  * @param nx nb of vectors to query
132  * @param qx query vectors, size nx * d
133  * @param keys coarse quantization indices, size nx * nprobe
134  * @param coarse_dis
135  * distances to coarse centroids, size nx * nprobe
136  * @param res heaps for all the results, gives the nprobe
137  * @param store_pairs store inv list index + inv list offset
138  * in the upper/lower 32 bits of the result,
139  * instead of ids (used for reranking).
140  */
141  virtual void search_knn_with_key (
142  size_t nx,
143  const float * qx,
144  const long * keys,
145  const float * coarse_dis,
146  float_maxheap_array_t * res,
147  bool store_pairs = false) const;
148 
149  /// build precomputed table
150  void precompute_table ();
151 
152  /// used to implement merging
153  virtual void merge_from_residuals (IndexIVF &other) override;
154 
155 
156  /** copy a subset of this index's entries to the other index
157  *
158  * if subset_type == 0: copies ids in [a1, a2)
159  * if subset_type == 1: copies ids if id % a1 == a2
160  */
161  void copy_subset_to (IndexIVFPQ & other, int subset_type,
162  long a1, long a2) const;
163 
164  IndexIVFPQ ();
165 
166 };
167 
168 
169 /// statistics are robust to internal threading, but not if
170 /// IndexIVFPQ::search is called by multiple threads
172  size_t nq; // nb of queries run
173  size_t nlist; // nb of inverted lists scanned
174  size_t ncode; // nb of codes visited
175  size_t nrefine; // nb of refines (IVFPQR)
176 
177  size_t n_hamming_pass;
178  // nb of passed Hamming distance tests (for polysemous)
179 
180  // timings measured with the CPU RTC
181  // on all threads
182  size_t assign_cycles;
183  size_t search_cycles;
184  size_t refine_cycles; // only for IVFPQR
185 
186  // single thread (double-counted with search_cycles)
187  size_t init_query_cycles;
188  size_t init_list_cycles;
189  size_t scan_cycles;
190  size_t heap_cycles;
191 
192  IndexIVFPQStats () {reset (); }
193  void reset ();
194 };
195 
196 // global var that collects them all
197 extern IndexIVFPQStats indexIVFPQ_stats;
198 
199 
200 
201 /** Index with an additional level of PQ refinement */
203  ProductQuantizer refine_pq; ///< 3rd level quantizer
204  std::vector <uint8_t> refine_codes; ///< corresponding codes
205 
206  /// factor between k requested in search and the k requested from the IVFPQ
207  float k_factor;
208 
209  IndexIVFPQR (
210  Index * quantizer, size_t d, size_t nlist,
211  size_t M, size_t nbits_per_idx,
212  size_t M_refine, size_t nbits_per_idx_refine);
213 
214  virtual void set_typename () override;
215 
216  virtual void reset() override;
217 
218  virtual long remove_ids (const IDSelector & sel) override;
219 
220  /// trains the two product quantizers
221  virtual void train_residual (idx_t n, const float *x) override;
222 
223  virtual void add_with_ids (idx_t n, const float *x, const long *xids)
224  override;
225 
226  /// same as add_with_ids, but optionally use the precomputed list ids
227  void add_core (idx_t n, const float *x, const long *xids,
228  const long *precomputed_idx = nullptr);
229 
230 
231  virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons)
232  const override;
233 
234  virtual void search (
235  idx_t n, const float *x, idx_t k,
236  float *distances, idx_t *labels) const override;
237 
238  virtual void merge_from_residuals (IndexIVF &other) override;
239 
240  IndexIVFPQR();
241 };
242 
243 
244 /** Index with 32-bit ids and flat tables. Must be constructed from an
245  * existing IndexIVFPQ. Cannot be copy-constructed/assigned. The
246  * actual data is stored in the compact_* tables, the ids and codes
247  * tables are not used. */
249 
250  explicit IndexIVFPQCompact (const IndexIVFPQ &other);
251 
252  /// how were the compact tables allocated?
254  Alloc_type_none, ///< alloc from outside
255  Alloc_type_new, ///< was allocated with new
256  Alloc_type_mmap ///< was mmapped
257  };
258 
259  Alloc_type_t alloc_type;
260 
261  uint32_t *limits; ///< size nlist + 1
262  uint32_t *compact_ids; ///< size ntotal
263  uint8_t *compact_codes; ///< size ntotal * code_size
264 
265  // file and buffer this was mmapped (will be unmapped when object
266  // is deleted)
267  char * mmap_buffer;
268  long mmap_length;
269 
270  virtual void search_knn_with_key (
271  size_t nx,
272  const float * qx,
273  const long * keys,
274  const float * coarse_dis,
275  float_maxheap_array_t * res,
276  bool store_pairs = false) const override;
277 
278  /// the three following functions will fail at runtime
279  virtual void add (idx_t, const float *) override;
280  virtual void reset () override;
281  virtual void train (idx_t, const float *) override;
282 
283  virtual ~IndexIVFPQCompact ();
284 
286 
287 };
288 
289 
290 
291 } // namespace faiss
292 
293 
294 
295 
296 
297 #endif
uint32_t * compact_ids
size ntotal
Definition: IndexIVFPQ.h:262
uint8_t * compact_codes
size ntotal * code_size
Definition: IndexIVFPQ.h:263
void precompute_table()
build precomputed table
Definition: IndexIVFPQ.cpp:400
void copy_subset_to(IndexIVFPQ &other, int subset_type, long a1, long a2) const
Definition: IndexIVFPQ.cpp:341
virtual void reconstruct(idx_t key, float *recons) const override
Definition: IndexIVFPQ.cpp:311
ProductQuantizer refine_pq
3rd level quantizer
Definition: IndexIVFPQ.h:203
PolysemousTraining * polysemous_training
if NULL, use default
Definition: IndexIVFPQ.h:37
virtual void add(idx_t, const float *) override
the three following functions will fail at runtime
virtual void search_knn_with_key(size_t nx, const float *qx, const long *keys, const float *coarse_dis, float_maxheap_array_t *res, bool store_pairs=false) const override
virtual void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
virtual void merge_from_residuals(IndexIVF &other) override
used to implement merging
Definition: IndexIVFPQ.cpp:331
void decode_multiple(size_t n, const long *keys, const uint8_t *xcodes, float *x) const
inverse of encode_multiple
Definition: IndexIVFPQ.cpp:177
void train_residual_o(idx_t n, const float *x, float *residuals_2)
same as train_residual, also output 2nd level residuals
Definition: IndexIVFPQ.cpp:84
bool do_polysemous_training
reorder PQ centroids after training?
Definition: IndexIVFPQ.h:36
size_t scan_table_threshold
use table computation or on-the-fly?
Definition: IndexIVFPQ.h:40
virtual void train_residual(idx_t n, const float *x) override
trains the two product quantizers
void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx=nullptr)
same as add_with_ids, but optionally use the precomputed list ids
uint32_t * limits
size nlist + 1
Definition: IndexIVFPQ.h:261
std::vector< float > precomputed_table
Definition: IndexIVFPQ.h:48
int polysemous_ht
Hamming thresh for polysemous filtering.
Definition: IndexIVFPQ.h:42
virtual void search_knn_with_key(size_t nx, const float *qx, const long *keys, const float *coarse_dis, float_maxheap_array_t *res, bool store_pairs=false) const
Definition: IndexIVFPQ.cpp:970
virtual void reset() override
removes all elements from the database.
virtual void add_with_ids(idx_t n, const float *x, const long *xids=nullptr) override
Definition: IndexIVFPQ.cpp:194
std::vector< std::vector< long > > ids
Inverted lists for indexes.
Definition: IndexIVF.h:56
int d
vector dimension
Definition: Index.h:66
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:50
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVFPQ.h:41
Alloc_type_t
how were the compact tables allocated?
Definition: IndexIVFPQ.h:253
std::vector< uint8_t > refine_codes
corresponding codes
Definition: IndexIVFPQ.h:204
virtual long remove_ids(const IDSelector &sel) override
virtual void train_residual(idx_t n, const float *x) override
trains the product quantizer
Definition: IndexIVFPQ.cpp:78
void encode_multiple(size_t n, long *keys, const float *x, uint8_t *codes, bool compute_keys=false) const
Definition: IndexIVFPQ.cpp:158
virtual void train(idx_t, const float *) override
Trains the quantizer and calls train_residual to train sub-quantizers.
long idx_t
all indices are this type
Definition: Index.h:64
virtual void reset() override
removes all elements from the database.
optimizes the order of indices in a ProductQuantizer
virtual void merge_from_residuals(IndexIVF &other) override
used to implement merging
bool by_residual
Encode residual or plain vector?
Definition: IndexIVFPQ.h:31
ProductQuantizer pq
produces the codes
Definition: IndexIVFPQ.h:34
size_t nlist
number of possible key values
Definition: IndexIVF.h:47
virtual void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
Definition: IndexIVFPQ.cpp:282
void add_core_o(idx_t n, const float *x, const long *xids, float *residuals_2, const long *precomputed_idx=nullptr)
Definition: IndexIVFPQ.cpp:200
size_t code_size
code size per vector in bytes
Definition: IndexIVFPQ.h:33
virtual long remove_ids(const IDSelector &sel) override
virtual void reset() override
removes all elements from the database.
virtual void add_with_ids(idx_t n, const float *x, const long *xids) override
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
size_t find_duplicates(idx_t *ids, size_t *lims) const
float k_factor
factor between k requested in search and the k requested from the IVFPQ
Definition: IndexIVFPQ.h:207
int use_precomputed_table
if by_residual, build precompute tables
Definition: IndexIVFPQ.h:32