Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/IndexIVFPQ.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the CC-by-NC license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 // -*- c++ -*-
11 
12 #ifndef FAISS_INDEX_IVFPQ_H
13 #define FAISS_INDEX_IVFPQ_H
14 
15 
16 #include <vector>
17 
18 #include "IndexIVF.h"
19 #include "IndexPQ.h"
20 
21 
22 namespace faiss {
23 
24 
25 
26 /** Inverted file with Product Quantizer encoding. Each residual
27  * vector is encoded as a product quantizer code.
28  */
30  bool by_residual; ///< Encode residual or plain vector?
31  int use_precomputed_table; ///< if by_residual, build precomputed tables
32  size_t code_size; ///< code size per vector in bytes
33  ProductQuantizer pq; ///< produces the codes
34 
35  bool do_polysemous_training; ///< reorder PQ centroids after training?
36  PolysemousTraining *polysemous_training; ///< if NULL, use default
37 
38  // search-time parameters
39  size_t scan_table_threshold; ///< use table computation or on-the-fly?
40  size_t max_codes; ///< max nb of codes to visit to do a query
41  int polysemous_ht; ///< Hamming thresh for polysemous filtering
42 
43  std::vector < std::vector<uint8_t> > codes; // binary codes, size nlist
44 
45  /// if use_precomputed_table
46  /// size nlist * pq.M * pq.ksub
47  std::vector <float> precomputed_table;
48 
49  IndexIVFPQ (
50  Index * quantizer, size_t d, size_t nlist,
51  size_t M, size_t nbits_per_idx);
52 
53  void add_with_ids(idx_t n, const float* x, const long* xids = nullptr)
54  override;
55 
56  /// same as add_core, also:
57  /// - output 2nd level residuals if residuals_2 != NULL
58  /// - use precomputed list numbers if precomputed_idx != NULL
59  void add_core_o (idx_t n, const float *x,
60  const long *xids, float *residuals_2,
61  const long *precomputed_idx = nullptr);
62 
63  void search(
64  idx_t n,
65  const float* x,
66  idx_t k,
67  float* distances,
68  idx_t* labels) const override;
69 
70  void reset() override;
71 
72  long remove_ids(const IDSelector& sel) override;
73 
74  /// trains the product quantizer
75  void train_residual(idx_t n, const float* x) override;
76 
77  /// same as train_residual, also output 2nd level residuals
78  void train_residual_o (idx_t n, const float *x, float *residuals_2);
79 
80 
81  /** Reconstruct a subset of the indexed vectors
82  *
83  * @param i0 first vector to reconstruct
84  * @param ni nb of vectors to reconstruct
85  * @param recons output array of reconstructed vectors, size ni * d
86  */
87  void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
88 
89  void reconstruct(idx_t key, float* recons) const override;
90 
91  /** Find exact duplicates in the dataset.
92  *
93  * the duplicates are returned in pre-allocated arrays (see the
94  * max sizes).
95  *
96  * @param lims limits between groups of duplicates
97  * (max size ntotal / 2 + 1)
98  * @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
99  * duplicates (max size ntotal)
100  * @return n number of groups found
101  */
102  size_t find_duplicates (idx_t *ids, size_t *lims) const;
103 
104  // map a vector to a binary code knowing the index
105  void encode (long key, const float * x, uint8_t * code) const;
106 
107  /** Encode multiple vectors
108  *
109  * @param n nb vectors to encode
110  * @param keys posting list ids for those vectors (size n)
111  * @param x vectors (size n * d)
112  * @param codes output codes (size n * code_size)
113  * @param compute_keys if false, assume keys are precomputed,
114  * otherwise compute them
115  */
116  void encode_multiple (size_t n, long *keys,
117  const float * x, uint8_t * codes,
118  bool compute_keys = false) const;
119 
120  /// inverse of encode_multiple
121  void decode_multiple (size_t n, const long *keys,
122  const uint8_t * xcodes, float * x) const;
123 
124  /** search a set of vectors, that are pre-quantized by the IVF
125  * quantizer. Fill in the corresponding heaps with the query
126  * results.
127  *
128  * @param nx nb of vectors to query
129  * @param qx query vectors, size nx * d
130  * @param keys coarse quantization indices, size nx * nprobe
131  * @param coarse_dis
132  * distances to coarse centroids, size nx * nprobe
133  * @param res heaps for all the results, gives the nprobe
134  * @param store_pairs store inv list index + inv list offset
135  * in upper/lower 32 bit of result,
136  * instead of ids (used for reranking).
137  */
138  virtual void search_knn_with_key (
139  size_t nx,
140  const float * qx,
141  const long * keys,
142  const float * coarse_dis,
144  bool store_pairs = false) const;
145 
146  /// build precomputed table
147  void precompute_table ();
148 
149  /// used to implement merging
150  void merge_from_residuals(IndexIVF& other) override;
151 
152  /** copy a subset of the entries of this index to the other index
153  *
154  * if subset_type == 0: copies ids in [a1, a2)
155  * if subset_type == 1: copies ids if id % a1 == a2
156  */
157  void copy_subset_to (IndexIVFPQ & other, int subset_type,
158  long a1, long a2) const;
159 
160  IndexIVFPQ ();
161 
162 };
163 
164 
165 /// statistics are robust to internal threading, but not if
166 /// IndexIVFPQ::search is called by multiple threads
168  size_t nq; // nb of queries run
169  size_t nlist; // nb of inverted lists scanned
170  size_t ncode; // nb of codes visited
171  size_t nrefine; // nb of refines (IVFPQR)
172 
173  size_t n_hamming_pass;
174  // nb of passed Hamming distance tests (for polysemous)
175 
176  // timings measured with the CPU RTC
177  // on all threads
178  size_t assign_cycles;
179  size_t search_cycles;
180  size_t refine_cycles; // only for IVFPQR
181 
182  // single thread (double-counted with search_cycles)
183  size_t init_query_cycles;
184  size_t init_list_cycles;
185  size_t scan_cycles;
186  size_t heap_cycles;
187 
188  IndexIVFPQStats () {reset (); }
189  void reset ();
190 };
191 
192 // global var that collects them all
193 extern IndexIVFPQStats indexIVFPQ_stats;
194 
195 
196 
197 /** Index with an additional level of PQ refinement */
199  ProductQuantizer refine_pq; ///< 3rd level quantizer
200  std::vector <uint8_t> refine_codes; ///< corresponding codes
201 
202  /// factor between k requested in search and the k requested from the IVFPQ
203  float k_factor;
204 
205  IndexIVFPQR (
206  Index * quantizer, size_t d, size_t nlist,
207  size_t M, size_t nbits_per_idx,
208  size_t M_refine, size_t nbits_per_idx_refine);
209 
210  void reset() override;
211 
212  long remove_ids(const IDSelector& sel) override;
213 
214  /// trains the two product quantizers
215  void train_residual(idx_t n, const float* x) override;
216 
217  void add_with_ids(idx_t n, const float* x, const long* xids) override;
218 
219  /// same as add_with_ids, but optionally use the precomputed list ids
220  void add_core (idx_t n, const float *x, const long *xids,
221  const long *precomputed_idx = nullptr);
222 
223  void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
224 
225  void search(
226  idx_t n,
227  const float* x,
228  idx_t k,
229  float* distances,
230  idx_t* labels) const override;
231 
232  void merge_from_residuals(IndexIVF& other) override;
233 
234  IndexIVFPQR();
235 };
236 
237 
238 /** Index with 32-bit ids and flat tables. Must be constructed from an
239  * existing IndexIVFPQ. Cannot be copy-constructed/assigned. The
240  * actual data is stored in the compact_* tables, the ids and codes
241  * tables are not used. */
243 
244  explicit IndexIVFPQCompact (const IndexIVFPQ &other);
245 
246  /// how were the compact tables allocated?
248  Alloc_type_none, ///< alloc from outside
249  Alloc_type_new, ///< was allocated with new
250  Alloc_type_mmap ///< was mmapped
251  };
252 
253  Alloc_type_t alloc_type;
254 
255  uint32_t *limits; ///< size nlist + 1
256  uint32_t *compact_ids; ///< size ntotal
257  uint8_t *compact_codes; ///< size ntotal * code_size
258 
259  // mmapped buffer and its length (will be unmapped when object
260  // is deleted)
261  char * mmap_buffer;
262  long mmap_length;
263 
264  void search_knn_with_key(
265  size_t nx,
266  const float* qx,
267  const long* keys,
268  const float* coarse_dis,
270  bool store_pairs = false) const override;
271 
272  /// the three following functions will fail at runtime
273  void add(idx_t, const float*) override;
274  void reset() override;
275  void train(idx_t, const float*) override;
276 
277  ~IndexIVFPQCompact() override;
278 
280 
281 };
282 
283 
284 
285 } // namespace faiss
286 
287 
288 
289 
290 
291 #endif
uint32_t * compact_ids
size ntotal
Definition: IndexIVFPQ.h:256
uint8_t * compact_codes
size ntotal * code_size
Definition: IndexIVFPQ.h:257
void precompute_table()
build precomputed table
Definition: IndexIVFPQ.cpp:391
void copy_subset_to(IndexIVFPQ &other, int subset_type, long a1, long a2) const
Definition: IndexIVFPQ.cpp:332
void reconstruct(idx_t key, float *recons) const override
Definition: IndexIVFPQ.cpp:302
ProductQuantizer refine_pq
3rd level quantizer
Definition: IndexIVFPQ.h:199
PolysemousTraining * polysemous_training
if NULL, use default
Definition: IndexIVFPQ.h:36
void add(idx_t, const float *) override
the three following functions will fail at runtime
void search_knn_with_key(size_t nx, const float *qx, const long *keys, const float *coarse_dis, float_maxheap_array_t *res, bool store_pairs=false) const override
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
void merge_from_residuals(IndexIVF &other) override
used to implement merging
Definition: IndexIVFPQ.cpp:322
void decode_multiple(size_t n, const long *keys, const uint8_t *xcodes, float *x) const
inverse of encode_multiple
Definition: IndexIVFPQ.cpp:168
void train_residual_o(idx_t n, const float *x, float *residuals_2)
same as train_residual, also output 2nd level residuals
Definition: IndexIVFPQ.cpp:72
bool do_polysemous_training
reorder PQ centroids after training?
Definition: IndexIVFPQ.h:35
size_t scan_table_threshold
use table computation or on-the-fly?
Definition: IndexIVFPQ.h:39
void train_residual(idx_t n, const float *x) override
trains the two product quantizers
void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx=nullptr)
same as add_with_ids, but optionally use the precomputed list ids
uint32_t * limits
size nlist + 1
Definition: IndexIVFPQ.h:255
std::vector< float > precomputed_table
Definition: IndexIVFPQ.h:47
int polysemous_ht
Hamming thresh for polysemous filtering.
Definition: IndexIVFPQ.h:41
virtual void search_knn_with_key(size_t nx, const float *qx, const long *keys, const float *coarse_dis, float_maxheap_array_t *res, bool store_pairs=false) const
Definition: IndexIVFPQ.cpp:963
void reset() override
removes all elements from the database.
void add_with_ids(idx_t n, const float *x, const long *xids=nullptr) override
Definition: IndexIVFPQ.cpp:185
std::vector< std::vector< long > > ids
Inverted lists for indexes.
Definition: IndexIVF.h:55
int d
vector dimension
Definition: Index.h:64
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:49
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVFPQ.h:40
Alloc_type_t
how were the compact tables allocated?
Definition: IndexIVFPQ.h:247
std::vector< uint8_t > refine_codes
corresponding codes
Definition: IndexIVFPQ.h:200
long remove_ids(const IDSelector &sel) override
void train_residual(idx_t n, const float *x) override
trains the product quantizer
Definition: IndexIVFPQ.cpp:66
void encode_multiple(size_t n, long *keys, const float *x, uint8_t *codes, bool compute_keys=false) const
Definition: IndexIVFPQ.cpp:149
void train(idx_t, const float *) override
Trains the quantizer and calls train_residual to train sub-quantizers.
long idx_t
all indices are this type
Definition: Index.h:62
void reset() override
removes all elements from the database.
optimizes the order of indices in a ProductQuantizer
void merge_from_residuals(IndexIVF &other) override
used to implement merging
bool by_residual
Encode residual or plain vector?
Definition: IndexIVFPQ.h:30
ProductQuantizer pq
produces the codes
Definition: IndexIVFPQ.h:33
size_t nlist
number of possible key values
Definition: IndexIVF.h:46
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
Definition: IndexIVFPQ.cpp:273
void add_core_o(idx_t n, const float *x, const long *xids, float *residuals_2, const long *precomputed_idx=nullptr)
Definition: IndexIVFPQ.cpp:191
size_t code_size
code size per vector in bytes
Definition: IndexIVFPQ.h:32
long remove_ids(const IDSelector &sel) override
void reset() override
removes all elements from the database.
void add_with_ids(idx_t n, const float *x, const long *xids) override
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
size_t find_duplicates(idx_t *ids, size_t *lims) const
float k_factor
factor between k requested in search and the k requested from the IVFPQ
Definition: IndexIVFPQ.h:203
int use_precomputed_table
if by_residual, build precomputed tables
Definition: IndexIVFPQ.h:31