Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/IndexIVFPQ.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 // -*- c++ -*-
11 
12 #ifndef FAISS_INDEX_IVFPQ_H
13 #define FAISS_INDEX_IVFPQ_H
14 
15 
16 #include <vector>
17 
18 #include "IndexIVF.h"
19 #include "IndexPQ.h"
20 
21 
22 namespace faiss {
23 
24 
25 
26 /** Inverted file with Product Quantizer encoding. Each residual
27  * vector is encoded as a product quantizer code.
28  */
30  bool by_residual; ///< Encode residual or plain vector?
31  int use_precomputed_table; ///< if by_residual, build precompute tables
32  ProductQuantizer pq; ///< produces the codes
33 
34  bool do_polysemous_training; ///< reorder PQ centroids after training?
35  PolysemousTraining *polysemous_training; ///< if NULL, use default
36 
37  // search-time parameters
38  size_t scan_table_threshold; ///< use table computation or on-the-fly?
39  int polysemous_ht; ///< Hamming thresh for polysemous filtering
40 
41 
42  /// if use_precomputed_table
43  /// size nlist * pq.M * pq.ksub
44  std::vector <float> precomputed_table;
45 
46  IndexIVFPQ (
47  Index * quantizer, size_t d, size_t nlist,
48  size_t M, size_t nbits_per_idx);
49 
50  void add_with_ids(idx_t n, const float* x, const long* xids = nullptr)
51  override;
52 
53  /// same as add_core, also:
54  /// - output 2nd level residuals if residuals_2 != NULL
55  /// - use precomputed list numbers if precomputed_idx != NULL
56  void add_core_o (idx_t n, const float *x,
57  const long *xids, float *residuals_2,
58  const long *precomputed_idx = nullptr);
59 
60  /// trains the product quantizer
61  void train_residual(idx_t n, const float* x) override;
62 
63  /// same as train_residual, also output 2nd level residuals
64  void train_residual_o (idx_t n, const float *x, float *residuals_2);
65 
66  void reconstruct_from_offset (long list_no, long offset,
67  float* recons) const override;
68 
69  /** Find exact duplicates in the dataset.
70  *
71  * the duplicates are returned in pre-allocated arrays (see the
72  * max sizes).
73  *
74  * @param lims limits between groups of duplicates
75  * (max size ntotal / 2 + 1)
76  * @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
77  * duplicates (max size ntotal)
78  * @return n number of groups found
79  */
80  size_t find_duplicates (idx_t *ids, size_t *lims) const;
81 
82  // map a vector to a binary code knowing the index
83  void encode (long key, const float * x, uint8_t * code) const;
84 
85  /** Encode multiple vectors
86  *
87  * @param n nb vectors to encode
88  * @param keys posting list ids for those vectors (size n)
89  * @param x vectors (size n * d)
90  * @param codes output codes (size n * code_size)
91  * @param compute_keys if false, assume keys are precomputed,
92  * otherwise compute them
93  */
94  void encode_multiple (size_t n, long *keys,
95  const float * x, uint8_t * codes,
96  bool compute_keys = false) const;
97 
98  /// inverse of encode_multiple
99  void decode_multiple (size_t n, const long *keys,
100  const uint8_t * xcodes, float * x) const;
101 
102  void search_preassigned (idx_t n, const float *x, idx_t k,
103  const idx_t *assign,
104  const float *centroid_dis,
105  float *distances, idx_t *labels,
106  bool store_pairs) const override;
107 
108  /// build precomputed table
109  void precompute_table ();
110 
111  IndexIVFPQ ();
112 
113 };
114 
115 
116 /// statistics are robust to internal threading, but not if
117 /// IndexIVFPQ::search_preassigned is called by multiple threads
119  size_t nq; // nb of queries run
120  size_t nlist; // nb of inverted lists scanned
121  size_t ncode; // nb of codes visited
122  size_t nrefine; // nb of refines (IVFPQR)
123 
124  size_t n_hamming_pass;
125  // nb of passed Hamming distance tests (for polysemous)
126 
127  // timings measured with the CPU RTC
128  // on all threads
129  size_t assign_cycles;
130  size_t search_cycles;
131  size_t refine_cycles; // only for IVFPQR
132 
133  // single thread (double-counted with search_cycles)
134  size_t init_query_cycles;
135  size_t init_list_cycles;
136  size_t scan_cycles;
137  size_t heap_cycles;
138 
139  IndexIVFPQStats () {reset (); }
140  void reset ();
141 };
142 
143 // global var that collects them all
144 extern IndexIVFPQStats indexIVFPQ_stats;
145 
146 
147 
148 /** Index with an additional level of PQ refinement */
150  ProductQuantizer refine_pq; ///< 3rd level quantizer
151  std::vector <uint8_t> refine_codes; ///< corresponding codes
152 
153  /// factor between k requested in search and the k requested from the IVFPQ
154  float k_factor;
155 
156  IndexIVFPQR (
157  Index * quantizer, size_t d, size_t nlist,
158  size_t M, size_t nbits_per_idx,
159  size_t M_refine, size_t nbits_per_idx_refine);
160 
161  void reset() override;
162 
163  long remove_ids(const IDSelector& sel) override;
164 
165  /// trains the two product quantizers
166  void train_residual(idx_t n, const float* x) override;
167 
168  void add_with_ids(idx_t n, const float* x, const long* xids) override;
169 
170  /// same as add_with_ids, but optionally use the precomputed list ids
171  void add_core (idx_t n, const float *x, const long *xids,
172  const long *precomputed_idx = nullptr);
173 
174  void reconstruct_from_offset (long list_no, long offset,
175  float* recons) const override;
176 
177  void merge_from (IndexIVF &other, idx_t add_id) override;
178 
179 
180  void search_preassigned (idx_t n, const float *x, idx_t k,
181  const idx_t *assign,
182  const float *centroid_dis,
183  float *distances, idx_t *labels,
184  bool store_pairs) const override;
185 
186  IndexIVFPQR();
187 };
188 
189 
190 /** Index with 32-bit ids and flat tables. Must be constructed from an
191  * existing IndexIVFPQ. Cannot be copy-constructed/assigned. The
192  * actual data is stored in the compact_* tables, the ids and codes
193  * tables are not used. */
195 
196  explicit IndexIVFPQCompact (const IndexIVFPQ &other);
197 
198  /// how were the compact tables allocated?
200  Alloc_type_none, ///< alloc from outside
201  Alloc_type_new, ///< was allocated with new
202  Alloc_type_mmap ///< was mmapped
203  };
204 
205  Alloc_type_t alloc_type;
206 
207  uint32_t *limits; ///< size nlist + 1
208  uint32_t *compact_ids; ///< size ntotal
209  uint8_t *compact_codes; ///< size ntotal * code_size
210 
211  // file and buffer this was mmapped (will be unmapped when object
212  // is deleted)
213  char * mmap_buffer;
214  long mmap_length;
215 
216  void search_preassigned (idx_t n, const float *x, idx_t k,
217  const idx_t *assign,
218  const float *centroid_dis,
219  float *distances, idx_t *labels,
220  bool store_pairs) const override;
221 
222  /// the three following functions will fail at runtime
223  void add(idx_t, const float*) override;
224  void reset() override;
225  void train(idx_t, const float*) override;
226 
227  ~IndexIVFPQCompact() override;
228 
230 
231 };
232 
233 
234 /** Same as an IndexIVFPQ without the inverted lists: codes are stored sequentially
235  *
236  * The class is mainly intended to store encoded vectors that can be
237  * accessed randomly, the search function is not implemented.
238  */
240  /// first level quantizer
242 
243  /// second level quantizer is always a PQ
245 
246  /// Codes. Size ntotal * code_size.
247  std::vector<uint8_t> codes;
248 
249  /// size of the code for the first level (ceil(log8(q1.nlist)))
250  size_t code_size_1;
251 
252  /// size of the code for the second level
253  size_t code_size_2;
254 
255  /// code_size_1 + code_size_2
256  size_t code_size;
257 
258  Index2Layer (Index * quantizer, size_t nlist,
259  int M, MetricType metric = METRIC_L2);
260 
261  Index2Layer ();
262  ~Index2Layer ();
263 
264  void train(idx_t n, const float* x) override;
265 
266  void add(idx_t n, const float* x) override;
267 
268  /// not implemented
269  void search(
270  idx_t n,
271  const float* x,
272  idx_t k,
273  float* distances,
274  idx_t* labels) const override;
275 
276  void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
277 
278  void reconstruct(idx_t key, float* recons) const override;
279 
280  void reset() override;
281 
282  /// transfer the flat codes to an IVFPQ index
283  void transfer_to_IVFPQ(IndexIVFPQ & other) const;
284 
285 };
286 
287 
288 
289 } // namespace faiss
290 
291 
292 
293 
294 
295 #endif
uint32_t * compact_ids
size ntotal
Definition: IndexIVFPQ.h:208
uint8_t * compact_codes
size ntotal * code_size
Definition: IndexIVFPQ.h:209
void precompute_table()
build precomputed table
Definition: IndexIVFPQ.cpp:344
void merge_from(IndexIVF &other, idx_t add_id) override
void transfer_to_IVFPQ(IndexIVFPQ &other) const
transfer the flat codes to an IVFPQ index
size_t code_size_2
size of the code for the second level
Definition: IndexIVFPQ.h:253
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs) const override
ProductQuantizer refine_pq
3rd level quantizer
Definition: IndexIVFPQ.h:150
PolysemousTraining * polysemous_training
if NULL, use default
Definition: IndexIVFPQ.h:35
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
size_t code_size
code_size_1 + code_size_2
Definition: IndexIVFPQ.h:256
void add(idx_t, const float *) override
the three following functions will fail at runtime
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs) const override
void reset() override
removes all elements from the database.
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:34
void decode_multiple(size_t n, const long *keys, const uint8_t *xcodes, float *x) const
inverse of encode_multiple
Definition: IndexIVFPQ.cpp:168
void train_residual_o(idx_t n, const float *x, float *residuals_2)
same as train_residual, also output 2nd level residuals
Definition: IndexIVFPQ.cpp:70
bool do_polysemous_training
reorder PQ centroids after training?
Definition: IndexIVFPQ.h:34
size_t scan_table_threshold
use table computation or on-the-fly?
Definition: IndexIVFPQ.h:38
void train_residual(idx_t n, const float *x) override
trains the two product quantizers
void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx=nullptr)
same as add_with_ids, but optionally use the precomputed list ids
uint32_t * limits
size nlist + 1
Definition: IndexIVFPQ.h:207
std::vector< float > precomputed_table
Definition: IndexIVFPQ.h:44
Level1Quantizer q1
first level quantizer
Definition: IndexIVFPQ.h:241
int polysemous_ht
Hamming thresh for polysemous filtering.
Definition: IndexIVFPQ.h:39
void reset() override
removes all elements from the database.
std::vector< uint8_t > codes
Codes. Size ntotal * code_size.
Definition: IndexIVFPQ.h:247
void add_with_ids(idx_t n, const float *x, const long *xids=nullptr) override
Definition: IndexIVFPQ.cpp:185
std::vector< std::vector< long > > ids
Inverted lists for indexes.
Definition: IndexIVF.h:81
int d
vector dimension
Definition: Index.h:64
Alloc_type_t
how were the compact tables allocated?
Definition: IndexIVFPQ.h:199
std::vector< uint8_t > refine_codes
corresponding codes
Definition: IndexIVFPQ.h:151
void train_residual(idx_t n, const float *x) override
trains the product quantizer
Definition: IndexIVFPQ.cpp:64
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
Definition: IndexIVFPQ.cpp:291
void encode_multiple(size_t n, long *keys, const float *x, uint8_t *codes, bool compute_keys=false) const
Definition: IndexIVFPQ.cpp:149
void train(idx_t, const float *) override
Trains the quantizer and calls train_residual to train sub-quantizers.
long idx_t
all indices are this type
Definition: Index.h:62
void train(idx_t n, const float *x) override
optimizes the order of indices in a ProductQuantizer
void add(idx_t n, const float *x) override
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
not implemented
bool by_residual
Encode residual or plain vector?
Definition: IndexIVFPQ.h:30
ProductQuantizer pq
produces the codes
Definition: IndexIVFPQ.h:32
size_t code_size_1
size of the code for the first level (ceil(log8(q1.nlist)))
Definition: IndexIVFPQ.h:250
void add_core_o(idx_t n, const float *x, const long *xids, float *residuals_2, const long *precomputed_idx=nullptr)
Definition: IndexIVFPQ.cpp:191
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs) const override
Definition: IndexIVFPQ.cpp:919
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:33
void reset() override
removes all elements from the database.
ProductQuantizer pq
second level quantizer is always a PQ
Definition: IndexIVFPQ.h:244
void add_with_ids(idx_t n, const float *x, const long *xids) override
void reconstruct(idx_t key, float *recons) const override
size_t nlist
number of possible key values
Definition: IndexIVF.h:34
size_t find_duplicates(idx_t *ids, size_t *lims) const
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:43
float k_factor
factor between k requested in search and the k requested from the IVFPQ
Definition: IndexIVFPQ.h:154
int use_precomputed_table
if by_residual, build precompute tables
Definition: IndexIVFPQ.h:31