Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/IndexIVFPQ.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 // -*- c++ -*-
11 
12 #ifndef FAISS_INDEX_IVFPQ_H
13 #define FAISS_INDEX_IVFPQ_H
14 
15 
16 #include <vector>
17 
18 #include "IndexIVF.h"
19 #include "IndexPQ.h"
20 
21 
22 namespace faiss {
23 
24 
25 
26 /** Inverted file with Product Quantizer encoding. Each residual
27  * vector is encoded as a product quantizer code.
28  */
30  bool by_residual; ///< Encode residual or plain vector?
31  int use_precomputed_table; ///< if by_residual, build precompute tables
32  ProductQuantizer pq; ///< produces the codes
33 
34  bool do_polysemous_training; ///< reorder PQ centroids after training?
35  PolysemousTraining *polysemous_training; ///< if NULL, use default
36 
37  // search-time parameters
38  size_t scan_table_threshold; ///< use table computation or on-the-fly?
39  size_t max_codes; ///< max nb of codes to visit to do a query
40  int polysemous_ht; ///< Hamming thresh for polysemous filtering
41 
42 
43  /// if use_precomputed_table
44  /// size nlist * pq.M * pq.ksub
45  std::vector <float> precomputed_table;
46 
47  IndexIVFPQ (
48  Index * quantizer, size_t d, size_t nlist,
49  size_t M, size_t nbits_per_idx);
50 
51  void add_with_ids(idx_t n, const float* x, const long* xids = nullptr)
52  override;
53 
54  /// same as add_core, also:
55  /// - output 2nd level residuals if residuals_2 != NULL
56  /// - use precomputed list numbers if precomputed_idx != NULL
57  void add_core_o (idx_t n, const float *x,
58  const long *xids, float *residuals_2,
59  const long *precomputed_idx = nullptr);
60 
61  /// trains the product quantizer
62  void train_residual(idx_t n, const float* x) override;
63 
64  /// same as train_residual, also output 2nd level residuals
65  void train_residual_o (idx_t n, const float *x, float *residuals_2);
66 
67 
68  /** Reconstruct a subset of the indexed vectors
69  *
70  * @param i0 first vector to reconstruct
71  * @param ni nb of vectors to reconstruct
72  * @param recons output array of reconstructed vectors, size ni * d
73  */
74  void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
75 
76  void reconstruct(idx_t key, float* recons) const override;
77 
78  /** Find exact duplicates in the dataset.
79  *
80  * the duplicates are returned in pre-allocated arrays (see the
81  * max sizes).
82  *
83  * @param lims limits between groups of duplicates
84  * (max size ntotal / 2 + 1)
85  * @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
86  * duplicates (max size ntotal)
87  * @return n number of groups found
88  */
89  size_t find_duplicates (idx_t *ids, size_t *lims) const;
90 
91  // map a vector to a binary code knowing the index
92  void encode (long key, const float * x, uint8_t * code) const;
93 
94  /** Encode multiple vectors
95  *
96  * @param n nb vectors to encode
97  * @param keys posting list ids for those vectors (size n)
98  * @param x vectors (size n * d)
99  * @param codes output codes (size n * code_size)
100  * @param compute_keys if false, assume keys are precomputed,
101  * otherwise compute them
102  */
103  void encode_multiple (size_t n, long *keys,
104  const float * x, uint8_t * codes,
105  bool compute_keys = false) const;
106 
107  /// inverse of encode_multiple
108  void decode_multiple (size_t n, const long *keys,
109  const uint8_t * xcodes, float * x) const;
110 
111  void search_preassigned (idx_t n, const float *x, idx_t k,
112  const idx_t *assign,
113  const float *centroid_dis,
114  float *distances, idx_t *labels,
115  bool store_pairs) const override;
116 
117 
118  /// build precomputed table
119  void precompute_table ();
120 
121  IndexIVFPQ ();
122 
123 };
124 
125 
126 /// statistics are robust to internal threading, but not if
127 /// IndexIVFPQ::search_preassigned is called by multiple threads
129  size_t nq; // nb of queries run
130  size_t nlist; // nb of inverted lists scanned
131  size_t ncode; // nb of codes visited
132  size_t nrefine; // nb of refines (IVFPQR)
133 
134  size_t n_hamming_pass;
135  // nb of passed Hamming distance tests (for polysemous)
136 
137  // timings measured with the CPU RTC
138  // on all threads
139  size_t assign_cycles;
140  size_t search_cycles;
141  size_t refine_cycles; // only for IVFPQR
142 
143  // single thread (double-counted with search_cycles)
144  size_t init_query_cycles;
145  size_t init_list_cycles;
146  size_t scan_cycles;
147  size_t heap_cycles;
148 
149  IndexIVFPQStats () {reset (); }
150  void reset ();
151 };
152 
153 // global var that collects them all
154 extern IndexIVFPQStats indexIVFPQ_stats;
155 
156 
157 
158 /** Index with an additional level of PQ refinement */
160  ProductQuantizer refine_pq; ///< 3rd level quantizer
161  std::vector <uint8_t> refine_codes; ///< corresponding codes
162 
163  /// factor between k requested in search and the k requested from the IVFPQ
164  float k_factor;
165 
166  IndexIVFPQR (
167  Index * quantizer, size_t d, size_t nlist,
168  size_t M, size_t nbits_per_idx,
169  size_t M_refine, size_t nbits_per_idx_refine);
170 
171  void reset() override;
172 
173  long remove_ids(const IDSelector& sel) override;
174 
175  /// trains the two product quantizers
176  void train_residual(idx_t n, const float* x) override;
177 
178  void add_with_ids(idx_t n, const float* x, const long* xids) override;
179 
180  /// same as add_with_ids, but optionally use the precomputed list ids
181  void add_core (idx_t n, const float *x, const long *xids,
182  const long *precomputed_idx = nullptr);
183 
184  void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
185 
186  void merge_from (IndexIVF &other, idx_t add_id) override;
187 
188 
189  void search(
190  idx_t n,
191  const float* x,
192  idx_t k,
193  float* distances,
194  idx_t* labels) const override;
195 
196  IndexIVFPQR();
197 };
198 
199 
200 /** Index with 32-bit ids and flat tables. Must be constructed from an
201  * existing IndexIVFPQ. Cannot be copy-constructed/assigned. The
202  * actual data is stored in the compact_* tables, the ids and codes
203  * tables are not used. */
205 
206  explicit IndexIVFPQCompact (const IndexIVFPQ &other);
207 
208  /// how were the compact tables allocated?
210  Alloc_type_none, ///< alloc from outside
211  Alloc_type_new, ///< was allocated with new
212  Alloc_type_mmap ///< was mmapped
213  };
214 
215  Alloc_type_t alloc_type;
216 
217  uint32_t *limits; ///< size nlist + 1
218  uint32_t *compact_ids; ///< size ntotal
219  uint8_t *compact_codes; ///< size ntotal * code_size
220 
221  // mmapped file buffer and its length (will be unmapped when the object
222  // is deleted)
223  char * mmap_buffer;
224  long mmap_length;
225 
226  void search_preassigned (idx_t n, const float *x, idx_t k,
227  const idx_t *assign,
228  const float *centroid_dis,
229  float *distances, idx_t *labels,
230  bool store_pairs) const override;
231 
232  /// the three following functions will fail at runtime
233  void add(idx_t, const float*) override;
234  void reset() override;
235  void train(idx_t, const float*) override;
236 
237  ~IndexIVFPQCompact() override;
238 
240 
241 };
242 
243 
244 
245 } // namespace faiss
246 
247 
248 
249 
250 
251 #endif
uint32_t * compact_ids
size ntotal
Definition: IndexIVFPQ.h:218
uint8_t * compact_codes
size ntotal * code_size
Definition: IndexIVFPQ.h:219
void precompute_table()
build precomputed table
Definition: IndexIVFPQ.cpp:356
void merge_from(IndexIVF &other, idx_t add_id) override
void reconstruct(idx_t key, float *recons) const override
Definition: IndexIVFPQ.cpp:303
ProductQuantizer refine_pq
3rd level quantizer
Definition: IndexIVFPQ.h:160
PolysemousTraining * polysemous_training
if NULL, use default
Definition: IndexIVFPQ.h:35
void add(idx_t, const float *) override
the three following functions will fail at runtime
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs) const override
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:23
void decode_multiple(size_t n, const long *keys, const uint8_t *xcodes, float *x) const
inverse of encode_multiple
Definition: IndexIVFPQ.cpp:169
void train_residual_o(idx_t n, const float *x, float *residuals_2)
same as train_residual, also output 2nd level residuals
Definition: IndexIVFPQ.cpp:71
bool do_polysemous_training
reorder PQ centroids after training?
Definition: IndexIVFPQ.h:34
size_t scan_table_threshold
use table computation or on-the-fly?
Definition: IndexIVFPQ.h:38
void train_residual(idx_t n, const float *x) override
trains the two product quantizers
void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx=nullptr)
same as add_with_ids, but optionally use the precomputed list ids
uint32_t * limits
size nlist + 1
Definition: IndexIVFPQ.h:217
std::vector< float > precomputed_table
Definition: IndexIVFPQ.h:45
int polysemous_ht
Hamming thresh for polysemous filtering.
Definition: IndexIVFPQ.h:40
void reset() override
removes all elements from the database.
void add_with_ids(idx_t n, const float *x, const long *xids=nullptr) override
Definition: IndexIVFPQ.cpp:186
std::vector< std::vector< long > > ids
Inverted lists for indexes.
Definition: IndexIVF.h:55
int d
vector dimension
Definition: Index.h:64
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:49
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVFPQ.h:39
Alloc_type_t
how were the compact tables allocated?
Definition: IndexIVFPQ.h:209
std::vector< uint8_t > refine_codes
corresponding codes
Definition: IndexIVFPQ.h:161
void train_residual(idx_t n, const float *x) override
trains the product quantizer
Definition: IndexIVFPQ.cpp:65
void encode_multiple(size_t n, long *keys, const float *x, uint8_t *codes, bool compute_keys=false) const
Definition: IndexIVFPQ.cpp:150
void train(idx_t, const float *) override
Trains the quantizer and calls train_residual to train sub-quantizers.
long idx_t
all indices are this type
Definition: Index.h:62
optimizes the order of indices in a ProductQuantizer
bool by_residual
Encode residual or plain vector?
Definition: IndexIVFPQ.h:30
ProductQuantizer pq
produces the codes
Definition: IndexIVFPQ.h:32
size_t nlist
number of possible key values
Definition: IndexIVF.h:46
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
Definition: IndexIVFPQ.cpp:274
void add_core_o(idx_t n, const float *x, const long *xids, float *residuals_2, const long *precomputed_idx=nullptr)
Definition: IndexIVFPQ.cpp:192
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs) const override
Definition: IndexIVFPQ.cpp:931
void reset() override
removes all elements from the database.
void add_with_ids(idx_t n, const float *x, const long *xids) override
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
size_t find_duplicates(idx_t *ids, size_t *lims) const
float k_factor
factor between k requested in search and the k requested from the IVFPQ
Definition: IndexIVFPQ.h:164
int use_precomputed_table
if by_residual, build precompute tables
Definition: IndexIVFPQ.h:31