Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/hoss/faiss/IndexIVFPQ.h
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 // -*- c++ -*-
9 
10 #ifndef FAISS_INDEX_IVFPQ_H
11 #define FAISS_INDEX_IVFPQ_H
12 
13 
14 #include <vector>
15 
16 #include "IndexIVF.h"
17 #include "IndexPQ.h"
18 
19 
20 namespace faiss {
21 
23  size_t scan_table_threshold; ///< use table computation or on-the-fly?
24  int polysemous_ht; ///< Hamming thresh for polysemous filtering
26 };
27 
28 
29 
30 
31 /** Inverted file with Product Quantizer encoding. Each residual
32  * vector is encoded as a product quantizer code.
33  */
35  bool by_residual; ///< Encode residual or plain vector?
36 
37  ProductQuantizer pq; ///< produces the codes
38 
39  bool do_polysemous_training; ///< reorder PQ centroids after training?
40  PolysemousTraining *polysemous_training; ///< if NULL, use default
41 
42  // search-time parameters
43  size_t scan_table_threshold; ///< use table computation or on-the-fly?
44  int polysemous_ht; ///< Hamming thresh for polysemous filtering
45 
46  /** Precomputed tables that speed up query preprocessing at some
47  * memory cost
48  * =-1: force disable
49  * =0: decide heuristically (default: use tables only if they are
50  * < precomputed_tables_max_bytes)
51  * =1: tables that work for all quantizers (size 256 * nlist * M)
52  * =2: specific version for MultiIndexQuantizer (much more compact)
53  */
54  int use_precomputed_table; ///< if by_residual, build precompute tables
55  static size_t precomputed_table_max_bytes;
56 
57  /// if use_precomputed_table
58  /// size nlist * pq.M * pq.ksub
59  std::vector <float> precomputed_table;
60 
61  IndexIVFPQ (
62  Index * quantizer, size_t d, size_t nlist,
63  size_t M, size_t nbits_per_idx);
64 
65  void add_with_ids(idx_t n, const float* x, const long* xids = nullptr)
66  override;
67 
68  void encode_vectors(idx_t n, const float* x,
69  const idx_t *list_nos,
70  uint8_t * codes) const override;
71 
72  /// same as add_core, also:
73  /// - output 2nd level residuals if residuals_2 != NULL
74  /// - use precomputed list numbers if precomputed_idx != NULL
75  void add_core_o (idx_t n, const float *x,
76  const long *xids, float *residuals_2,
77  const long *precomputed_idx = nullptr);
78 
79  /// trains the product quantizer
80  void train_residual(idx_t n, const float* x) override;
81 
82  /// same as train_residual, also output 2nd level residuals
83  void train_residual_o (idx_t n, const float *x, float *residuals_2);
84 
85  void reconstruct_from_offset (long list_no, long offset,
86  float* recons) const override;
87 
88  /** Find exact duplicates in the dataset.
89  *
90  * the duplicates are returned in pre-allocated arrays (see the
91  * max sizes).
92  *
93  * @param lims limits between groups of duplicates
94  * (max size ntotal / 2 + 1)
95  * @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
96  * duplicates (max size ntotal)
97  * @return n number of groups found
98  */
99  size_t find_duplicates (idx_t *ids, size_t *lims) const;
100 
101  // map a vector to a binary code knowing the index
102  void encode (long key, const float * x, uint8_t * code) const;
103 
104  /** Encode multiple vectors
105  *
106  * @param n nb vectors to encode
107  * @param keys posting list ids for those vectors (size n)
108  * @param x vectors (size n * d)
109  * @param codes output codes (size n * code_size)
110  * @param compute_keys if false, assume keys are precomputed,
111  * otherwise compute them
112  */
113  void encode_multiple (size_t n, long *keys,
114  const float * x, uint8_t * codes,
115  bool compute_keys = false) const;
116 
117  /// inverse of encode_multiple
118  void decode_multiple (size_t n, const long *keys,
119  const uint8_t * xcodes, float * x) const;
120 
121  InvertedListScanner *get_InvertedListScanner (bool store_pairs)
122  const override;
123 
124  /// build precomputed table
125  void precompute_table ();
126 
127  IndexIVFPQ ();
128 
129 };
130 
131 
132 /// statistics are robust to internal threading, but not if
133 /// IndexIVFPQ::search_preassigned is called by multiple threads
135  size_t nrefine; // nb of refines (IVFPQR)
136 
137  size_t n_hamming_pass;
138  // nb of passed Hamming distance tests (for polysemous)
139 
140  // timings measured with the CPU RTC
141  // on all threads
142  size_t search_cycles;
143  size_t refine_cycles; // only for IVFPQR
144 
145  IndexIVFPQStats () {reset (); }
146  void reset ();
147 };
148 
149 // global var that collects them all
150 extern IndexIVFPQStats indexIVFPQ_stats;
151 
152 
153 
154 /** Index with an additional level of PQ refinement */
156  ProductQuantizer refine_pq; ///< 3rd level quantizer
157  std::vector <uint8_t> refine_codes; ///< corresponding codes
158 
159  /// factor between k requested in search and the k requested from the IVFPQ
160  float k_factor;
161 
162  IndexIVFPQR (
163  Index * quantizer, size_t d, size_t nlist,
164  size_t M, size_t nbits_per_idx,
165  size_t M_refine, size_t nbits_per_idx_refine);
166 
167  void reset() override;
168 
169  long remove_ids(const IDSelector& sel) override;
170 
171  /// trains the two product quantizers
172  void train_residual(idx_t n, const float* x) override;
173 
174  void add_with_ids(idx_t n, const float* x, const long* xids) override;
175 
176  /// same as add_with_ids, but optionally use the precomputed list ids
177  void add_core (idx_t n, const float *x, const long *xids,
178  const long *precomputed_idx = nullptr);
179 
180  void reconstruct_from_offset (long list_no, long offset,
181  float* recons) const override;
182 
183  void merge_from (IndexIVF &other, idx_t add_id) override;
184 
185 
186  void search_preassigned (idx_t n, const float *x, idx_t k,
187  const idx_t *assign,
188  const float *centroid_dis,
189  float *distances, idx_t *labels,
190  bool store_pairs,
191  const IVFSearchParameters *params=nullptr
192  ) const override;
193 
194  IndexIVFPQR();
195 };
196 
197 
198 
199 /** Same as an IndexIVFPQ without the inverted lists: codes are stored sequentially
200  *
201  * The class is mainly intended to store encoded vectors that can be
202  * accessed randomly, the search function is not implemented.
203  */
205  /// first level quantizer
206  Level1Quantizer q1;
207 
208  /// second level quantizer is always a PQ
209  ProductQuantizer pq;
210 
211  /// Codes. Size ntotal * code_size.
212  std::vector<uint8_t> codes;
213 
214  /// size of the code for the first level (ceil(log8(q1.nlist)))
215  size_t code_size_1;
216 
217  /// size of the code for the second level
218  size_t code_size_2;
219 
220  /// code_size_1 + code_size_2
221  size_t code_size;
222 
223  Index2Layer (Index * quantizer, size_t nlist,
224  int M, MetricType metric = METRIC_L2);
225 
226  Index2Layer ();
227  ~Index2Layer ();
228 
229  void train(idx_t n, const float* x) override;
230 
231  void add(idx_t n, const float* x) override;
232 
233  /// not implemented
234  void search(
235  idx_t n,
236  const float* x,
237  idx_t k,
238  float* distances,
239  idx_t* labels) const override;
240 
241  void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
242 
243  void reconstruct(idx_t key, float* recons) const override;
244 
245  void reset() override;
246 
247  /// transfer the flat codes to an IVFPQ index
248  void transfer_to_IVFPQ(IndexIVFPQ & other) const;
249 
250 };
251 
252 
253 } // namespace faiss
254 
255 
256 #endif
void precompute_table()
build precomputed table
Definition: IndexIVFPQ.cpp:363
int polysemous_ht
Hamming thresh for polysemous filtering.
Definition: IndexIVFPQ.h:24
void merge_from(IndexIVF &other, idx_t add_id) override
void transfer_to_IVFPQ(IndexIVFPQ &other) const
transfer the flat codes to an IVFPQ index
size_t code_size_2
size of the code for the second level
Definition: IndexIVFPQ.h:218
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
ProductQuantizer refine_pq
3rd level quantizer
Definition: IndexIVFPQ.h:156
PolysemousTraining * polysemous_training
if NULL, use default
Definition: IndexIVFPQ.h:40
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
size_t code_size
code_size_1 + code_size_2
Definition: IndexIVFPQ.h:221
void reset() override
removes all elements from the database.
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:34
void decode_multiple(size_t n, const long *keys, const uint8_t *xcodes, float *x) const
inverse of encode_multiple
Definition: IndexIVFPQ.cpp:159
void train_residual_o(idx_t n, const float *x, float *residuals_2)
same as train_residual, also output 2nd level residuals
Definition: IndexIVFPQ.cpp:66
bool do_polysemous_training
reorder PQ centroids after training?
Definition: IndexIVFPQ.h:39
size_t scan_table_threshold
use table computation or on-the-fly?
Definition: IndexIVFPQ.h:43
void train_residual(idx_t n, const float *x) override
trains the two product quantizers
void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx=nullptr)
same as add_with_ids, but optionally use the precomputed list ids
std::vector< float > precomputed_table
Definition: IndexIVFPQ.h:59
Level1Quantizer q1
first level quantizer
Definition: IndexIVFPQ.h:206
void encode_vectors(idx_t n, const float *x, const idx_t *list_nos, uint8_t *codes) const override
Definition: IndexIVFPQ.cpp:206
int polysemous_ht
Hamming thresh for polysemous filtering.
Definition: IndexIVFPQ.h:44
InvertedListScanner * get_InvertedListScanner(bool store_pairs) const override
get a scanner for this index (store_pairs means ignore labels)
void reset() override
removes all elements from the database.
std::vector< uint8_t > codes
Codes. Size ntotal * code_size.
Definition: IndexIVFPQ.h:212
void add_with_ids(idx_t n, const float *x, const long *xids=nullptr) override
default implementation that calls encode_vectors
Definition: IndexIVFPQ.cpp:182
int d
vector dimension
Definition: Index.h:66
long idx_t
all indices are this type
Definition: Index.h:62
std::vector< uint8_t > refine_codes
corresponding codes
Definition: IndexIVFPQ.h:157
void train_residual(idx_t n, const float *x) override
trains the product quantizer
Definition: IndexIVFPQ.cpp:60
static size_t precomputed_table_max_bytes
2G by default, accommodates tables up to PQ32 w/ 65536 centroids
Definition: IndexIVFPQ.h:55
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
Definition: IndexIVFPQ.cpp:310
void encode_multiple(size_t n, long *keys, const float *x, uint8_t *codes, bool compute_keys=false) const
Definition: IndexIVFPQ.cpp:149
size_t scan_table_threshold
use table computation or on-the-fly?
Definition: IndexIVFPQ.h:23
void train(idx_t n, const float *x) override
optimizes the order of indices in a ProductQuantizer
void add(idx_t n, const float *x) override
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
not implemented
bool by_residual
Encode residual or plain vector?
Definition: IndexIVFPQ.h:35
ProductQuantizer pq
produces the codes
Definition: IndexIVFPQ.h:37
size_t code_size_1
size of the code for the first level (ceil(log8(q1.nlist)))
Definition: IndexIVFPQ.h:215
void add_core_o(idx_t n, const float *x, const long *xids, float *residuals_2, const long *precomputed_idx=nullptr)
Definition: IndexIVFPQ.cpp:220
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const override
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:32
ProductQuantizer pq
second level quantizer is always a PQ
Definition: IndexIVFPQ.h:209
void add_with_ids(idx_t n, const float *x, const long *xids) override
default implementation that calls encode_vectors
void reconstruct(idx_t key, float *recons) const override
size_t nlist
number of possible key values
Definition: IndexIVF.h:33
size_t find_duplicates(idx_t *ids, size_t *lims) const
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:44
float k_factor
factor between k requested in search and the k requested from the IVFPQ
Definition: IndexIVFPQ.h:160
int use_precomputed_table
if by_residual, build precompute tables
Definition: IndexIVFPQ.h:54