Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/hoss/faiss/ProductQuantizer.h
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 // -*- c++ -*-
9 
10 #ifndef FAISS_PRODUCT_QUANTIZER_H
11 #define FAISS_PRODUCT_QUANTIZER_H
12 
13 #include <stdint.h>
14 
15 #include <vector>
16 
17 #include "Clustering.h"
18 #include "Heap.h"
19 
20 namespace faiss {
21 
22 /** Product Quantizer. Implemented only for METRIC_L2 */
23 struct ProductQuantizer {
24 
25  using idx_t = Index::idx_t;
26 
27  size_t d; ///< size of the input vectors
28  size_t M; ///< number of subquantizers
29  size_t nbits; ///< number of bits per quantization index
30 
31  // values derived from the above
32  size_t dsub; ///< dimensionality of each subvector
33  size_t code_size; ///< byte per indexed vector
34  size_t ksub; ///< number of centroids for each subquantizer
35  bool verbose; ///< verbose during training?
36 
37 
38  /// how the centroids are initialized for training
39  enum train_type_t {
40  Train_default,
41  Train_hot_start, ///< the centroids are already initialized
42  Train_shared, ///< share dictionary across PQ segments
43  Train_hypercube, ///< initialize centroids with nbits-D hypercube
44  Train_hypercube_pca, ///< initialize centroids with nbits-D hypercube
45  };
46  train_type_t train_type;
47 
48  ClusteringParameters cp; ///< parameters used during clustering
49 
50  /// if non-NULL, use this index for assignment (should be of size
51  /// d / M)
52  Index *assign_index;
53 
54  /// Centroid table, size M * ksub * dsub
55  std::vector<float> centroids;
56 
57  /// return a pointer to centroid i of subquantizer m (dsub floats in the centroid table)
58  float * get_centroids (size_t m, size_t i) {
59  return &centroids [(m * ksub + i) * dsub];
60  }
61  const float * get_centroids (size_t m, size_t i) const {
62  return &centroids [(m * ksub + i) * dsub];
63  }
64 
65  // Train the product quantizer on a set of points. A clustering
66  // can be set on input to define non-default clustering parameters
67  void train (int n, const float *x);
68 
69  ProductQuantizer(size_t d, /* dimensionality of the input vectors */
70  size_t M, /* number of subquantizers */
71  size_t nbits); /* number of bits per subvector index */
72 
73  ProductQuantizer ();
74 
75  /// compute derived values when d, M and nbits have been set
76  void set_derived_values ();
77 
78  /// Define the centroids for subquantizer m
79  void set_params (const float * centroids, int m);
80 
81  /// Quantize one vector with the product quantizer
82  void compute_code (const float * x, uint8_t * code) const ;
83 
84  /// same as compute_code for several vectors
85  void compute_codes (const float * x,
86  uint8_t * codes,
87  size_t n) const ;
88 
89  /// speed up code assignment using assign_index
90  /// (non-const because the index is changed)
91  void compute_codes_with_assign_index (
92  const float * x,
93  uint8_t * codes,
94  size_t n);
95 
96  /// decode a vector from a given code (or n vectors if third argument)
97  void decode (const uint8_t *code, float *x) const;
98  void decode (const uint8_t *code, float *x, size_t n) const;
99 
100  /// If we happen to have the distance tables precomputed, this is
101  /// more efficient to compute the codes.
102  void compute_code_from_distance_table (const float *tab,
103  uint8_t *code) const;
104 
105 
106  /** Compute distance table for one vector.
107  *
108  * The distance table for x = [x_0 x_1 .. x_(M-1)] is a M * ksub
109  * matrix that contains
110  *
111  * dis_table (m, j) = || x_m - c_(m, j)||^2
112  * for m = 0..M-1 and j = 0 .. ksub - 1
113  *
114  * where c_(m, j) is the centroid no j of sub-quantizer m.
115  *
116  * @param x input vector size d
117  * @param dis_table output table, size M * ksub
118  */
119  void compute_distance_table (const float * x,
120  float * dis_table) const;
121 
122  void compute_inner_prod_table (const float * x,
123  float * dis_table) const;
124 
125 
126  /** compute distance table for several vectors
127  * @param nx nb of input vectors
128  * @param x input vector size nx * d
129  * @param dis_tables output table, size nx * M * ksub
130  */
131  void compute_distance_tables (size_t nx,
132  const float * x,
133  float * dis_tables) const;
134 
135  void compute_inner_prod_tables (size_t nx,
136  const float * x,
137  float * dis_tables) const;
138 
139 
140  /** perform a search (L2 distance)
141  * @param x query vectors, size nx * d
142  * @param nx nb of queries
143  * @param codes database codes, size ncodes * code_size
143  * @param ncodes nb of database vectors
145  * @param res heap array to store results (nh == nx)
146  * @param init_finalize_heap initialize heap (input) and sort (output)?
147  */
148  void search (const float * x,
149  size_t nx,
150  const uint8_t * codes,
151  const size_t ncodes,
152  float_maxheap_array_t *res,
153  bool init_finalize_heap = true) const;
154 
155  /** same as search, but with inner product similarity */
156  void search_ip (const float * x,
157  size_t nx,
158  const uint8_t * codes,
159  const size_t ncodes,
160  float_minheap_array_t *res,
161  bool init_finalize_heap = true) const;
162 
163 
164  /// Symmetric Distance Table
165  std::vector<float> sdc_table;
166 
167  // initialize the SDC table from the centroids
168  void compute_sdc_table ();
169 
170  void search_sdc (const uint8_t * qcodes,
171  size_t nq,
172  const uint8_t * bcodes,
173  const size_t ncodes,
174  float_maxheap_array_t * res,
175  bool init_finalize_heap = true) const;
176 
177  struct PQEncoderGeneric {
178  uint8_t *code; ///< code for this vector
179  uint8_t offset;
180  const int nbits; ///< number of bits per subquantizer index
181 
182  uint8_t reg;
183 
184  PQEncoderGeneric(uint8_t *code, int nbits, uint8_t offset = 0);
185 
186  void encode(uint64_t x);
187 
188  ~PQEncoderGeneric();
189  };
190 
191 
192  struct PQEncoder8 {
193  uint8_t *code;
194 
195  PQEncoder8(uint8_t *code, int nbits);
196 
197  void encode(uint64_t x);
198  };
199 
200  struct PQEncoder16 {
201  uint16_t *code;
202 
203  PQEncoder16(uint8_t *code, int nbits);
204 
205  void encode(uint64_t x);
206  };
207 
208 
209  struct PQDecoderGeneric {
210  const uint8_t *code;
211  uint8_t offset;
212  const int nbits;
213  const uint64_t mask;
214  uint8_t reg;
215 
216  PQDecoderGeneric(const uint8_t *code, int nbits);
217 
218  uint64_t decode();
219  };
220 
221  struct PQDecoder8 {
222  const uint8_t *code;
223 
224  PQDecoder8(const uint8_t *code, int nbits);
225 
226  uint64_t decode();
227  };
228 
229  struct PQDecoder16 {
230  const uint16_t *code;
231 
232  PQDecoder16(const uint8_t *code, int nbits);
233 
234  uint64_t decode();
235  };
236 
237 };
238 
239 
240 } // namespace faiss
241 
242 
243 #endif
void set_params(const float *centroids, int m)
Define the centroids for subquantizer m.
initialize centroids with nbits-D hypercube
size_t nbits
number of bits per quantization index
void decode(const uint8_t *code, float *x) const
decode a vector from a given code (or n vectors if third argument)
initialize centroids with nbits-D hypercube
void set_derived_values()
compute derived values when d, M and nbits have been set
std::vector< float > sdc_table
Symmetric Distance Table.
share dictionary across PQ segments
size_t dsub
dimensionality of each subvector
void compute_distance_tables(size_t nx, const float *x, float *dis_tables) const
void compute_code_from_distance_table(const float *tab, uint8_t *code) const
void compute_codes(const float *x, uint8_t *codes, size_t n) const
same as compute_code for several vectors
long idx_t
all indices are this type
Definition: Index.h:62
void compute_distance_table(const float *x, float *dis_table) const
void search(const float *x, size_t nx, const uint8_t *codes, const size_t ncodes, float_maxheap_array_t *res, bool init_finalize_heap=true) const
size_t code_size
byte per indexed vector
uint8_t * code
code for this vector
const int nbits
number of bits per subquantizer index
size_t ksub
number of centroids for each subquantizer
void search_ip(const float *x, size_t nx, const uint8_t *codes, const size_t ncodes, float_minheap_array_t *res, bool init_finalize_heap=true) const
void compute_code(const float *x, uint8_t *code) const
Quantize one vector with the product quantizer.
the centroids are already initialized
ClusteringParameters cp
parameters used during clustering
size_t M
number of subquantizers
void compute_codes_with_assign_index(const float *x, uint8_t *codes, size_t n)
float * get_centroids(size_t m, size_t i)
return the centroids associated with subvector m
size_t d
size of the input vectors
bool verbose
verbose during training?
std::vector< float > centroids
Centroid table, size M * ksub * dsub.
train_type_t
initialization