Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/ProductQuantizer.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 // -*- c++ -*-
11 
12 #ifndef FAISS_PRODUCT_QUANTIZER_H
13 #define FAISS_PRODUCT_QUANTIZER_H
14 
15 #include <stdint.h>
16 
17 #include <vector>
18 
19 #include "Clustering.h"
20 #include "Heap.h"
21 
22 namespace faiss {
23 
24 /** Product Quantizer. Implemented only for METRIC_L2 */
26 
27  size_t d; ///< size of the input vectors
28  size_t M; ///< number of subquantizers
29  size_t nbits; ///< number of bits per quantization index
30 
31  // values derived from the above
32  size_t dsub; ///< dimensionality of each subvector
33  size_t byte_per_idx; ///< nb bytes per code component (1 or 2)
34  size_t code_size; ///< byte per indexed vector
35  size_t ksub; ///< number of centroids for each subquantizer
36  bool verbose; ///< verbose during training?
37 
38 
39  /// initialization
40  enum train_type_t {
41  Train_default,
42  Train_hot_start, ///< the centroids are already initialized
43  Train_shared, ///< share dictionary across PQ segments
44  Train_hypercube, ///< initialize centroids with nbits-D hypercube
45  Train_hypercube_pca, ///< initialize centroids with nbits-D hypercube (PCA-based variant judging by the name -- TODO confirm against the implementation)
46  };
47  train_type_t train_type;
48 
49  ClusteringParameters cp; ///< parameters used during clustering
50 
51  /// if non-NULL, use this index for assignment (should be of size
52  /// d / M)
54 
55  /// Centroid table, size M * ksub * dsub
56  std::vector<float> centroids;
57 
58  /// return a pointer to centroid i of subquantizer m (dsub consecutive floats)
59  float * get_centroids (size_t m, size_t i) {
60  return &centroids [(m * ksub + i) * dsub]; // flat offset of centroid i within subquantizer m; m and i are not range-checked
61  }
62  const float * get_centroids (size_t m, size_t i) const {
63  return &centroids [(m * ksub + i) * dsub]; // same lookup as the non-const overload, giving read-only access
64  }
65 
66  // Train the product quantizer on a set of points. A clustering
67  // can be set on input to define non-default clustering parameters
68  void train (int n, const float *x);
69 
70  ProductQuantizer(size_t d, /* dimensionality of the input vectors */
71  size_t M, /* number of subquantizers */
72  size_t nbits); /* number of bits per subvector index */
73 
74  ProductQuantizer ();
75 
76  /// compute derived values when d, M and nbits have been set
77  void set_derived_values ();
78 
79  /// Define the centroids for subquantizer m
80  void set_params (const float * centroids, int m);
81 
82  /// Quantize one vector with the product quantizer
83  void compute_code (const float * x, uint8_t * code) const ;
84 
85  /// same as compute_code for several vectors
86  void compute_codes (const float * x,
87  uint8_t * codes,
88  size_t n) const ;
89 
90  /// decode a vector from a given code (or n vectors if third argument)
91  void decode (const uint8_t *code, float *x) const;
92  void decode (const uint8_t *code, float *x, size_t n) const;
93 
94  /// If we happen to have the distance tables precomputed, this is
95  /// more efficient to compute the codes.
96  void compute_code_from_distance_table (const float *tab,
97  uint8_t *code) const;
98 
99 
100  /** Compute distance table for one vector.
101  *
102  * The distance table for x = [x_0 x_1 .. x_(M-1)] is a M * ksub
103  * matrix that contains
104  *
105  * dis_table (m, j) = || x_m - c_(m, j)||^2
106  * for m = 0..M-1 and j = 0 .. ksub - 1
107  *
108  * where c_(m, j) is the centroid no j of sub-quantizer m.
109  *
110  * @param x input vector size d
111  * @param dis_table output table, size M * ksub
112  */
113  void compute_distance_table (const float * x,
114  float * dis_table) const;
115 
116  void compute_inner_prod_table (const float * x,
117  float * dis_table) const;
118 
119 
120  /** compute distance table for several vectors
121  * @param nx nb of input vectors
122  * @param x input vector size nx * d
123  * @param dis_tables output table, size nx * M * ksub
124  */
125  void compute_distance_tables (size_t nx,
126  const float * x,
127  float * dis_tables) const;
128 
129  void compute_inner_prod_tables (size_t nx,
130  const float * x,
131  float * dis_tables) const;
132 
133 
134  /** perform a search (L2 distance)
135  * @param x query vectors, size nx * d
136  * @param nx nb of queries
137  * @param codes database codes, size ncodes * code_size
138  * @param ncodes nb of database vectors
139  * @param res heap array to store results (nh == nx)
140  * @param init_finalize_heap initialize heap (input) and sort (output)?
141  */
142  void search (const float * x,
143  size_t nx,
144  const uint8_t * codes,
145  const size_t ncodes,
146  float_maxheap_array_t *res,
147  bool init_finalize_heap = true) const;
148 
149  /** same as search, but with inner product similarity */
150  void search_ip (const float * x,
151  size_t nx,
152  const uint8_t * codes,
153  const size_t ncodes,
154  float_minheap_array_t *res,
155  bool init_finalize_heap = true) const;
156 
157 
158  /// Symmetric Distance Table
159  std::vector<float> sdc_table;
160 
161  // initialize the SDC table from the centroids
162  void compute_sdc_table ();
163 
164  void search_sdc (const uint8_t * qcodes,
165  size_t nq,
166  const uint8_t * bcodes,
167  const size_t ncodes,
168  float_maxheap_array_t * res,
169  bool init_finalize_heap = true) const;
170 
171 };
172 
173 
174 
175 } // namespace faiss
176 
177 
178 #endif
void set_params(const float *centroids, int m)
Define the centroids for subquantizer m.
initialize centroids with nbits-D hypercube
size_t nbits
number of bits per quantization index
void decode(const uint8_t *code, float *x) const
decode a vector from a given code (or n vectors if third argument)
size_t byte_per_idx
nb bytes per code component (1 or 2)
initialize centroids with nbits-D hypercube
void set_derived_values()
compute derived values when d, M and nbits have been set
std::vector< float > sdc_table
Symmetric Distance Table.
share dictionary across PQ segments
size_t dsub
dimensionality of each subvector
void compute_distance_tables(size_t nx, const float *x, float *dis_tables) const
void compute_code_from_distance_table(const float *tab, uint8_t *code) const
void compute_codes(const float *x, uint8_t *codes, size_t n) const
same as compute_code for several vectors
void compute_distance_table(const float *x, float *dis_table) const
void search(const float *x, size_t nx, const uint8_t *codes, const size_t ncodes, float_maxheap_array_t *res, bool init_finalize_heap=true) const
size_t code_size
byte per indexed vector
size_t ksub
number of centroids for each subquantizer
void search_ip(const float *x, size_t nx, const uint8_t *codes, const size_t ncodes, float_minheap_array_t *res, bool init_finalize_heap=true) const
void compute_code(const float *x, uint8_t *code) const
Quantize one vector with the product quantizer.
the centroids are already initialized
ClusteringParameters cp
parameters used during clustering
size_t M
number of subquantizers
float * get_centroids(size_t m, size_t i)
return the centroids associated with subvector m
size_t d
size of the input vectors
bool verbose
verbose during training?
std::vector< float > centroids
Centroid table, size M * ksub * dsub.
train_type_t
initialization