Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/hoss/faiss/IndexIVF.h
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 // -*- c++ -*-
9 
10 #ifndef FAISS_INDEX_IVF_H
11 #define FAISS_INDEX_IVF_H
12 
13 
14 #include <vector>
15 
16 
17 #include "Index.h"
18 #include "InvertedLists.h"
19 #include "Clustering.h"
20 #include "Heap.h"
21 
22 
23 namespace faiss {
24 
25 
26 /** Encapsulates a quantizer object for the IndexIVF
27  *
28  * The class isolates the fields that are independent of the storage
29  * of the lists (especially training)
30  */
32  Index * quantizer; ///< quantizer that maps vectors to inverted lists
33  size_t nlist; ///< number of possible key values
34 
35  /**
36  * = 0: use the quantizer as index in a kmeans training
37  * = 1: just pass on the training set to the train() of the quantizer
38  * = 2: kmeans training on a flat index + add the centroids to the quantizer
39  */
41  bool own_fields; ///< whether object owns the quantizer
42 
43  ClusteringParameters cp; ///< to override default clustering params
44  Index *clustering_index; ///< to override index used during clustering
45 
46  /// Trains the quantizer and calls train_residual to train sub-quantizers
47  void train_q1 (size_t n, const float *x, bool verbose,
48  MetricType metric_type);
49 
51 
52  Level1Quantizer ();
53 
54  ~Level1Quantizer ();
55 
56 };
57 
58 
59 
61  size_t nprobe; ///< number of probes at query time
62  size_t max_codes; ///< max nb of codes to visit to do a query
63  virtual ~IVFSearchParameters () {}
64 };
65 
66 
67 
68 struct InvertedListScanner;
69 
70 /** Index based on an inverted file (IVF)
71  *
72  * In the inverted file, the quantizer (an Index instance) provides a
73  * quantization index for each vector to be added. The quantization
74  * index maps to a list (aka inverted list or posting list), where the
75  * id of the vector is stored.
76  *
77  * The inverted list object is required only after training. If none is
78  * set externally, an ArrayInvertedLists is used automatically.
79  *
80  * At search time, the vector to be searched is also quantized, and
81  * only the list corresponding to the quantization index is
82  * searched. This speeds up the search by making it
83  * non-exhaustive. This can be relaxed using multi-probe search: a few
84  * (nprobe) quantization indices are selected and several inverted
85  * lists are visited.
86  *
87  * Sub-classes implement a post-filtering of the index that refines
88  * the distance estimation from the query to database vectors.
89  */
91  /// Access to the actual data
93  bool own_invlists;
94 
95  size_t code_size; ///< code size per vector in bytes
96 
97  size_t nprobe; ///< number of probes at query time
98  size_t max_codes; ///< max nb of codes to visit to do a query
99 
100  /** Parallel mode determines how queries are parallelized with OpenMP
101  *
102  * 0 (default): parallelize over queries
103  * 1: parallelize over inverted lists
104  * 2: parallelize over both
105  */
107 
108  /// map for direct access to the elements. Enables reconstruct().
110  std::vector <idx_t> direct_map;
111 
112  /** The Inverted file takes a quantizer (an Index) on input,
113  * which implements the function mapping a vector to a list
114  * identifier. The pointer is borrowed: the quantizer should not
115  * be deleted while the IndexIVF is in use.
116  */
117  IndexIVF (Index * quantizer, size_t d,
118  size_t nlist, size_t code_size,
119  MetricType metric = METRIC_L2);
120 
121  void reset() override;
122 
123  /// Trains the quantizer and calls train_residual to train sub-quantizers
124  void train(idx_t n, const float* x) override;
125 
126  /// Calls add_with_ids with NULL ids
127  void add(idx_t n, const float* x) override;
128 
129  /// default implementation that calls encode_vectors
130  void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
131 
132  /** Encodes a set of vectors as they would appear in the inverted lists
133  *
134  * @param list_nos inverted list ids as returned by the
135  * quantizer (size n). -1s are ignored.
136  * @param codes output codes, size n * code_size
137  */
138  virtual void encode_vectors(idx_t n, const float* x,
139  const idx_t *list_nos,
140  uint8_t * codes) const = 0;
141 
142  /// Sub-classes that encode the residuals can train their encoders here
143  /// does nothing by default
144  virtual void train_residual (idx_t n, const float *x);
145 
146  /** search a set of vectors, that are pre-quantized by the IVF
147  * quantizer. Fill in the corresponding heaps with the query
148  * results. The default implementation uses InvertedListScanners
149  * to do the search.
150  *
151  * @param n nb of vectors to query
152  * @param x query vectors, size n * d
153  * @param assign coarse quantization indices, size n * nprobe
154  * @param centroid_dis
155  * distances to coarse centroids, size n * nprobe
156  * @param distance
157  * output distances, size n * k
158  * @param labels output labels, size n * k
159  * @param store_pairs store inv list index + inv list offset
160  * in upper/lower 32 bits of result,
161  * instead of ids (used for reranking).
162  * @param params used to override the object's search parameters
163  */
164  virtual void search_preassigned (idx_t n, const float *x, idx_t k,
165  const idx_t *assign,
166  const float *centroid_dis,
167  float *distances, idx_t *labels,
168  bool store_pairs,
169  const IVFSearchParameters *params=nullptr
170  ) const;
171 
172  /** assign the vectors, then call search_preassigned */
173  void search (idx_t n, const float *x, idx_t k,
174  float *distances, idx_t *labels) const override;
175 
176  void range_search (idx_t n, const float* x, float radius,
177  RangeSearchResult* result) const override;
178 
179  void range_search_preassigned(idx_t nx, const float *x, float radius,
180  const idx_t *keys, const float *coarse_dis,
181  RangeSearchResult *result) const;
182 
183  /// get a scanner for this index (store_pairs means ignore labels)
185  bool store_pairs=false) const;
186 
187  void reconstruct (idx_t key, float* recons) const override;
188 
189  /** Reconstruct a subset of the indexed vectors.
190  *
191  * Overrides default implementation to bypass reconstruct() which requires
192  * direct_map to be maintained.
193  *
194  * @param i0 first vector to reconstruct
195  * @param ni nb of vectors to reconstruct
196  * @param recons output array of reconstructed vectors, size ni * d
197  */
198  void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
199 
200  /** Similar to search, but also reconstructs the stored vectors (or an
201  * approximation in the case of lossy coding) for the search results.
202  *
203  * Overrides default implementation to avoid having to maintain direct_map
204  * and instead fetch the code offsets through the `store_pairs` flag in
205  * search_preassigned().
206  *
207  * @param recons reconstructed vectors size (n, k, d)
208  */
209  void search_and_reconstruct (idx_t n, const float *x, idx_t k,
210  float *distances, idx_t *labels,
211  float *recons) const override;
212 
213  /** Reconstruct a vector given the location in terms of (inv list index +
214  * inv list offset) instead of the id.
215  *
216  * Useful for reconstructing when the direct_map is not maintained and
217  * the inv list offset is computed by search_preassigned() with
218  * `store_pairs` set.
219  */
220  virtual void reconstruct_from_offset (idx_t list_no, idx_t offset,
221  float* recons) const;
222 
223 
224  /// Dataset manipulation functions
225 
226  idx_t remove_ids(const IDSelector& sel) override;
227 
228  /** check that the two indexes are compatible (ie, they are
229  * trained in the same way and have the same
230  * parameters). Otherwise throw. */
231  void check_compatible_for_merge (const IndexIVF &other) const;
232 
233  /** moves the entries from another dataset to self. On output,
234  * other is empty. add_id is added to all moved ids (for
235  * sequential ids, this would be this->ntotal) */
236  virtual void merge_from (IndexIVF &other, idx_t add_id);
237 
238  /** copy a subset of the entries of this index to the other index
239  *
240  * if subset_type == 0: copies ids in [a1, a2)
241  * if subset_type == 1: copies ids if id % a1 == a2
242  * if subset_type == 2: copies inverted lists such that a1
243  * elements are left before and a2 elements are after
244  */
245  virtual void copy_subset_to (IndexIVF & other, int subset_type,
246  idx_t a1, idx_t a2) const;
247 
248  ~IndexIVF() override;
249 
250  size_t get_list_size (size_t list_no) const
251  { return invlists->list_size(list_no); }
252 
253  /** initialize a direct map
254  *
255  * @param new_maintain_direct_map if true, create a direct map,
256  * else clear it
257  */
258  void make_direct_map (bool new_maintain_direct_map=true);
259 
260  /// replace the inverted lists, old one is deallocated if own_invlists
261  void replace_invlists (InvertedLists *il, bool own=false);
262 
263  IndexIVF ();
264 };
265 
266 struct RangeQueryResult;
267 
268 /** Object that handles a query. The inverted lists to scan are
269  * provided externally. The object has a lot of state, but
270  * distance_to_code and scan_codes can be called in multiple
271  * threads */
273 
274  using idx_t = Index::idx_t;
275 
276  /// from now on we handle this query.
277  virtual void set_query (const float *query_vector) = 0;
278 
279  /// following codes come from this inverted list
280  virtual void set_list (idx_t list_no, float coarse_dis) = 0;
281 
282  /// compute a single query-to-code distance
283  virtual float distance_to_code (const uint8_t *code) const = 0;
284 
285  /** scan a set of codes, compute distances to current query and
286  * update heap of results if necessary.
287  *
288  * @param n number of codes to scan
289  * @param codes codes to scan (n * code_size)
290  * @param ids corresponding ids (ignored if store_pairs)
291  * @param distances heap distances (size k)
292  * @param labels heap labels (size k)
293  * @param k heap size
294  * @return number of heap updates performed
295  */
296  virtual size_t scan_codes (size_t n,
297  const uint8_t *codes,
298  const idx_t *ids,
299  float *distances, idx_t *labels,
300  size_t k) const = 0;
301 
302  /** scan a set of codes, compute distances to current query and
303  * update results if distances are below radius
304  *
305  * (default implementation fails) */
306  virtual void scan_codes_range (size_t n,
307  const uint8_t *codes,
308  const idx_t *ids,
309  float radius,
310  RangeQueryResult &result) const;
311 
312  virtual ~InvertedListScanner () {}
313 
314 };
315 
316 
318  size_t nq; // nb of queries run
319  size_t nlist; // nb of inverted lists scanned
320  size_t ndis; // nb of distances computed
321  size_t nheap_updates; // nb of times the heap was updated
322  double quantization_time; // time spent quantizing vectors (in ms)
323  double search_time; // time spent searching lists (in ms)
324 
325  IndexIVFStats () {reset (); }
326  void reset ();
327 };
328 
329 // global var that collects them all
330 extern IndexIVFStats indexIVF_stats;
331 
332 
333 } // namespace faiss
334 
335 
336 #endif
virtual void encode_vectors(idx_t n, const float *x, const idx_t *list_nos, uint8_t *codes) const =0
virtual void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const
Definition: IndexIVF.cpp:250
result structure for a single query
void check_compatible_for_merge(const IndexIVF &other) const
Definition: IndexIVF.cpp:710
void search_and_reconstruct(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, float *recons) const override
Definition: IndexIVF.cpp:595
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:97
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:34
idx_t remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
Definition: IndexIVF.cpp:648
virtual size_t list_size(size_t list_no) const =0
get the size of a list
void reconstruct(idx_t key, float *recons) const override
Definition: IndexIVF.cpp:562
virtual void train_residual(idx_t n, const float *x)
Definition: IndexIVF.cpp:703
void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
Definition: IndexIVF.cpp:434
int d
vector dimension
Definition: Index.h:66
long idx_t
all indices are this type
Definition: Index.h:62
virtual void copy_subset_to(IndexIVF &other, int subset_type, idx_t a1, idx_t a2) const
Definition: IndexIVF.cpp:748
virtual float distance_to_code(const uint8_t *code) const =0
compute a single query-to-code distance
void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:688
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
Definition: IndexIVF.cpp:574
virtual void set_list(idx_t list_no, float coarse_dis)=0
following codes come from this inverted list
void add_with_ids(idx_t n, const float *x, const idx_t *xids) override
default implementation that calls encode_vectors
Definition: IndexIVF.cpp:149
virtual InvertedListScanner * get_InvertedListScanner(bool store_pairs=false) const
get a scanner for this index (store_pairs means ignore labels)
Definition: IndexIVF.cpp:556
virtual size_t scan_codes(size_t n, const uint8_t *codes, const idx_t *ids, float *distances, idx_t *labels, size_t k) const =0
void replace_invlists(InvertedLists *il, bool own=false)
replace the inverted lists, old one is deallocated if own_invlists
Definition: IndexIVF.cpp:735
ClusteringParameters cp
to override default clustering params
Definition: IndexIVF.h:43
void reset() override
removes all elements from the database.
Definition: IndexIVF.cpp:640
Index * clustering_index
to override index used during clustering
Definition: IndexIVF.h:44
void train_q1(size_t n, const float *x, bool verbose, MetricType metric_type)
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:60
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:61
void make_direct_map(bool new_maintain_direct_map=true)
Definition: IndexIVF.cpp:202
InvertedLists * invlists
Access to the actual data.
Definition: IndexIVF.h:92
virtual void reconstruct_from_offset(idx_t list_no, idx_t offset, float *recons) const
Definition: IndexIVF.cpp:633
void add(idx_t n, const float *x) override
Calls add_with_ids with NULL ids.
Definition: IndexIVF.cpp:143
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVF.h:62
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:32
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVF.h:98
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
Definition: IndexIVF.h:109
bool own_fields
whether object owns the quantizer
Definition: IndexIVF.h:41
virtual void set_query(const float *query_vector)=0
from now on we handle this query.
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
Definition: IndexIVF.cpp:228
virtual void merge_from(IndexIVF &other, idx_t add_id)
Definition: IndexIVF.cpp:721
size_t nlist
number of possible key values
Definition: IndexIVF.h:33
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:95
virtual void scan_codes_range(size_t n, const uint8_t *codes, const idx_t *ids, float radius, RangeQueryResult &result) const
Definition: IndexIVF.cpp:830
MetricType
Some algorithms support both an inner product version and an L2 search version.
Definition: Index.h:44