Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/tmp/faiss/IndexIVF.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // -*- c++ -*-
10 
11 #ifndef FAISS_INDEX_IVF_H
12 #define FAISS_INDEX_IVF_H
13 
14 
15 #include <vector>
16 
17 
18 #include "Index.h"
19 #include "InvertedLists.h"
20 #include "Clustering.h"
21 #include "Heap.h"
22 
23 
24 namespace faiss {
25 
26 
27 /** Encapsulates a quantizer object for the IndexIVF
28  *
29  * The class isolates the fields that are independent of the storage
30  * of the lists (especially training)
31  */
33  Index * quantizer; ///< quantizer that maps vectors to inverted lists
34  size_t nlist; ///< number of possible key values
35 
36  /**
37  * = 0: use the quantizer as index in a kmeans training
38  * = 1: just pass on the training set to the train() of the quantizer
39  * = 2: kmeans training on a flat index + add the centroids to the quantizer
40  */
42  bool own_fields; ///< whether object owns the quantizer
43 
44  ClusteringParameters cp; ///< to override default clustering params
45  Index *clustering_index; ///< to override index used during clustering
46 
47  /// Trains the quantizer and calls train_residual to train sub-quantizers
48  void train_q1 (size_t n, const float *x, bool verbose,
49  MetricType metric_type);
50 
52 
53  Level1Quantizer ();
54 
55  ~Level1Quantizer ();
56 
57 };
58 
59 
60 
62  size_t nprobe; ///< number of probes at query time
63  size_t max_codes; ///< max nb of codes to visit to do a query
64  virtual ~IVFSearchParameters () {}
65 };
66 
67 
68 
69 struct InvertedListScanner;
70 
71 /** Index based on an inverted file (IVF)
72  *
73  * In the inverted file, the quantizer (an Index instance) provides a
74  * quantization index for each vector to be added. The quantization
75  * index maps to a list (aka inverted list or posting list), where the
76  * id of the vector is stored.
77  *
78  * The inverted list object is required only after training. If none is
79  * set externally, an ArrayInvertedLists is used automatically.
80  *
81  * At search time, the vector to be searched is also quantized, and
82  * only the list corresponding to the quantization index is
83  * searched. This speeds up the search by making it
84  * non-exhaustive. This can be relaxed using multi-probe search: a few
85  * (nprobe) quantization indices are selected and several inverted
86  * lists are visited.
87  *
88  * Sub-classes implement a post-filtering of the index that refines
89  * the distance estimation from the query to database vectors.
90  */
92  /// Access to the actual data
94  bool own_invlists;
95 
96  size_t code_size; ///< code size per vector in bytes
97 
98  size_t nprobe; ///< number of probes at query time
99  size_t max_codes; ///< max nb of codes to visit to do a query
100 
101  /// map for direct access to the elements. Enables reconstruct().
103  std::vector <long> direct_map;
104 
105  /** The Inverted file takes a quantizer (an Index) on input,
106  * which implements the function mapping a vector to a list
107  * identifier. The pointer is borrowed: the quantizer should not
108  * be deleted while the IndexIVF is in use.
109  */
110  IndexIVF (Index * quantizer, size_t d,
111  size_t nlist, size_t code_size,
112  MetricType metric = METRIC_L2);
113 
114  void reset() override;
115 
116  /// Trains the quantizer and calls train_residual to train sub-quantizers
117  void train(idx_t n, const float* x) override;
118 
119  /// Calls add_with_ids with NULL ids
120  void add(idx_t n, const float* x) override;
121 
122  /** Encodes a set of vectors as they would appear in the inverted lists
123  *
124  * @param list_nos inverted list ids as returned by the
125  * quantizer (size n). -1s are ignored.
126  * @param codes output codes, size n * code_size
127  */
128  virtual void encode_vectors(idx_t n, const float* x,
129  const idx_t *list_nos,
130  uint8_t * codes) const = 0;
131 
132  /// Sub-classes that encode the residuals can train their encoders here
133  /// does nothing by default
134  virtual void train_residual (idx_t n, const float *x);
135 
136  /** search a set of vectors, that are pre-quantized by the IVF
137  * quantizer. Fill in the corresponding heaps with the query
138  * results. The default implementation uses InvertedListScanners
139  * to do the search.
140  *
141  * @param n nb of vectors to query
142  * @param x query vectors, size n * d
143  * @param assign coarse quantization indices, size n * nprobe
144  * @param centroid_dis
145  * distances to coarse centroids, size n * nprobe
146  * @param distances
147  * output distances, size n * k
148  * @param labels output labels, size n * k
149  * @param store_pairs store inv list index + inv list offset
150  * in upper/lower 32 bits of result,
151  * instead of ids (used for reranking).
152  * @param params used to override the object's search parameters
153  */
154  virtual void search_preassigned (idx_t n, const float *x, idx_t k,
155  const idx_t *assign,
156  const float *centroid_dis,
157  float *distances, idx_t *labels,
158  bool store_pairs,
159  const IVFSearchParameters *params=nullptr
160  ) const;
161 
162  /** assign the vectors, then call search_preassigned */
163  virtual void search (idx_t n, const float *x, idx_t k,
164  float *distances, idx_t *labels) const override;
165 
166  /// get a scanner for this index (store_pairs means ignore labels)
168  bool store_pairs=false) const {
169  return nullptr;
170  }
171 
172  void reconstruct (idx_t key, float* recons) const override;
173 
174  /** Reconstruct a subset of the indexed vectors.
175  *
176  * Overrides default implementation to bypass reconstruct() which requires
177  * direct_map to be maintained.
178  *
179  * @param i0 first vector to reconstruct
180  * @param ni nb of vectors to reconstruct
181  * @param recons output array of reconstructed vectors, size ni * d
182  */
183  void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
184 
185  /** Similar to search, but also reconstructs the stored vectors (or an
186  * approximation in the case of lossy coding) for the search results.
187  *
188  * Overrides default implementation to avoid having to maintain direct_map
189  * and instead fetch the code offsets through the `store_pairs` flag in
190  * search_preassigned().
191  *
192  * @param recons reconstructed vectors size (n, k, d)
193  */
194  void search_and_reconstruct (idx_t n, const float *x, idx_t k,
195  float *distances, idx_t *labels,
196  float *recons) const override;
197 
198  /** Reconstruct a vector given the location in terms of (inv list index +
199  * inv list offset) instead of the id.
200  *
201  * Useful for reconstructing when the direct_map is not maintained and
202  * the inv list offset is computed by search_preassigned() with
203  * `store_pairs` set.
204  */
205  virtual void reconstruct_from_offset (long list_no, long offset,
206  float* recons) const;
207 
208 
209  /// Dataset manipulation functions
210 
211  long remove_ids(const IDSelector& sel) override;
212 
213  /** check that the two indexes are compatible (ie, they are
214  * trained in the same way and have the same
215  * parameters). Otherwise throw. */
216  void check_compatible_for_merge (const IndexIVF &other) const;
217 
218  /** moves the entries from another dataset to self. On output,
219  * other is empty. add_id is added to all moved ids (for
220  * sequential ids, this would be this->ntotal) */
221  virtual void merge_from (IndexIVF &other, idx_t add_id);
222 
223  /** copy a subset of the entries of this index to the other index
224  *
225  * if subset_type == 0: copies ids in [a1, a2)
226  * if subset_type == 1: copies ids if id % a1 == a2
227  * if subset_type == 2: copies inverted lists such that a1
228  * elements are left before and a2 elements are after
229  */
230  virtual void copy_subset_to (IndexIVF & other, int subset_type,
231  long a1, long a2) const;
232 
233  ~IndexIVF() override;
234 
235  size_t get_list_size (size_t list_no) const
236  { return invlists->list_size(list_no); }
237 
238  /** initialize a direct map
239  *
240  * @param new_maintain_direct_map if true, create a direct map,
241  * else clear it
242  */
243  void make_direct_map (bool new_maintain_direct_map=true);
244 
245  /// 1= perfectly balanced, >1: imbalanced
246  double imbalance_factor () const;
247 
248  /// display some stats about the inverted lists
249  void print_stats () const;
250 
251  /// replace the inverted lists, old one is deallocated if own_invlists
252  void replace_invlists (InvertedLists *il, bool own=false);
253 
254  IndexIVF ();
255 };
256 
257 /** Object that handles a query. The inverted lists to scan are
258  * provided externally. The object has a lot of state, but
259  * distance_to_code and scan_codes can be called in multiple
260  * threads */
262 
263  using idx_t = Index::idx_t;
264 
265  /// from now on we handle this query.
266  virtual void set_query (const float *query_vector) = 0;
267 
268  /// following codes come from this inverted list
269  virtual void set_list (idx_t list_no, float coarse_dis) = 0;
270 
271  /// compute a single query-to-code distance
272  virtual float distance_to_code (const uint8_t *code) const = 0;
273 
274  /** compute the distances to codes. (distances, labels) should be
275  * organized as a min- or max-heap
276  *
277  * @param n number of codes to scan
278  * @param codes codes to scan (n * code_size)
279  * @param ids corresponding ids (ignored if store_pairs)
280  * @param distances heap distances (size k)
281  * @param labels heap labels (size k)
282  * @param k heap size
283  */
284  virtual size_t scan_codes (size_t n,
285  const uint8_t *codes,
286  const idx_t *ids,
287  float *distances, idx_t *labels,
288  size_t k) const = 0;
289 
290  virtual ~InvertedListScanner () {}
291 
292 };
293 
294 
296  size_t nq; // nb of queries run
297  size_t nlist; // nb of inverted lists scanned
298  size_t ndis; // nb of distances computed
299  size_t nheap_updates; // nb of times the heap was updated
300 
301  IndexIVFStats () {reset (); }
302  void reset ();
303 };
304 
305 // global var that collects them all
306 extern IndexIVFStats indexIVF_stats;
307 
308 
309 } // namespace faiss
310 
311 
312 #endif
virtual void encode_vectors(idx_t n, const float *x, const idx_t *list_nos, uint8_t *codes) const =0
virtual void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, const IVFSearchParameters *params=nullptr) const
Definition: IndexIVF.cpp:189
void check_compatible_for_merge(const IndexIVF &other) const
Definition: IndexIVF.cpp:461
double imbalance_factor() const
1= perfectly balanced, >1: imbalanced
Definition: IndexIVF.cpp:431
void search_and_reconstruct(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, float *recons) const override
Definition: IndexIVF.cpp:315
virtual void copy_subset_to(IndexIVF &other, int subset_type, long a1, long a2) const
Definition: IndexIVF.cpp:499
virtual void reconstruct_from_offset(long list_no, long offset, float *recons) const
Definition: IndexIVF.cpp:353
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:98
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:35
virtual size_t list_size(size_t list_no) const =0
get the size of a list
void reconstruct(idx_t key, float *recons) const override
Definition: IndexIVF.cpp:284
virtual void train_residual(idx_t n, const float *x)
Definition: IndexIVF.cpp:423
int d
vector dimension
Definition: Index.h:66
virtual float distance_to_code(const uint8_t *code) const =0
compute a single query-to-code distance
void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:408
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
Definition: IndexIVF.cpp:294
virtual void set_list(idx_t list_no, float coarse_dis)=0
following codes come from this inverted list
virtual size_t scan_codes(size_t n, const uint8_t *codes, const idx_t *ids, float *distances, idx_t *labels, size_t k) const =0
long idx_t
all indices are this type
Definition: Index.h:64
void replace_invlists(InvertedLists *il, bool own=false)
replace the inverted lists, old one is deallocated if own_invlists
Definition: IndexIVF.cpp:486
ClusteringParameters cp
to override default clustering params
Definition: IndexIVF.h:44
void reset() override
removes all elements from the database.
Definition: IndexIVF.cpp:360
Index * clustering_index
to override index used during clustering
Definition: IndexIVF.h:45
void train_q1(size_t n, const float *x, bool verbose, MetricType metric_type)
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:57
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:62
void make_direct_map(bool new_maintain_direct_map=true)
Definition: IndexIVF.cpp:144
void print_stats() const
display some stats about the inverted lists
Definition: IndexIVF.cpp:440
InvertedLists * invlists
Access to the actual data.
Definition: IndexIVF.h:93
void add(idx_t n, const float *x) override
Calls add_with_ids with NULL ids.
Definition: IndexIVF.cpp:139
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVF.h:63
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:33
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
Definition: IndexIVF.cpp:368
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVF.h:99
virtual InvertedListScanner * get_InvertedListScanner(bool store_pairs=false) const
get a scanner for this index (store_pairs means ignore labels)
Definition: IndexIVF.h:167
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
Definition: IndexIVF.h:102
bool own_fields
whether object owns the quantizer
Definition: IndexIVF.h:42
virtual void set_query(const float *query_vector)=0
from now on we handle this query.
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
Definition: IndexIVF.cpp:170
virtual void merge_from(IndexIVF &other, idx_t add_id)
Definition: IndexIVF.cpp:472
size_t nlist
number of possible key values
Definition: IndexIVF.h:34
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:96
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:45