Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/IndexIVF.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 // -*- c++ -*-
11 
12 #ifndef FAISS_INDEX_IVF_H
13 #define FAISS_INDEX_IVF_H
14 
15 
16 #include <vector>
17 
18 
19 #include "Index.h"
20 #include "Clustering.h"
21 #include "Heap.h"
22 
23 
24 namespace faiss {
25 
26 
27 /** Encapsulates a quantizer object for the IndexIVF
28  *
29  * The class isolates the fields that are independent of the storage
30  * of the lists (especially training)
31  */
33  Index * quantizer; ///< quantizer that maps vectors to inverted lists
34  size_t nlist; ///< number of possible key values
35 
36  /**
37  * = 0: use the quantizer as index in a kmeans training
38  * = 1: just pass on the training set to the train() of the quantizer
39  * = 2: kmeans training on a flat index + add the centroids to the quantizer
40  */
42  bool own_fields; ///< whether object owns the quantizer
43 
44  ClusteringParameters cp; ///< to override default clustering params
45  Index *clustering_index; ///< to override index used during clustering
46 
47  /// Trains the quantizer and calls train_residual to train sub-quantizers
48  void train_q1 (size_t n, const float *x, bool verbose,
49  MetricType metric_type);
50 
52 
53  Level1Quantizer ();
54 
55  ~Level1Quantizer ();
56 
57 };
58 
59 
60 /** Table of inverted lists
61  * multithreading rules:
62  * - concurrent read accesses are allowed
63  * - concurrent update accesses are allowed
64  * - for resize and add_entries, only concurrent access to different lists
65  * is allowed
66  */
67 struct InvertedLists {
68  typedef Index::idx_t idx_t;
69 
70  size_t nlist; ///< number of possible key values
71  size_t code_size; ///< code size per vector in bytes
72 
73  InvertedLists (size_t nlist, size_t code_size);
74 
75  /*************************
76  * Read-only functions */
77 
78  /// get the size (number of entries) of list list_no
79  virtual size_t list_size(size_t list_no) const = 0;
80 
81  /// @return codes of list list_no, size list_size * code_size bytes
82  virtual const uint8_t * get_codes (size_t list_no) const = 0;
83 
84  /// @return ids of list list_no, size list_size
85  virtual const idx_t * get_ids (size_t list_no) const = 0;
86 
87  /// @return the single id at position offset in list list_no
88  virtual idx_t get_single_id (size_t list_no, size_t offset) const;
89 
90  /// @return the single code at position offset in list list_no
91  virtual const uint8_t * get_single_code (
92  size_t list_no, size_t offset) const;
93 
94  /// prepare the following lists (default does nothing);
95  /// a list number can be -1, hence the signed long
96  virtual void prefetch_lists (const long *list_nos, int nlist) const;
97 
98  /*************************
99  * Writing functions */
100 
101  /// add one entry (id + code) to inverted list list_no
102  virtual size_t add_entry (size_t list_no, idx_t theid,
103  const uint8_t *code);
104 
105  virtual size_t add_entries (
106  size_t list_no, size_t n_entry,
107  const idx_t* ids, const uint8_t *code) = 0;
108 
109  virtual void update_entry (size_t list_no, size_t offset,
110  idx_t id, const uint8_t *code);
111 
112  virtual void update_entries (size_t list_no, size_t offset, size_t n_entry,
113  const idx_t *ids, const uint8_t *code) = 0;
114 
115  virtual void resize (size_t list_no, size_t new_size) = 0;
116 
117  virtual void reset ();
118 
119  virtual ~InvertedLists ();
120 };
121 
122 
124  std::vector < std::vector<uint8_t> > codes; // binary codes, size nlist
125  std::vector < std::vector<idx_t> > ids; ///< Inverted lists for indexes
126 
127  ArrayInvertedLists (size_t nlist, size_t code_size);
128 
129  size_t list_size(size_t list_no) const override;
130  const uint8_t * get_codes (size_t list_no) const override;
131  const idx_t * get_ids (size_t list_no) const override;
132 
133  size_t add_entries (
134  size_t list_no, size_t n_entry,
135  const idx_t* ids, const uint8_t *code) override;
136 
137  void update_entries (size_t list_no, size_t offset, size_t n_entry,
138  const idx_t *ids, const uint8_t *code) override;
139 
140  void resize (size_t list_no, size_t new_size) override;
141 
142  virtual ~ArrayInvertedLists ();
143 };
144 
145 
146 /** Index based on an inverted file (IVF)
147  *
148  * In the inverted file, the quantizer (an Index instance) provides a
149  * quantization index for each vector to be added. The quantization
150  * index maps to a list (aka inverted list or posting list), where the
151  * id of the vector is stored.
152  *
153  * The inverted list object is required only after training. If none is
154  * set externally, an ArrayInvertedLists is used automatically.
155  *
156  * At search time, the vector to be searched is also quantized, and
157  * only the list corresponding to the quantization index is
158  * searched. This speeds up the search by making it
159  * non-exhaustive. This can be relaxed using multi-probe search: a few
160  * (nprobe) quantization indices are selected and several inverted
161  * lists are visited.
162  *
163  * Sub-classes implement a post-filtering of the index that refines
164  * the distance estimation from the query to database vectors.
165  */
167  /// Access to the actual data
169  bool own_invlists;
170 
171  size_t code_size; ///< code size per vector in bytes
172 
173  size_t nprobe; ///< number of probes at query time
174  size_t max_codes; ///< max nb of codes to visit to do a query
175 
176  /// map for direct access to the elements. Enables reconstruct().
178  std::vector <long> direct_map;
179 
180  /** The Inverted file takes a quantizer (an Index) on input,
181  * which implements the function mapping a vector to a list
182  * identifier. The pointer is borrowed: the quantizer should not
183  * be deleted while the IndexIVF is in use.
184  */
185  IndexIVF (Index * quantizer, size_t d,
186  size_t nlist, size_t code_size,
187  MetricType metric = METRIC_L2);
188 
189  void reset() override;
190 
191  /// Trains the quantizer and calls train_residual to train sub-quantizers
192  void train(idx_t n, const float* x) override;
193 
194  /// Quantizes x and calls add_with_key
195  void add(idx_t n, const float* x) override;
196 
197  /// Sub-classes that encode the residuals can train their encoders here
198  /// does nothing by default
199  virtual void train_residual (idx_t n, const float *x);
200 
201  /** search a set of vectors, that are pre-quantized by the IVF
202  * quantizer. Fill in the corresponding heaps with the query
203  * results. search() calls this.
204  *
205  * @param n nb of vectors to query
206  * @param x query vectors, size n * d
207  * @param assign coarse quantization indices, size n * nprobe
208  * @param centroid_dis
209  * distances to coarse centroids, size n * nprobe
210  * @param distances
211  * output distances, size n * k
212  * @param labels output labels, size n * k
213  * @param store_pairs store inv list index + inv list offset
214  * in upper/lower 32 bits of result,
215  * instead of ids (used for reranking).
216  */
217  virtual void search_preassigned (idx_t n, const float *x, idx_t k,
218  const idx_t *assign,
219  const float *centroid_dis,
220  float *distances, idx_t *labels,
221  bool store_pairs) const = 0;
222 
223  /** assign the vectors, then call search_preassigned */
224  virtual void search (idx_t n, const float *x, idx_t k,
225  float *distances, idx_t *labels) const override;
226 
227  void reconstruct (idx_t key, float* recons) const override;
228 
229  /** Reconstruct a subset of the indexed vectors.
230  *
231  * Overrides default implementation to bypass reconstruct() which requires
232  * direct_map to be maintained.
233  *
234  * @param i0 first vector to reconstruct
235  * @param ni nb of vectors to reconstruct
236  * @param recons output array of reconstructed vectors, size ni * d
237  */
238  void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
239 
240  /** Similar to search, but also reconstructs the stored vectors (or an
241  * approximation in the case of lossy coding) for the search results.
242  *
243  * Overrides default implementation to avoid having to maintain direct_map
244  * and instead fetch the code offsets through the `store_pairs` flag in
245  * search_preassigned().
246  *
247  * @param recons reconstructed vectors size (n, k, d)
248  */
249  void search_and_reconstruct (idx_t n, const float *x, idx_t k,
250  float *distances, idx_t *labels,
251  float *recons) const override;
252 
253  /** Reconstruct a vector given the location in terms of (inv list index +
254  * inv list offset) instead of the id.
255  *
256  * Useful for reconstructing when the direct_map is not maintained and
257  * the inv list offset is computed by search_preassigned() with
258  * `store_pairs` set.
259  */
260  virtual void reconstruct_from_offset (long list_no, long offset,
261  float* recons) const;
262 
263 
264  /// Dataset manipulation functions
265 
266  long remove_ids(const IDSelector& sel) override;
267 
268  /** moves the entries from another dataset to self. On output,
269  * other is empty. add_id is added to all moved ids (for
270  * sequential ids, this would be this->ntotal) */
271  virtual void merge_from (IndexIVF &other, idx_t add_id);
272 
273  /** copy a subset of the entries index to the other index
274  *
275  * if subset_type == 0: copies ids in [a1, a2)
276  * if subset_type == 1: copies ids if id % a1 == a2
277  * if subset_type == 2: copies inverted lists such that a1
278  * elements are left before and a2 elements are after
279  */
280  virtual void copy_subset_to (IndexIVF & other, int subset_type,
281  long a1, long a2) const;
282 
283  ~IndexIVF() override;
284 
285  size_t get_list_size (size_t list_no) const
286  { return invlists->list_size(list_no); }
287 
288  /** initialize a direct map
289  *
290  * @param new_maintain_direct_map if true, create a direct map,
291  * else clear it
292  */
293  void make_direct_map (bool new_maintain_direct_map=true);
294 
295  /// 1= perfectly balanced, >1: imbalanced
296  double imbalance_factor () const;
297 
298  /// display some stats about the inverted lists
299  void print_stats () const;
300 
301  void replace_invlists (InvertedLists *il, bool own=false);
302 
303  IndexIVF ();
304 };
305 
306 
308  size_t nq; // nb of queries run
309  size_t nlist; // nb of inverted lists scanned
310  size_t ndis; // nb of distances computed
311 
312  IndexIVFStats () {reset (); }
313  void reset ();
314 };
315 
316 // global var that collects them all
317 extern IndexIVFStats indexIVF_stats;
318 
319 
320 
321 } // namespace faiss
322 
323 
324 
325 
326 
327 #endif
virtual void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs) const =0
const uint8_t * get_codes(size_t list_no) const override
Definition: IndexIVF.cpp:185
const idx_t * get_ids(size_t list_no) const override
Definition: IndexIVF.cpp:191
virtual const idx_t * get_ids(size_t list_no) const =0
double imbalance_factor() const
1= perfectly balanced, >1: imbalanced
Definition: IndexIVF.cpp:447
void search_and_reconstruct(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, float *recons) const override
Definition: IndexIVF.cpp:332
virtual void copy_subset_to(IndexIVF &other, int subset_type, long a1, long a2) const
Definition: IndexIVF.cpp:527
virtual void reconstruct_from_offset(long list_no, long offset, float *recons) const
Definition: IndexIVF.cpp:370
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:173
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:34
virtual size_t list_size(size_t list_no) const =0
get the size of a list
void reconstruct(idx_t key, float *recons) const override
Definition: IndexIVF.cpp:301
virtual void train_residual(idx_t n, const float *x)
Definition: IndexIVF.cpp:439
virtual idx_t get_single_id(size_t list_no, size_t offset) const
Definition: IndexIVF.cpp:118
int d
vector dimension
Definition: Index.h:64
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:71
void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:424
virtual const uint8_t * get_single_code(size_t list_no, size_t offset) const
Definition: IndexIVF.cpp:129
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
Definition: IndexIVF.cpp:311
virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t *code)
add one entry to an inverted list
Definition: IndexIVF.cpp:136
long idx_t
all indices are this type
Definition: Index.h:62
ClusteringParameters cp
to override default clustering params
Definition: IndexIVF.h:44
void reset() override
removes all elements from the database.
Definition: IndexIVF.cpp:376
virtual void prefetch_lists(const long *list_nos, int nlist) const
Definition: IndexIVF.cpp:126
Index * clustering_index
to override index used during clustering
Definition: IndexIVF.h:45
void train_q1(size_t n, const float *x, bool verbose, MetricType metric_type)
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:56
size_t nlist
number of possible key values
Definition: IndexIVF.h:70
void make_direct_map(bool new_maintain_direct_map=true)
Definition: IndexIVF.cpp:257
void print_stats() const
display some stats about the inverted lists
Definition: IndexIVF.cpp:456
InvertedLists * invlists
Access to the actual data.
Definition: IndexIVF.h:168
std::vector< std::vector< idx_t > > ids
Inverted lists for indexes.
Definition: IndexIVF.h:125
void add(idx_t n, const float *x) override
Quantizes x and calls add_with_key.
Definition: IndexIVF.cpp:252
virtual const uint8_t * get_codes(size_t list_no) const =0
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:33
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
Definition: IndexIVF.cpp:384
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVF.h:174
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
Definition: IndexIVF.h:177
bool own_fields
whether object owns the quantizer
Definition: IndexIVF.h:42
size_t list_size(size_t list_no) const override
get the size of a list
Definition: IndexIVF.cpp:179
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
Definition: IndexIVF.cpp:283
virtual void merge_from(IndexIVF &other, idx_t add_id)
Definition: IndexIVF.cpp:476
size_t nlist
number of possible key values
Definition: IndexIVF.h:34
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:171
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:43