Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/IndexIVF.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 // -*- c++ -*-
11 
12 #ifndef FAISS_INDEX_IVF_H
13 #define FAISS_INDEX_IVF_H
14 
15 
16 #include <vector>
17 
18 
19 #include "Index.h"
20 #include "Clustering.h"
21 #include "Heap.h"
22 
23 
24 namespace faiss {
25 
26 
27 /** Encapsulates a quantizer object for the IndexIVF
28  *
29  * The class isolates the fields that are independent of the storage
30  * of the lists (especially training)
31  */
33  Index * quantizer; ///< quantizer that maps vectors to inverted lists
34  size_t nlist; ///< number of possible key values
35 
36  /**
37  * = 0: use the quantizer as index in a kmeans training
38  * = 1: just pass on the training set to the train() of the quantizer
39  * = 2: kmeans training on a flat index + add the centroids to the quantizer
40  */
42  bool own_fields; ///< whether object owns the quantizer
43 
44  ClusteringParameters cp; ///< to override default clustering params
45  Index *clustering_index; ///< to override index used during clustering
46 
47  /// Trains the quantizer and calls train_residual to train sub-quantizers
48  void train_q1 (size_t n, const float *x, bool verbose,
49  MetricType metric_type);
50 
52 
53  Level1Quantizer ();
54 
55  ~Level1Quantizer ();
56 
57 };
58 
59 
60 /** Index based on an inverted file (IVF)
61  *
62  * In the inverted file, the quantizer (an Index instance) provides a
63  * quantization index for each vector to be added. The quantization
64  * index maps to a list (aka inverted list or posting list), where the
65  * id of the vector is then stored.
66  *
67  * At search time, the vector to be searched is also quantized, and
68  * only the list corresponding to the quantization index is
69  * searched. This speeds up the search by making it
70  * non-exhaustive. This can be relaxed using multi-probe search: a few
71  * (nprobe) quantization indices are selected and several inverted
72  * lists are visited.
73  *
74  * Sub-classes implement a post-filtering of the index that refines
75  * the distance estimation from the query to database vectors.
76  */
78  size_t nprobe; ///< number of probes at query time
79  size_t max_codes; ///< max nb of codes to visit to do a query
80 
81  std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
82 
83  size_t code_size; ///< code size per vector in bytes
84  std::vector < std::vector<uint8_t> > codes; // binary codes, size nlist
85 
86  /// map for direct access to the elements. Enables reconstruct().
88  std::vector <long> direct_map;
89 
90  /** The Inverted file takes a quantizer (an Index) on input,
91  * which implements the function mapping a vector to a list
92  * identifier. The pointer is borrowed: the quantizer should not
93  * be deleted while the IndexIVF is in use.
94  */
95  IndexIVF (Index * quantizer, size_t d, size_t nlist,
96  MetricType metric = METRIC_L2);
97 
98  void reset() override;
99 
100  /// Trains the quantizer and calls train_residual to train sub-quantizers
101  void train(idx_t n, const float* x) override;
102 
103  /// Quantizes x and calls add_with_key
104  void add(idx_t n, const float* x) override;
105 
106  /// Sub-classes that encode the residuals can train their encoders here
107  /// does nothing by default
108  virtual void train_residual (idx_t n, const float *x);
109 
110 
111  /** search a set of vectors, that are pre-quantized by the IVF
112  * quantizer. Fill in the corresponding heaps with the query
113  * results. search() calls this.
114  *
115  * @param n nb of vectors to query
116  * @param x query vectors, size n * d
117  * @param assign coarse quantization indices, size n * nprobe
118  * @param centroid_dis
119  * distances to coarse centroids, size n * nprobe
120  * @param distances
121  * output distances, size n * k
122  * @param labels output labels, size n * k
123  * @param store_pairs store inv list index + inv list offset
124  * in upper/lower 32 bits of result,
125  * instead of ids (used for reranking).
126  */
127  virtual void search_preassigned (idx_t n, const float *x, idx_t k,
128  const idx_t *assign,
129  const float *centroid_dis,
130  float *distances, idx_t *labels,
131  bool store_pairs) const = 0;
132 
133  /** assign the vectors, then call search_preassigned */
134  virtual void search (idx_t n, const float *x, idx_t k,
135  float *distances, idx_t *labels) const override;
136 
137  void reconstruct (idx_t key, float* recons) const override;
138 
139  /** Reconstruct a subset of the indexed vectors.
140  *
141  * Overrides default implementation to bypass reconstruct() which requires
142  * direct_map to be maintained.
143  *
144  * @param i0 first vector to reconstruct
145  * @param ni nb of vectors to reconstruct
146  * @param recons output array of reconstructed vectors, size ni * d
147  */
148  void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
149 
150  /** Similar to search, but also reconstructs the stored vectors (or an
151  * approximation in the case of lossy coding) for the search results.
152  *
153  * Overrides default implementation to avoid having to maintain direct_map
154  * and instead fetch the code offsets through the `store_pairs` flag in
155  * search_preassigned().
156  *
157  * @param recons reconstructed vectors size (n, k, d)
158  */
159  void search_and_reconstruct (idx_t n, const float *x, idx_t k,
160  float *distances, idx_t *labels,
161  float *recons) const override;
162 
163  /** Reconstruct a vector given the location in terms of (inv list index +
164  * inv list offset) instead of the id.
165  *
166  * Useful for reconstructing when the direct_map is not maintained and
167  * the inv list offset is computed by search_preassigned() with
168  * `store_pairs` set.
169  */
170  virtual void reconstruct_from_offset (long list_no, long offset,
171  float* recons) const;
172 
173 
174  /// Dataset manipulation functions
175 
176  long remove_ids(const IDSelector& sel) override;
177 
178  /** moves the entries from another dataset to self. On output,
179  * other is empty. add_id is added to all moved ids (for
180  * sequential ids, this would be this->ntotal) */
181  virtual void merge_from (IndexIVF &other, idx_t add_id);
182 
183  /** copy a subset of the entries of this index to the other index
184  *
185  * if subset_type == 0: copies ids in [a1, a2)
186  * if subset_type == 1: copies ids if id % a1 == a2
187  * if subset_type == 2: copies inverted lists such that a1
188  * elements are left before and a2 elements are after
189  */
190  virtual void copy_subset_to (IndexIVF & other, int subset_type,
191  long a1, long a2) const;
192 
193  ~IndexIVF() override;
194 
195  size_t get_list_size (size_t list_no) const
196  { return ids[list_no].size(); }
197 
198  /** initialize a direct map
199  *
200  * @param new_maintain_direct_map if true, create a direct map,
201  * else clear it
202  */
203  void make_direct_map (bool new_maintain_direct_map=true);
204 
205  /// 1= perfectly balanced, >1: imbalanced
206  double imbalance_factor () const;
207 
208  /// display some stats about the inverted lists
209  void print_stats () const;
210 
211  IndexIVF ();
212 };
213 
214 
216  size_t nq; // nb of queries run
217  size_t nlist; // nb of inverted lists scanned
218  size_t ndis; // nb of distances computed
219 
220  IndexIVFStats () {reset (); }
221  void reset ();
222 };
223 
224 // global var that collects them all
225 extern IndexIVFStats indexIVF_stats;
226 
227 
228 
229 
230 
231 /** Inverted file with stored vectors. Here the inverted file
232  * pre-selects the vectors to be searched, but they are not otherwise
233  * encoded, the code array just contains the raw float entries.
234  */
236 
237  IndexIVFFlat (
238  Index * quantizer, size_t d, size_t nlist_,
239  MetricType = METRIC_L2);
240 
241  /// same as add_with_ids, with precomputed coarse quantizer
242  virtual void add_core (idx_t n, const float * x, const long *xids,
243  const long *precomputed_idx);
244 
245  /// implemented for all IndexIVF* classes
246  void add_with_ids(idx_t n, const float* x, const long* xids) override;
247 
248  void search_preassigned (idx_t n, const float *x, idx_t k,
249  const idx_t *assign,
250  const float *centroid_dis,
251  float *distances, idx_t *labels,
252  bool store_pairs) const override;
253 
254  void range_search(
255  idx_t n,
256  const float* x,
257  float radius,
258  RangeSearchResult* result) const override;
259 
260  /** Update a subset of vectors.
261  *
262  * The index must have a direct_map
263  *
264  * @param nv nb of vectors to update
265  * @param idx vector indices to update, size nv
266  * @param v vectors of new values, size nv*d
267  */
268  void update_vectors (int nv, idx_t *idx, const float *v);
269 
270  void reconstruct_from_offset (long list_no, long offset,
271  float* recons) const override;
272 
273  IndexIVFFlat () {}
274 };
275 
276 
277 
278 } // namespace faiss
279 
280 
281 
282 
283 
284 #endif
virtual void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs) const =0
double imbalance_factor() const
1= perfectly balanced, >1: imbalanced
Definition: IndexIVF.cpp:319
void search_and_reconstruct(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, float *recons) const override
Definition: IndexIVF.cpp:210
virtual void copy_subset_to(IndexIVF &other, int subset_type, long a1, long a2) const
Definition: IndexIVF.cpp:375
virtual void reconstruct_from_offset(long list_no, long offset, float *recons) const
Definition: IndexIVF.cpp:246
size_t nprobe
number of probes at query time
Definition: IndexIVF.h:78
void assign(idx_t n, const float *x, idx_t *labels, idx_t k=1)
Definition: Index.cpp:34
void range_search(idx_t n, const float *x, float radius, RangeSearchResult *result) const override
Definition: IndexIVF.cpp:659
void reconstruct(idx_t key, float *recons) const override
Definition: IndexIVF.cpp:180
virtual void train_residual(idx_t n, const float *x)
Definition: IndexIVF.cpp:311
std::vector< std::vector< long > > ids
Inverted lists for indexes.
Definition: IndexIVF.h:81
int d
vector dimension
Definition: Index.h:64
void train(idx_t n, const float *x) override
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:297
void reconstruct_n(idx_t i0, idx_t ni, float *recons) const override
Definition: IndexIVF.cpp:190
void add_with_ids(idx_t n, const float *x, const long *xids) override
implemented for all IndexIVF* classes
Definition: IndexIVF.cpp:463
long idx_t
all indices are this type
Definition: Index.h:62
ClusteringParameters cp
to override default clustering params
Definition: IndexIVF.h:44
void reconstruct_from_offset(long list_no, long offset, float *recons) const override
Definition: IndexIVF.cpp:753
void reset() override
removes all elements from the database.
Definition: IndexIVF.cpp:252
void update_vectors(int nv, idx_t *idx, const float *v)
Definition: IndexIVF.cpp:709
Index * clustering_index
to override index used during clustering
Definition: IndexIVF.h:45
void train_q1(size_t n, const float *x, bool verbose, MetricType metric_type)
Trains the quantizer and calls train_residual to train sub-quantizers.
Definition: IndexIVF.cpp:53
void make_direct_map(bool new_maintain_direct_map=true)
Definition: IndexIVF.cpp:139
void print_stats() const
display some stats about the inverted lists
Definition: IndexIVF.cpp:328
void add(idx_t n, const float *x) override
Quantizes x and calls add_with_key.
Definition: IndexIVF.cpp:134
Index * quantizer
quantizer that maps vectors to inverted lists
Definition: IndexIVF.h:33
long remove_ids(const IDSelector &sel) override
Dataset manipulation functions.
Definition: IndexIVF.cpp:263
size_t max_codes
max nb of codes to visit to do a query
Definition: IndexIVF.h:79
bool maintain_direct_map
map for direct access to the elements. Enables reconstruct().
Definition: IndexIVF.h:87
bool own_fields
whether object owns the quantizer
Definition: IndexIVF.h:42
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
Definition: IndexIVF.cpp:164
void search_preassigned(idx_t n, const float *x, idx_t k, const idx_t *assign, const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs) const override
Definition: IndexIVF.cpp:640
virtual void merge_from(IndexIVF &other, idx_t add_id)
Definition: IndexIVF.cpp:348
size_t nlist
number of possible key values
Definition: IndexIVF.h:34
size_t code_size
code size per vector in bytes
Definition: IndexIVF.h:83
MetricType
Some algorithms support both an inner product version and a L2 search version.
Definition: Index.h:43
virtual void add_core(idx_t n, const float *x, const long *xids, const long *precomputed_idx)
same as add_with_ids, with precomputed coarse quantizer
Definition: IndexIVF.cpp:468