Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/hoss/faiss/AuxIndexStructures.h
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 // -*- c++ -*-
9 
10 // Auxiliary index structures, that are used in indexes but that can
11 // be forward-declared
12 
13 #ifndef FAISS_AUX_INDEX_STRUCTURES_H
14 #define FAISS_AUX_INDEX_STRUCTURES_H
15 
16 #include <stdint.h>
17 
18 #include <vector>
19 #include <unordered_set>
20 #include <memory>
21 
22 
23 #include "Index.h"
24 
25 namespace faiss {
26 
27 /** The objective is to have a simple result structure while
28  * minimizing the number of memory copies in the result. The method
29  * do_allocation can be overridden to allocate the result tables in
30  * the matrix type of a scripting language like Lua or Python. */
32  size_t nq; ///< nb of queries
33  size_t *lims; ///< size (nq + 1)
34 
35  typedef Index::idx_t idx_t;
36 
37  idx_t *labels; ///< result for query i is labels[lims[i]:lims[i+1]]
38  float *distances; ///< corresponding distances (not sorted)
39 
40  size_t buffer_size; ///< size of the result buffers used
41 
42  /// lims must be allocated on input to range_search.
43  explicit RangeSearchResult (idx_t nq, bool alloc_lims=true);
44 
45  /// called when lims contains the nb of elements result entries
46  /// for each query
47  virtual void do_allocation ();
48 
49  virtual ~RangeSearchResult ();
50 };
51 
52 
53 /** Encapsulates a set of ids to remove. */
54 struct IDSelector {
55  typedef Index::idx_t idx_t; ///< local alias for Index::idx_t
56  virtual bool is_member (idx_t id) const = 0; ///< pure virtual membership test; NOTE(review): presumably true means id belongs to the set to remove -- confirm against callers
57  virtual ~IDSelector() {} ///< virtual so that subclasses can be destroyed through an IDSelector pointer
58 };
59 
60 
61 
62 /** remove ids between [imin, imax) */
64  idx_t imin, imax;
65 
66  IDSelectorRange (idx_t imin, idx_t imax);
67  bool is_member(idx_t id) const override;
68  ~IDSelectorRange() override {}
69 };
70 
71 
72 /** Remove ids from a set. Repetitions of ids in the indices set
73  * passed to the constructor do not hurt performance. The hash
74  * function used for the bloom filter and by GCC's implementation of
75  * unordered_set is just the least significant bits of the id. This
76  * works fine for random ids or ids in sequences but will produce many
77  * hash collisions if the lsb's are always the same */
79 
80  std::unordered_set<idx_t> set;
81 
82  typedef unsigned char uint8_t;
83  std::vector<uint8_t> bloom; // assumes low bits of id are a good hash value
84  int nbits;
85  idx_t mask;
86 
87  IDSelectorBatch (long n, const idx_t *indices);
88  bool is_member(idx_t id) const override;
89  ~IDSelectorBatch() override {}
90 };
91 
92 /****************************************************************
93  * Result structures for range search.
94  *
95  * The main constraint here is that we want to support parallel
96  * queries from different threads in various ways: 1 thread per query,
97  * several threads per query. We store the actual results in blocks of
98  * fixed size rather than exponentially increasing memory. At the end,
99  * we copy the block content to a linear result array.
100  *****************************************************************/
101 
102 /** List of temporary buffers used to store results before they are
103  * copied to the RangeSearchResult object. */
104 struct BufferList {
105  typedef Index::idx_t idx_t; ///< local alias for Index::idx_t
106 
107  // size of each buffer, in number of entries
108  size_t buffer_size;
109 
110  struct Buffer {
111  idx_t *ids; ///< result ids; NOTE(review): presumably buffer_size entries -- confirm in the implementation
112  float *dis; ///< result distances; presumably parallel to ids -- confirm in the implementation
113  };
114 
115  std::vector<Buffer> buffers;
116  size_t wp; ///< write pointer in the last buffer.
117 
118  explicit BufferList (size_t buffer_size);
119 
120  ~BufferList ();
121 
122  /// create a new buffer
123  void append_buffer ();
124 
125  /// add one result, possibly appending a new buffer if needed
126  void add (idx_t id, float dis);
127 
128  /// copy elements ofs:ofs+n-1, seen as linear data in the buffers, to
129  /// tables dest_ids, dest_dis
130  void copy_range (size_t ofs, size_t n,
131  idx_t * dest_ids, float *dest_dis);
132 
133 };
134 
136 
137 /// result structure for a single query
139  using idx_t = Index::idx_t;
140  idx_t qno; //< id of the query
141  size_t nres; //< nb of results for this query
143 
144  /// called by search function to report a new result
145  void add (float dis, idx_t id);
146 };
147 
148 /// the entries in the buffers are split per query
150  RangeSearchResult * res;
151 
152  /// eventually the result will be stored in res_in
153  explicit RangeSearchPartialResult (RangeSearchResult * res_in);
154 
155  /// query ids + nb of results per query.
156  std::vector<RangeQueryResult> queries;
157 
158  /// begin a new result
159  RangeQueryResult & new_result (idx_t qno);
160 
161  /*****************************************
162  * functions used at the end of the search to merge the result
163  * lists */
164  void finalize ();
165 
166  /// called by range_search before do_allocation
167  void set_lims ();
168 
169  /// called by range_search after do_allocation
170  void copy_result (bool incremental = false);
171 
172  /// merge a set of PartialResult's into one RangeSearchResult
173  /// on output the partialresults are empty!
174  static void merge (std::vector <RangeSearchPartialResult *> &
175  partial_results, bool do_delete=true);
176 
177 };
178 
179 /***********************************************************
180  * Abstract I/O objects
181  ***********************************************************/
182 
183 struct IOReader {
184  // name that can be used in error messages
185  std::string name;
186 
187  // fread-like read into ptr (pure virtual); NOTE(review): return value presumably mirrors fread, i.e. number of items read -- confirm in implementations
188  virtual size_t operator()(
189  void *ptr, size_t size, size_t nitems) = 0;
190 
191  // return a file number that can be memory-mapped (virtual, not pure: the default is defined outside this listing)
192  virtual int fileno ();
193 
194  virtual ~IOReader() {} // virtual so that readers can be destroyed through an IOReader pointer
195 };
196 
197 struct IOWriter {
198  // name that can be used in error messages
199  std::string name;
200 
201  // fwrite-like write from ptr (pure virtual); NOTE(review): return value presumably mirrors fwrite, i.e. number of items written -- confirm in implementations
202  virtual size_t operator()(
203  const void *ptr, size_t size, size_t nitems) = 0;
204 
205  // return a file number that can be memory-mapped (virtual, not pure: the default is defined outside this listing)
206  virtual int fileno ();
207 
208  virtual ~IOWriter() {} // virtual so that writers can be destroyed through an IOWriter pointer
209 };
210 
211 
213  std::vector<uint8_t> data;
214  size_t rp = 0;
215  size_t operator()(void *ptr, size_t size, size_t nitems) override;
216 };
217 
219  std::vector<uint8_t> data;
220  size_t operator()(const void *ptr, size_t size, size_t nitems) override;
221 };
222 
223 /***********************************************************
224  * The distance computer maintains a current query and computes
225  * distances to elements in an index that supports random access.
226  *
227  * The DistanceComputer is not intended to be thread-safe (e.g. because
228  * it maintains counters) so the distance functions are not const,
229  * instantiate one from each thread if needed.
230  ***********************************************************/
232  using idx_t = Index::idx_t;
233 
234  /// called before computing distances
235  virtual void set_query(const float *x) = 0;
236 
237  /// compute distance of vector i to current query
238  virtual float operator () (idx_t i) = 0;
239 
240  /// compute distance between two stored vectors
241  virtual float symmetric_dis (idx_t i, idx_t j) = 0;
242 
243  virtual ~DistanceComputer() {}
244 };
245 
246 /***********************************************************
247  * Interrupt callback
248  ***********************************************************/
249 
251  virtual bool want_interrupt () = 0;
252  virtual ~InterruptCallback() {}
253 
254  static std::unique_ptr<InterruptCallback> instance;
255 
256  /** check if:
257  * - an interrupt callback is set
258  * - the callback returns true
259  * if this is the case, then throw an exception
260  */
261  static void check ();
262 
263  /// same as check() but return true if is interrupted instead of
264  /// throwing
265  static bool is_interrupted ();
266 
267  /** assuming each iteration takes a certain number of flops, what
268  * is a reasonable interval to check for interrupts?
269  */
270  static size_t get_period_hint (size_t flops);
271 
272 };
273 
274 
275 
276 }; // namespace faiss
277 
278 
279 
280 #endif
virtual float symmetric_dis(idx_t i, idx_t j)=0
compute distance between two stored vectors
std::vector< RangeQueryResult > queries
query ids + nb of results per query.
size_t nq
nb of queries
virtual void set_query(const float *x)=0
called before computing distances
result structure for a single query
void append_buffer()
create a new buffer
void copy_range(size_t ofs, size_t n, idx_t *dest_ids, float *dest_dis)
void set_lims()
called by range_search before do_allocation
virtual float operator()(idx_t i)=0
compute distance of vector i to current query
RangeSearchResult(idx_t nq, bool alloc_lims=true)
lims must be allocated on input to range_search.
long idx_t
all indices are this type
Definition: Index.h:62
size_t wp
write pointer in the last buffer.
float * distances
corresponding distances (not sorted)
void add(idx_t id, float dis)
add one result, possibly appending a new buffer if needed
the entries in the buffers are split per query
void copy_result(bool incremental=false)
called by range_search after do_allocation
static void merge(std::vector< RangeSearchPartialResult * > &partial_results, bool do_delete=true)
RangeQueryResult & new_result(idx_t qno)
begin a new result
size_t buffer_size
size of the result buffers used
static size_t get_period_hint(size_t flops)
size_t * lims
size (nq + 1)
void add(float dis, idx_t id)
called by search function to report a new result
idx_t * labels
result for query i is labels[lims[i]:lims[i+1]]
RangeSearchPartialResult(RangeSearchResult *res_in)
eventually the result will be stored in res_in