Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/IndexHNSW.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 #pragma once
10 
11 #include <vector>
12 #include <omp.h>
13 
14 #include "IndexFlat.h"
15 #include "IndexPQ.h"
16 #include "IndexScalarQuantizer.h"
17 #include "utils.h"
18 
19 
20 namespace faiss {
21 
22 /** Implementation of the Hierarchical Navigable Small World
23  * datastructure.
24  *
25  * Efficient and robust approximate nearest neighbor search using
26  * Hierarchical Navigable Small World graphs
27  *
28  * Yu. A. Malkov, D. A. Yashunin, arXiv 2017
29  *
30  * This implementation is heavily influenced by the NMSlib
31  * implementation by Yury Malkov and Leonid Boytsov
32  * (https://github.com/searchivarius/nmslib)
33  *
34  * The HNSW object stores only the neighbor link structure, see
35  * IndexHNSW below for the full index object.
36  */
37 
38 struct VisitedTable;
39 
/// Neighbor link structure of an HNSW graph. As the comments below state, it
/// does not store vectors itself; it reaches them through a DistanceComputer.
40 struct HNSW {
41 
42  /// internal storage of vectors (32 bits: this is expensive)
43  typedef int storage_idx_t;
44 
45  /// Faiss results are 64-bit
 // NOTE(review): listing line 46 is absent from this extract; presumably it is
 // the idx_t typedef that the comment above documents -- confirm against the
 // real header.
47 
48  /** The HNSW structure does not store vectors, it only accesses
49  * them through this class.
50  *
51  * Functions are guaranteed to be accessed only from 1 thread. */
 // NOTE(review): listing line 52 (the opening of the nested type) is absent
 // from this extract; the destructor below shows it is named DistanceComputer.
53 
54  idx_t d;
55 
56  /// called before computing distances
57  virtual void set_query (const float *x) = 0;
58 
59  /// compute distance of vector i to current query
60  virtual float operator () (storage_idx_t i) = 0;
61 
62  /// compute distance between two stored vectors
63  virtual float symmetric_dis(storage_idx_t i, storage_idx_t j) = 0;
64 
65  virtual ~DistanceComputer () {}
66 
67  };
68 
69  /// assignment probability to each layer (sum=1)
70  std::vector<double> assign_probas;
71 
72  /// number of neighbors stored per layer (cumulative), should not
73  /// be changed after first add
74  std::vector<int> cum_nneighbor_per_level;
75 
76  /// level of each vector (base level = 1), size = ntotal
77  std::vector<int> levels;
78 
79  /// offsets[i] is the offset in the neighbors array where vector i is stored
80  /// size ntotal + 1
81  std::vector<size_t> offsets;
82 
83  /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i
84  /// for all levels. this is where all storage goes.
85  std::vector<storage_idx_t> neighbors;
86 
87  /// entry point in the search structure (one of the points with maximum level)
 // NOTE(review): listing line 88 is absent from this extract; presumably it
 // declares the entry_point member described above -- confirm.
89 
 // NOTE(review): listing line 90 is also absent from this extract.
91 
92  /// maximum level
93  int max_level;
94 
95  /// expansion factor at construction time
 // NOTE(review): listing line 96 is absent from this extract; presumably it
 // declares the efConstruction member described above -- confirm.
97 
98  /// expansion factor at search time
99  int efSearch;
100 
101  /// during search: do we check whether the next best distance is good enough?
 // NOTE(review): listing line 102 is absent from this extract; presumably it
 // declares the check_relative_distance flag described above -- confirm.
103 
104  /// number of entry points in levels > 0.
 // NOTE(review): listing line 105 is absent from this extract; presumably it
 // declares the upper_beam member described above -- confirm.
106 
107  // methods that initialize the tree sizes
108 
109  /// initialize the assign_probas and cum_nneighbor_per_level to
110  /// have 2*M links on level 0 and M links on levels > 0
111  void set_default_probas(int M, float levelMult);
112 
113  /// set nb of neighbors for this level (before adding anything)
114  void set_nb_neighbors(int level_no, int n);
115 
116  // methods that access the tree sizes
117 
118  /// nb of neighbors for this level
119  int nb_neighbors(int layer_no) const;
120 
121  /// cumulative nb up to (and excluding) this level
122  int cum_nb_neighbors(int layer_no) const;
123 
124  /// range of entries in the neighbors table of vertex no at layer_no
 /// (presumably returned through the begin / end pointers -- confirm)
125  void neighbor_range(idx_t no, int layer_no,
126  size_t * begin, size_t * end) const;
127 
128  /// only mandatory parameter: nb of neighbors
129  explicit HNSW(int M = 32);
130 
131  /// pick a random level for a new point
132  int random_level();
133 
134  /// add n random levels to table (for debugging...)
135  void fill_with_random_links(size_t n);
136 
137  /** add point pt_id on all levels <= pt_level and build the link
138  * structure for them. */
139  void add_with_locks(DistanceComputer & ptdis, int pt_level, int pt_id,
140  std::vector<omp_lock_t> & locks,
141  VisitedTable &vt);
142 
143 
144  /// search interface
 /// (presumably k results are written to I (labels) and D (distances) for
 /// the query held by qdis -- TODO confirm against the implementation)
145  void search(DistanceComputer & qdis, int k,
146  idx_t *I, float * D,
147  VisitedTable &vt) const;
148 
149  void reset();
150 
151  void clear_neighbor_tables(int level);
152  void print_neighbor_stats(int level) const;
153 };
154 
155 
/// Plain bundle of statistics counters. The constructor calls reset(), whose
/// definition is not part of this header.
156 struct HNSWStats {
157  size_t n1, n2, n3; // NOTE(review): meaning not documented here -- check the .cpp
158  size_t ndis; // presumably the number of distance computations -- TODO confirm
159  size_t nreorder; // presumably the number of reordered results -- TODO confirm
160  bool view; // NOTE(review): purpose not documented here
161 
 /// Puts the object into the state defined by reset().
162  HNSWStats () {reset (); }
 /// Declared only; see the implementation file for what is reset.
163  void reset ();
164 };
165 
166 // global var that collects them all
// (declaration only: the HNSWStats object itself is defined in another
// translation unit)
167 extern HNSWStats hnsw_stats;
168 
// Forward declaration. NOTE(review): declared with the `class` key here but
// defined with `struct` further down in this header; some compilers warn
// about the mismatched tag.
169 class IndexHNSW;
170 
// NOTE(review): listing line 171 (the declaration line of this type) is absent
// from this extract; the constructor below shows the type is named
// ReconstructFromNeighbors.
172  typedef Index::idx_t idx_t;
173  typedef HNSW::storage_idx_t storage_idx_t;
174 
 // index this object is attached to (held by reference, not owned)
175  const IndexHNSW & index;
176  size_t M; // number of neighbors
177  size_t k; // number of codebook entries
178  size_t nsq; // number of subvectors
179  size_t code_size;
180  int k_reorder; // nb to reorder. -1 = all
181 
182  std::vector<float> codebook; // size nsq * k * (M + 1)
183 
184  std::vector<uint8_t> codes; // size ntotal * code_size
185  size_t ntotal;
186  size_t d, dsub; // derived values
187 
 /// k and nsq default to 256 codebook entries and a single subvector
188  ReconstructFromNeighbors(const IndexHNSW & index,
189  size_t k=256, size_t nsq=1);
190 
191  /// codes must be added in the correct order and the IndexHNSW
192  /// must be populated and sorted
193  void add_codes(size_t n, const float *x);
194 
 /// NOTE(review): meaning of the size_t return value is not documented
 /// here -- check the implementation
195  size_t compute_distances(size_t n, const idx_t *shortlist,
196  const float *query, float *distances) const;
197 
198  /// called by add_codes
199  void estimate_code(const float *x, storage_idx_t i, uint8_t *code) const;
200 
201  /// called by compute_distances
202  void reconstruct(storage_idx_t i, float *x, float *tmp) const;
203 
204  void reconstruct_n(storage_idx_t n0, storage_idx_t ni, float *x) const;
205 
206  /// get the M+1 -by-d table for neighbor coordinates for vector i
207  void get_neighbor_table(storage_idx_t i, float *out) const;
208 
209 };
210 
211 
212 /** The HNSW index is a normal random-access index with a HNSW
213  * link structure built on top */
214 
// Abstract: get_distance_computer() is pure virtual, so only subclasses that
// override it can be instantiated.
215 struct IndexHNSW: Index {
216 
217  typedef HNSW::storage_idx_t storage_idx_t;
218 
219  // the link structure
220  HNSW hnsw;
221 
222  // the sequential storage
 // (own_fields presumably tells whether storage is owned and released by the
 // destructor -- TODO confirm against the implementation)
223  bool own_fields;
224  Index * storage;
225 
 // NOTE(review): ownership of this pointer is not documented here
226  ReconstructFromNeighbors *reconstruct_from_neighbors;
227 
228  explicit IndexHNSW (int d = 0, int M = 32);
229  explicit IndexHNSW (Index * storage, int M = 32);
230 
231  ~IndexHNSW() override;
232 
233  // get a DistanceComputer object for this kind of storage
234  virtual HNSW::DistanceComputer * get_distance_computer() const = 0;
235 
236  void add(idx_t n, const float *x) override;
237 
238  /// Trains the storage if needed
239  void train(idx_t n, const float* x) override;
240 
241  /// entry point for search
242  void search (idx_t n, const float *x, idx_t k,
243  float *distances, idx_t *labels) const override;
244 
245  void reconstruct(idx_t key, float* recons) const override;
246 
247  void reset () override;
248 
249  void shrink_level_0_neighbors(int size);
250 
251  /** Perform search only on level 0, given the starting points for
252  * each vertex.
253  *
254  * @param search_type 1:perform one search per nprobe, 2: enqueue
255  * all entry points
256  */
257  void search_level_0(idx_t n, const float *x, idx_t k,
258  const storage_idx_t *nearest, const float *nearest_d,
259  float *distances, idx_t *labels, int nprobe = 1,
260  int search_type = 1) const;
261 
262  /// alternative graph building
 // NOTE(review): listing line 263 (first line of this declaration) is absent
 // from this extract; presumably it is init_level_0_from_knngraph -- confirm.
264  int k, const float *D, const idx_t *I);
265 
266  /// alternative graph building
 // NOTE(review): listing line 267 (first line of this declaration) is absent
 // from this extract; presumably it is init_level_0_from_entry_points --
 // confirm.
268  int npt, const storage_idx_t *points,
269  const storage_idx_t *nearests);
270 
271  // reorder links from nearest to farthest
272  void reorder_links();
273 
274  void link_singletons();
275 };
276 
277 
278 
279 /** Flat index topped with a HNSW structure to access elements
280  * more efficiently.
281  */
282 
// NOTE(review): listing line 283 (the declaration line of this type) is absent
// from this extract; the constructors below show it is named IndexHNSWFlat.
284  IndexHNSWFlat();
285  IndexHNSWFlat(int d, int M);
 // supplies the distance computer for this storage type
286  HNSW::DistanceComputer * get_distance_computer() const override;
287 };
288 
289 /** PQ index topped with a HNSW structure to access elements
290  * more efficiently.
291  */
// NOTE(review): listing line 292 (the declaration line of this type) is absent
// from this extract; the constructors below show it is named IndexHNSWPQ.
293  IndexHNSWPQ();
294  IndexHNSWPQ(int d, int pq_m, int M);
 // overrides the base-class training entry point
295  void train(idx_t n, const float* x) override;
 // supplies the distance computer for this storage type
296  HNSW::DistanceComputer * get_distance_computer() const override;
297 };
298 
299 /** SQ index topped with a HNSW structure to access elements
300  * more efficiently.
301  */
// NOTE(review): listing line 302 (the declaration line of this type) is absent
// from this extract; the constructors below show it is named IndexHNSWSQ.
303  IndexHNSWSQ();
304  IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M);
 // supplies the distance computer for this storage type
305  HNSW::DistanceComputer * get_distance_computer() const override;
306 };
307 
308 /** 2-level code structure with fast random access
309  */
// NOTE(review): listing line 310 (the declaration line of this type) is absent
// from this extract; the constructors below show it is named IndexHNSW2Level.
311  IndexHNSW2Level();
312  IndexHNSW2Level(Index *quantizer, size_t nlist, int m_pq, int M);
 // supplies the distance computer for this storage type
313  HNSW::DistanceComputer * get_distance_computer() const override;
 // NOTE(review): undocumented here; presumably switches the storage to an
 // inverted-file layout -- check the implementation
314  void flip_to_ivf();
315 
316  /// entry point for search
317  void search (idx_t n, const float *x, idx_t k,
318  float *distances, idx_t *labels) const override;
319 
320 };
321 
322 
323 
324 };
random generator that can be used in multithreaded contexts
Definition: utils.h:48
void add_with_locks(DistanceComputer &ptdis, int pt_level, int pt_id, std::vector< omp_lock_t > &locks, VisitedTable &vt)
Definition: IndexHNSW.cpp:657
void neighbor_range(idx_t no, int layer_no, size_t *begin, size_t *end) const
range of entries in the neighbors table of vertex no at layer_no
Definition: IndexHNSW.cpp:517
void train(idx_t n, const float *x) override
Trains the storage if needed.
Definition: IndexHNSW.cpp:1733
virtual float operator()(storage_idx_t i)=0
compute distance of vector i to current query
int nb_neighbors(int layer_no) const
nb of neighbors for this level
Definition: IndexHNSW.cpp:497
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
entry point for search
Definition: IndexHNSW.cpp:2071
storage_idx_t entry_point
entry point in the search structure (one of the points with maximum level
Definition: IndexHNSW.h:88
int cum_nb_neighbors(int layer_no) const
cumulative nb up to (and excluding) this level
Definition: IndexHNSW.cpp:512
virtual float symmetric_dis(storage_idx_t i, storage_idx_t j)=0
compute distance between two stored vectors
std::vector< double > assign_probas
assignment probability to each layer (sum=1)
Definition: IndexHNSW.h:70
std::vector< int > cum_nneighbor_per_level
Definition: IndexHNSW.h:74
void get_neighbor_table(storage_idx_t i, float *out) const
get the M+1 -by-d table for neighbor coordinates for vector i
Definition: IndexHNSW.cpp:1512
void train(idx_t n, const float *x) override
Trains the storage if needed.
Definition: IndexHNSW.cpp:978
int d
vector dimension
Definition: Index.h:64
std::vector< size_t > offsets
Definition: IndexHNSW.h:81
set implementation optimized for fast access.
Definition: IndexHNSW.cpp:54
void add(idx_t n, const float *x) override
Definition: IndexHNSW.cpp:1031
void reconstruct(storage_idx_t i, float *x, float *tmp) const
called by compute_distances
Definition: IndexHNSW.cpp:1378
int efSearch
expansion factor at search time
Definition: IndexHNSW.h:99
long idx_t
all indices are this type
Definition: Index.h:62
virtual void set_query(const float *x)=0
called before computing distances
bool check_relative_distance
during search: do we check whether the next best distance is good enough?
Definition: IndexHNSW.h:102
HNSW(int M=32)
only mandatory parameter: nb of neighbors
Definition: IndexHNSW.cpp:527
int upper_beam
number of entry points in levels > 0.
Definition: IndexHNSW.h:105
void set_nb_neighbors(int level_no, int n)
set nb of neighbors for this level (before adding anything)
Definition: IndexHNSW.cpp:503
void estimate_code(const float *x, storage_idx_t i, uint8_t *code) const
called by add_codes
Definition: IndexHNSW.cpp:1531
faiss::Index::idx_t idx_t
Faiss results are 64-bit.
Definition: IndexHNSW.h:46
void add_codes(size_t n, const float *x)
Definition: IndexHNSW.cpp:1572
int random_level()
pick a random level for a new point
Definition: IndexHNSW.cpp:539
void set_default_probas(int M, float levelMult)
Definition: IndexHNSW.cpp:553
void reset() override
removes all elements from the database.
Definition: IndexHNSW.cpp:1043
void search(DistanceComputer &qdis, int k, idx_t *I, float *D, VisitedTable &vt) const
search interface
Definition: IndexHNSW.cpp:711
void init_level_0_from_entry_points(int npt, const storage_idx_t *points, const storage_idx_t *nearests)
alternative graph building
Definition: IndexHNSW.cpp:1200
void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const override
entry point for search
Definition: IndexHNSW.cpp:985
void init_level_0_from_knngraph(int k, const float *D, const idx_t *I)
alternative graph building
Definition: IndexHNSW.cpp:1162
void search_level_0(idx_t n, const float *x, idx_t k, const storage_idx_t *nearest, const float *nearest_d, float *distances, idx_t *labels, int nprobe=1, int search_type=1) const
Definition: IndexHNSW.cpp:1092
void fill_with_random_links(size_t n)
add n random levels to table (for debugging...)
Definition: IndexHNSW.cpp:913
std::vector< storage_idx_t > neighbors
Definition: IndexHNSW.h:85
void reconstruct(idx_t key, float *recons) const override
Definition: IndexHNSW.cpp:1050
int efConstruction
expansion factor at construction time
Definition: IndexHNSW.h:96
int storage_idx_t
internal storage of vectors (32 bits: this is expensive)
Definition: IndexHNSW.h:43
std::vector< int > levels
level of each vector (base level = 1), size = ntotal
Definition: IndexHNSW.h:77
int max_level
maximum level
Definition: IndexHNSW.h:93