Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/tmp/faiss/HNSW.h
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // -*- c++ -*-
10 
11 #pragma once
12 
13 #include <vector>
14 #include <unordered_set>
15 #include <queue>
16 
17 #include <omp.h>
18 
19 #include "Index.h"
20 #include "FaissAssert.h"
21 #include "utils.h"
22 
23 
24 namespace faiss {
25 
26 
27 /** Implementation of the Hierarchical Navigable Small World
28  * datastructure.
29  *
30  * Efficient and robust approximate nearest neighbor search using
31  * Hierarchical Navigable Small World graphs
32  *
33  * Yu. A. Malkov, D. A. Yashunin, arXiv 2017
34  *
35  * This implementation is heavily influenced by the NMSlib
36  * implementation by Yury Malkov and Leonid Boytsov
37  * (https://github.com/searchivarius/nmslib)
38  *
39  * The HNSW object stores only the neighbor link structure, see
40  * IndexHNSW below for the full index object.
41  */
42 
43 
44 struct VisitedTable;
45 
46 
47 struct HNSW {
48  /// internal storage of vectors (32 bits: this is expensive)
49  typedef int storage_idx_t;
50 
51  /// Faiss results are 64-bit
53 
54  typedef std::pair<float, storage_idx_t> Node;
55 
56  /** The HNSW structure does not store vectors, it only accesses
57  * them through this class.
58  *
59  * Functions are guaranteed to be accessed only from 1 thread. */
61  idx_t d;
62 
63  /// called before computing distances
64  virtual void set_query(const float *x) = 0;
65 
66  /// compute distance of vector i to current query
67  virtual float operator () (storage_idx_t i) = 0;
68 
69  /// compute distance between two stored vectors
70  virtual float symmetric_dis(storage_idx_t i, storage_idx_t j) = 0;
71 
72  virtual ~DistanceComputer() {}
73  };
74 
75 
76  /** Heap structure that allows fast
77  */
78  struct MinimaxHeap {
79  int n;
80  int k;
81 
82  std::vector<storage_idx_t> ids;
83  std::vector<float> dis;
85 
86  explicit MinimaxHeap(int n): n(n), k(0), ids(n), dis(n) {}
87 
88  void push(storage_idx_t i, float v);
89 
90  float max() const;
91 
92  int size() const;
93 
94  void clear();
95 
96  int pop_min(float *vmin_out = nullptr);
97 
98  int count_below(float thresh);
99  };
100 
101 
/// to sort pairs of (id, distance) from nearest to farthest or the reverse
///
/// NodeDistCloser compares on the distance only: `a < b` holds exactly
/// when `a.d < b.d`. The id takes no part in the ordering.
struct NodeDistCloser {
    float d;
    int id;

    /// Store the distance and the id as given.
    NodeDistCloser(float d, int id)
        : d(d),
          id(id) {}

    /// Strict ordering on the stored distance.
    bool operator<(const NodeDistCloser& other) const {
        const float mine = d;
        const float theirs = other.d;
        return mine < theirs;
    }
};
109 
111  float d;
112  int id;
113  NodeDistFarther(float d, int id): d(d), id(id) {}
114  bool operator < (const NodeDistFarther &obj1) const { return d > obj1.d; }
115  };
116 
117 
118  /// assignment probability to each layer (sum=1)
119  std::vector<double> assign_probas;
120 
121  /// number of neighbors stored per layer (cumulative), should not
122  /// be changed after first add
123  std::vector<int> cum_nneighbor_per_level;
124 
125  /// level of each vector (base level = 1), size = ntotal
126  std::vector<int> levels;
127 
128  /// offsets[i] is the offset in the neighbors array where vector i is stored
129  /// size ntotal + 1
130  std::vector<size_t> offsets;
131 
132  /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i
133  /// for all levels. this is where all storage goes.
134  std::vector<storage_idx_t> neighbors;
135 
136  /// entry point in the search structure (one of the points with maximum level)
138 
140 
141  /// maximum level
143 
144  /// expansion factor at construction time
146 
147  /// expansion factor at search time
148  int efSearch;
149 
150  /// number of entry points in levels > 0.
152 
153  // methods that initialize the tree sizes
154 
155  /// initialize the assign_probas and cum_nneighbor_per_level to
156  /// have 2*M links on level 0 and M links on levels > 0
157  void set_default_probas(int M, float levelMult);
158 
159  /// set nb of neighbors for this level (before adding anything)
160  void set_nb_neighbors(int level_no, int n);
161 
162  // methods that access the tree sizes
163 
164  /// nb of neighbors for this level
165  int nb_neighbors(int layer_no) const;
166 
167  /// cumulative nb up to (and excluding) this level
168  int cum_nb_neighbors(int layer_no) const;
169 
170  /// range of entries in the neighbors table of vertex no at layer_no
171  void neighbor_range(idx_t no, int layer_no,
172  size_t * begin, size_t * end) const;
173 
174  /// only mandatory parameter: nb of neighbors
175  explicit HNSW(int M = 32);
176 
177  /// pick a random level for a new point
178  int random_level();
179 
180  /// add n random levels to table (for debugging...)
181  void fill_with_random_links(size_t n);
182 
184  storage_idx_t pt_id,
185  storage_idx_t nearest,
186  float d_nearest,
187  int level,
188  omp_lock_t *locks,
189  VisitedTable &vt);
190 
191 
192  /** add point pt_id on all levels <= pt_level and build the link
193  * structure for them. */
194  void add_with_locks(DistanceComputer& ptdis, int pt_level, int pt_id,
195  std::vector<omp_lock_t>& locks,
196  VisitedTable& vt);
197 
198  int search_from_candidates(DistanceComputer& qdis, int k,
199  idx_t *I, float *D,
200  MinimaxHeap& candidates,
201  VisitedTable &vt,
202  int level, int nres_in = 0) const;
203 
204  std::priority_queue<Node> search_from(const Node& node,
205  DistanceComputer& qdis,
206  int ef,
207  VisitedTable *vt) const;
208 
209  /// search interface
210  void search(DistanceComputer& qdis, int k,
211  idx_t *I, float *D,
212  VisitedTable& vt) const;
213 
214  void reset();
215 
216  void clear_neighbor_tables(int level);
217  void print_neighbor_stats(int level) const;
218 
219  int prepare_level_tab(size_t n, bool preset_levels = false);
220 
221  static void shrink_neighbor_list(
222  DistanceComputer& qdis,
223  std::priority_queue<NodeDistFarther>& input,
224  std::vector<NodeDistFarther>& output,
225  int max_size);
226 
227 };
228 
229 
230 /**************************************************************
231  * Auxiliary structures
232  **************************************************************/
233 
/// set implementation optimized for fast access.
///
/// Each entry of `visited` stores the value `visno` had when the entry was
/// last set, and an entry counts as set only while it still equals the
/// current `visno`. Incrementing `visno` therefore clears every flag without
/// touching the table; the table itself is only wiped when `visno` wraps.
struct VisitedTable {
    std::vector<uint8_t> visited;  ///< one stamp per id
    int visno;                     ///< stamp that currently means "set"

    /// @param size number of entries in the table (ids 0 .. size-1)
    explicit VisitedTable(int size)
        : visited(size), visno(1) {}

    /// set flag #no to true
    void set(int no) {
        // The narrowing is made explicit; advance() keeps visno below 250,
        // so the value fits in 8 bits as long as visno is only changed there.
        visited[no] = static_cast<uint8_t>(visno);
    }

    /// get flag #no
    bool get(int no) const {
        return visited[no] == visno;
    }

    /// reset all flags to false
    void advance() {
        visno++;
        if (visno == 250) {
            // 250 rather than 255 because sometimes we use visno and visno+1
            // vector::assign is used instead of memset: it needs no <cstring>
            // include and is well defined for an empty table, whose data()
            // pointer may be null.
            visited.assign(visited.size(), 0);
            visno = 1;
        }
    }
};
262 
263 
/// Plain record of statistics fields.
///
/// NOTE(review): the meaning of each field is set by the code that updates
/// it, which is not visible in this header; `ndis` presumably counts
/// distance computations -- TODO confirm.
struct HNSWStats {
    size_t n1, n2, n3;
    size_t ndis;
    size_t nreorder;
    bool view;

    /// Every counter starts at zero and `view` starts false.
    HNSWStats()
        : n1(0), n2(0), n3(0),
          ndis(0),
          nreorder(0),
          view(false) {}

    /// Put every field back to the value it has after construction.
    void reset() {
        view = false;
        nreorder = 0;
        ndis = 0;
        n3 = 0;
        n2 = 0;
        n1 = 0;
    }
};
281 
282 // global var that collects them all
283 extern HNSWStats hnsw_stats;
284 
285 
286 } // namespace faiss
random generator that can be used in multithreaded contexts
Definition: utils.h:48
void add_with_locks(DistanceComputer &ptdis, int pt_level, int pt_id, std::vector< omp_lock_t > &locks, VisitedTable &vt)
Definition: HNSW.cpp:479
void neighbor_range(idx_t no, int layer_no, size_t *begin, size_t *end) const
range of entries in the neighbors table of vertex no at layer_no
Definition: HNSW.cpp:43
virtual float operator()(storage_idx_t i)=0
compute distance of vector i to current query
int nb_neighbors(int layer_no) const
nb of neighbors for this level
Definition: HNSW.cpp:23
storage_idx_t entry_point
entry point in the search structure (one of the points with maximum level)
Definition: HNSW.h:137
int cum_nb_neighbors(int layer_no) const
cumulative nb up to (and excluding) this level
Definition: HNSW.cpp:38
Index::idx_t idx_t
Faiss results are 64-bit.
Definition: HNSW.h:52
virtual float symmetric_dis(storage_idx_t i, storage_idx_t j)=0
compute distance between two stored vectors
std::vector< double > assign_probas
assignment probability to each layer (sum=1)
Definition: HNSW.h:119
std::vector< int > cum_nneighbor_per_level
Definition: HNSW.h:123
void advance()
reset all flags to false
Definition: HNSW.h:253
void add_links_starting_from(DistanceComputer &ptdis, storage_idx_t pt_id, storage_idx_t nearest, float d_nearest, int level, omp_lock_t *locks, VisitedTable &vt)
Definition: HNSW.cpp:443
std::vector< size_t > offsets
Definition: HNSW.h:130
set implementation optimized for fast access.
Definition: HNSW.h:235
int efSearch
expansion factor at search time
Definition: HNSW.h:148
long idx_t
all indices are this type
Definition: Index.h:64
virtual void set_query(const float *x)=0
called before computing distances
HNSW(int M=32)
only mandatory parameter: nb of neighbors
Definition: HNSW.cpp:53
to sort pairs of (id, distance) from nearest to farthest or the reverse
Definition: HNSW.h:103
int upper_beam
number of entry points in levels > 0.
Definition: HNSW.h:151
void set_nb_neighbors(int level_no, int n)
set nb of neighbors for this level (before adding anything)
Definition: HNSW.cpp:29
int search_from_candidates(DistanceComputer &qdis, int k, idx_t *I, float *D, MinimaxHeap &candidates, VisitedTable &vt, int level, int nres_in=0) const
Definition: HNSW.cpp:525
int random_level()
pick a random level for a new point
Definition: HNSW.cpp:64
void set_default_probas(int M, float levelMult)
Definition: HNSW.cpp:78
void search(DistanceComputer &qdis, int k, idx_t *I, float *D, VisitedTable &vt) const
search interface
Definition: HNSW.cpp:665
void fill_with_random_links(size_t n)
add n random levels to table (for debugging...)
Definition: HNSW.cpp:172
std::vector< storage_idx_t > neighbors
Definition: HNSW.h:134
int efConstruction
expansion factor at construction time
Definition: HNSW.h:145
int storage_idx_t
internal storage of vectors (32 bits: this is expensive)
Definition: HNSW.h:49
void set(int no)
set flag #no to true
Definition: HNSW.h:243
std::vector< int > levels
level of each vector (base level = 1), size = ntotal
Definition: HNSW.h:126
int max_level
maximum level
Definition: HNSW.h:142
static void shrink_neighbor_list(DistanceComputer &qdis, std::priority_queue< NodeDistFarther > &input, std::vector< NodeDistFarther > &output, int max_size)
Definition: HNSW.cpp:237