Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/hoss/faiss/HNSW.h
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 // -*- c++ -*-
9 
10 #pragma once
11 
12 #include <vector>
13 #include <unordered_set>
14 #include <queue>
15 
16 #include <omp.h>
17 
18 #include "Index.h"
19 #include "FaissAssert.h"
20 #include "utils.h"
21 
22 
23 namespace faiss {
24 
25 
26 /** Implementation of the Hierarchical Navigable Small World
27  * datastructure.
28  *
29  * Efficient and robust approximate nearest neighbor search using
30  * Hierarchical Navigable Small World graphs
31  *
32  * Yu. A. Malkov, D. A. Yashunin, arXiv 2017
33  *
34  * This implementation is heavily influenced by the NMSlib
35  * implementation by Yury Malkov and Leonid Boytsov
36  * (https://github.com/searchivarius/nmslib)
37  *
38  * The HNSW object stores only the neighbor link structure, see
39  * IndexHNSW.h for the full index object.
40  */
41 
42 
43 struct VisitedTable;
44 struct DistanceComputer; // from AuxIndexStructures
45 
46 struct HNSW {
47  /// internal storage of vectors (32 bits: this is expensive)
48  typedef int storage_idx_t;
49 
50  /// Faiss results are 64-bit
52 
53  typedef std::pair<float, storage_idx_t> Node;
54 
55  /** Heap structure that allows fast
56  */
57  struct MinimaxHeap {
58  int n;
59  int k;
60  int nvalid;
61 
62  std::vector<storage_idx_t> ids;
63  std::vector<float> dis;
65 
66  explicit MinimaxHeap(int n): n(n), k(0), nvalid(0), ids(n), dis(n) {}
67 
68  void push(storage_idx_t i, float v);
69 
70  float max() const;
71 
72  int size() const;
73 
74  void clear();
75 
76  int pop_min(float *vmin_out = nullptr);
77 
78  int count_below(float thresh);
79  };
80 
81 
82  /// to sort pairs of (id, distance) from nearest to fathest or the reverse
83  struct NodeDistCloser {
84  float d;
85  int id;
86  NodeDistCloser(float d, int id): d(d), id(id) {}
87  bool operator < (const NodeDistCloser &obj1) const { return d < obj1.d; }
88  };
89 
90  struct NodeDistFarther {
91  float d;
92  int id;
93  NodeDistFarther(float d, int id): d(d), id(id) {}
94  bool operator < (const NodeDistFarther &obj1) const { return d > obj1.d; }
95  };
96 
97 
98  /// assignment probability to each layer (sum=1)
99  std::vector<double> assign_probas;
100 
101  /// number of neighbors stored per layer (cumulative), should not
102  /// be changed after first add
103  std::vector<int> cum_nneighbor_per_level;
104 
105  /// level of each vector (base level = 1), size = ntotal
106  std::vector<int> levels;
107 
108  /// offsets[i] is the offset in the neighbors array where vector i is stored
109  /// size ntotal + 1
110  std::vector<size_t> offsets;
111 
112  /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i
113  /// for all levels. this is where all storage goes.
114  std::vector<storage_idx_t> neighbors;
115 
116  /// entry point in the search structure (one of the points with maximum level
118 
120 
121  /// maximum level
123 
124  /// expansion factor at construction time
126 
127  /// expansion factor at search time
128  int efSearch;
129 
130  /// during search: do we check whether the next best distance is good enough?
132 
133  /// number of entry points in levels > 0.
135 
136  /// use bounded queue during exploration
137  bool search_bounded_queue = true;
138 
139  // methods that initialize the tree sizes
140 
141  /// initialize the assign_probas and cum_nneighbor_per_level to
142  /// have 2*M links on level 0 and M links on levels > 0
143  void set_default_probas(int M, float levelMult);
144 
145  /// set nb of neighbors for this level (before adding anything)
146  void set_nb_neighbors(int level_no, int n);
147 
148  // methods that access the tree sizes
149 
150  /// nb of neighbors for this level
151  int nb_neighbors(int layer_no) const;
152 
153  /// cumumlative nb up to (and excluding) this level
154  int cum_nb_neighbors(int layer_no) const;
155 
156  /// range of entries in the neighbors table of vertex no at layer_no
157  void neighbor_range(idx_t no, int layer_no,
158  size_t * begin, size_t * end) const;
159 
160  /// only mandatory parameter: nb of neighbors
161  explicit HNSW(int M = 32);
162 
163  /// pick a random level for a new point
164  int random_level();
165 
166  /// add n random levels to table (for debugging...)
167  void fill_with_random_links(size_t n);
168 
170  storage_idx_t pt_id,
171  storage_idx_t nearest,
172  float d_nearest,
173  int level,
174  omp_lock_t *locks,
175  VisitedTable &vt);
176 
177 
178  /** add point pt_id on all levels <= pt_level and build the link
179  * structure for them. */
180  void add_with_locks(DistanceComputer& ptdis, int pt_level, int pt_id,
181  std::vector<omp_lock_t>& locks,
182  VisitedTable& vt);
183 
184  int search_from_candidates(DistanceComputer& qdis, int k,
185  idx_t *I, float *D,
186  MinimaxHeap& candidates,
187  VisitedTable &vt,
188  int level, int nres_in = 0) const;
189 
190  std::priority_queue<Node> search_from_candidate_unbounded(
191  const Node& node,
192  DistanceComputer& qdis,
193  int ef,
194  VisitedTable *vt
195  ) const;
196 
197  /// search interface
198  void search(DistanceComputer& qdis, int k,
199  idx_t *I, float *D,
200  VisitedTable& vt) const;
201 
202  void reset();
203 
204  void clear_neighbor_tables(int level);
205  void print_neighbor_stats(int level) const;
206 
207  int prepare_level_tab(size_t n, bool preset_levels = false);
208 
209  static void shrink_neighbor_list(
210  DistanceComputer& qdis,
211  std::priority_queue<NodeDistFarther>& input,
212  std::vector<NodeDistFarther>& output,
213  int max_size);
214 
215 };
216 
217 
218 /**************************************************************
219  * Auxiliary structures
220  **************************************************************/
221 
/// set implementation optimized for fast access.
///
/// Each slot stores the generation number (visno) at which it was last
/// set; a slot counts as "set" only when its stored value equals the
/// current visno. advance() therefore clears the whole set by bumping
/// visno, and only wipes the array when the counter is about to run out.
struct VisitedTable {
  std::vector<uint8_t> visited;  ///< per-slot generation stamp, 0 = never set
  int visno;                     ///< current generation, kept in [1, 249]

  /// create a table with `size` slots, all unset
  explicit VisitedTable(int size)
    : visited(size), visno(1) {}

  /// set flag #no to true (no bounds check)
  void set(int no) {
    // visno never exceeds 249 once advance() returns, so it fits in a byte
    visited[no] = static_cast<uint8_t>(visno);
  }

  /// get flag #no (no bounds check)
  bool get(int no) const {
    return visited[no] == visno;
  }

  /// reset all flags to false
  void advance() {
    visno++;
    if (visno == 250) {
      // 250 rather than 255 because sometimes we use visno and visno+1
      // assign() keeps the size and zeroes every slot without needing
      // <cstring>, which this header does not include.
      visited.assign(visited.size(), static_cast<uint8_t>(0));
      visno = 1;
    }
  }
};
250 
251 
/// Plain counters block; every field is zeroed (and `view` set to false)
/// on construction and by reset().
// NOTE(review): the meaning of n1/n2/n3 and `view` is not visible in this
// header; document them next to the code that updates them.
struct HNSWStats {
  size_t n1 = 0, n2 = 0, n3 = 0;
  size_t ndis = 0;
  size_t nreorder = 0;
  bool view = false;

  HNSWStats() {
    reset();
  }

  /// put every counter back to zero and clear `view`
  void reset() {
    n1 = n2 = n3 = 0;
    ndis = 0;
    nreorder = 0;
    view = false;
  }
};
269 
270 // global var that collects them all
271 extern HNSWStats hnsw_stats;
272 
273 
274 } // namespace faiss
random generator that can be used in multithreaded contexts
Definition: utils.h:47
void add_with_locks(DistanceComputer &ptdis, int pt_level, int pt_id, std::vector< omp_lock_t > &locks, VisitedTable &vt)
Definition: HNSW.cpp:477
void neighbor_range(idx_t no, int layer_no, size_t *begin, size_t *end) const
range of entries in the neighbors table of vertex no at layer_no
Definition: HNSW.cpp:41
int nb_neighbors(int layer_no) const
nb of neighbors for this level
Definition: HNSW.cpp:21
storage_idx_t entry_point
entry point in the search structure (one of the points with maximum level
Definition: HNSW.h:117
int cum_nb_neighbors(int layer_no) const
cumulative nb up to (and excluding) this level
Definition: HNSW.cpp:36
Index::idx_t idx_t
Faiss results are 64-bit.
Definition: HNSW.h:51
std::vector< double > assign_probas
assignment probability to each layer (sum=1)
Definition: HNSW.h:99
bool search_bounded_queue
use bounded queue during exploration
Definition: HNSW.h:137
std::vector< int > cum_nneighbor_per_level
Definition: HNSW.h:103
void advance()
reset all flags to false
Definition: HNSW.h:241
void add_links_starting_from(DistanceComputer &ptdis, storage_idx_t pt_id, storage_idx_t nearest, float d_nearest, int level, omp_lock_t *locks, VisitedTable &vt)
Definition: HNSW.cpp:441
long idx_t
all indices are this type
Definition: Index.h:62
std::vector< size_t > offsets
Definition: HNSW.h:110
set implementation optimized for fast access.
Definition: HNSW.h:223
int efSearch
expansion factor at search time
Definition: HNSW.h:128
bool check_relative_distance
during search: do we check whether the next best distance is good enough?
Definition: HNSW.h:131
HNSW(int M=32)
only mandatory parameter: nb of neighbors
Definition: HNSW.cpp:51
to sort pairs of (id, distance) from nearest to farthest or the reverse
Definition: HNSW.h:83
int upper_beam
number of entry points in levels > 0.
Definition: HNSW.h:134
void set_nb_neighbors(int level_no, int n)
set nb of neighbors for this level (before adding anything)
Definition: HNSW.cpp:27
int search_from_candidates(DistanceComputer &qdis, int k, idx_t *I, float *D, MinimaxHeap &candidates, VisitedTable &vt, int level, int nres_in=0) const
Definition: HNSW.cpp:523
int random_level()
pick a random level for a new point
Definition: HNSW.cpp:62
void set_default_probas(int M, float levelMult)
Definition: HNSW.cpp:76
void search(DistanceComputer &qdis, int k, idx_t *I, float *D, VisitedTable &vt) const
search interface
Definition: HNSW.cpp:676
void fill_with_random_links(size_t n)
add n random levels to table (for debugging...)
Definition: HNSW.cpp:170
std::vector< storage_idx_t > neighbors
Definition: HNSW.h:114
int efConstruction
expansion factor at construction time
Definition: HNSW.h:125
int storage_idx_t
internal storage of vectors (32 bits: this is expensive)
Definition: HNSW.h:48
void set(int no)
set flag #no to true
Definition: HNSW.h:231
std::vector< int > levels
level of each vector (base level = 1), size = ntotal
Definition: HNSW.h:106
int max_level
maximum level
Definition: HNSW.h:122
static void shrink_neighbor_list(DistanceComputer &qdis, std::priority_queue< NodeDistFarther > &input, std::vector< NodeDistFarther > &output, int max_size)
Definition: HNSW.cpp:235