11 #include "AuxIndexStructures.h"
29 FAISS_THROW_IF_NOT(
levels.size() == 0);
42 size_t * begin,
size_t * end)
const
80 for (
int level = 0; ;level++) {
81 float proba = exp(-level / levelMult) * (1 - exp(-1 / levelMult));
82 if (proba < 1e-9)
break;
84 nn += level == 0 ? M * 2 : M;
89 void HNSW::clear_neighbor_tables(
int level)
91 for (
int i = 0; i <
levels.size(); i++) {
94 for (
size_t j = begin; j < end; j++) {
112 void HNSW::print_neighbor_stats(
int level)
const
115 printf(
"stats on level %d, max %d neighbors per vertex:\n",
117 size_t tot_neigh = 0, tot_common = 0, tot_reciprocal = 0, n_node = 0;
118 #pragma omp parallel for reduction(+: tot_neigh) reduction(+: tot_common) \
119 reduction(+: tot_reciprocal) reduction(+: n_node)
120 for (
int i = 0; i <
levels.size(); i++) {
125 std::unordered_set<int> neighset;
126 for (
size_t j = begin; j < end; j++) {
130 int n_neigh = neighset.size();
132 int n_reciprocal = 0;
133 for (
size_t j = begin; j < end; j++) {
136 FAISS_ASSERT(i2 != i);
139 for (
size_t j2 = begin2; j2 < end2; j2++) {
146 if (neighset.count(i3)) {
152 tot_neigh += n_neigh;
153 tot_common += n_common;
154 tot_reciprocal += n_reciprocal;
157 float normalizer = n_node;
158 printf(
" nb of nodes at that level %ld\n", n_node);
159 printf(
" neighbors per node: %.2f (%ld)\n",
160 tot_neigh / normalizer, tot_neigh);
161 printf(
" nb of reciprocal neighbors: %.2f\n", tot_reciprocal / normalizer);
162 printf(
" nb of neighbors that are also neighbor-of-neighbors: %.2f (%ld)\n",
163 tot_common / normalizer, tot_common);
175 for (
int level = max_level - 1; level >= 0; --level) {
176 std::vector<int> elts;
177 for (
int i = 0; i < n; i++) {
182 printf (
"linking %ld elements in level %d\n",
185 if (elts.size() == 1)
continue;
187 for (
int ii = 0; ii < elts.size(); ii++) {
191 for (
size_t j = begin; j < end; j++) {
194 other = elts[rng2.
rand_int(elts.size())];
204 int HNSW::prepare_level_tab(
size_t n,
bool preset_levels)
206 size_t n0 =
offsets.size() - 1;
209 FAISS_ASSERT (n0 + n ==
levels.size());
211 FAISS_ASSERT (n0 ==
levels.size());
212 for (
int i = 0; i < n; i++) {
214 levels.push_back(pt_level + 1);
219 for (
int i = 0; i < n; i++) {
220 int pt_level =
levels[i + n0] - 1;
221 if (pt_level > max_level) max_level = pt_level;
237 std::priority_queue<NodeDistFarther>& input,
238 std::vector<NodeDistFarther>& output,
241 while (input.size() > 0) {
244 float dist_v1_q = v1.d;
250 if (dist_v1_v2 < dist_v1_q) {
257 output.push_back(v1);
258 if (output.size() >= max_size) {
280 void shrink_neighbor_list(
282 std::priority_queue<NodeDistCloser>& resultSet1,
285 if (resultSet1.size() < max_size) {
288 std::priority_queue<NodeDistFarther> resultSet;
289 std::vector<NodeDistFarther> returnlist;
291 while (resultSet1.size() > 0) {
292 resultSet.emplace(resultSet1.top().d, resultSet1.top().id);
298 for (NodeDistFarther curen2 : returnlist) {
299 resultSet1.emplace(curen2.d, curen2.id);
307 void add_link(HNSW& hnsw,
308 DistanceComputer& qdis,
309 storage_idx_t src, storage_idx_t dest,
313 hnsw.neighbor_range(src, level, &begin, &end);
314 if (hnsw.neighbors[end - 1] == -1) {
318 if (hnsw.neighbors[i - 1] != -1)
break;
321 hnsw.neighbors[i] = dest;
328 std::priority_queue<NodeDistCloser> resultSet;
329 resultSet.emplace(qdis.symmetric_dis(src, dest), dest);
330 for (
size_t i = begin; i < end; i++) {
331 storage_idx_t neigh = hnsw.neighbors[i];
332 resultSet.emplace(qdis.symmetric_dis(src, neigh), neigh);
335 shrink_neighbor_list(qdis, resultSet, end - begin);
339 while (resultSet.size()) {
340 hnsw.neighbors[i++] = resultSet.top().id;
345 hnsw.neighbors[i++] = -1;
350 void search_neighbors_to_add(
352 DistanceComputer& qdis,
353 std::priority_queue<NodeDistCloser>& results,
360 std::priority_queue<NodeDistFarther> candidates;
362 NodeDistFarther ev(d_entry_point, entry_point);
364 results.emplace(d_entry_point, entry_point);
367 while (!candidates.empty()) {
369 const NodeDistFarther &currEv = candidates.top();
371 if (currEv.d > results.top().d) {
374 int currNode = currEv.id;
379 hnsw.neighbor_range(currNode, level, &begin, &end);
380 for(
size_t i = begin; i < end; i++) {
381 storage_idx_t nodeId = hnsw.neighbors[i];
382 if (nodeId < 0)
break;
383 if (vt.get(nodeId))
continue;
386 float dis = qdis(nodeId);
387 NodeDistFarther evE1(dis, nodeId);
389 if (results.size() < hnsw.efConstruction ||
390 results.top().d > dis) {
392 results.emplace(dis, nodeId);
393 candidates.emplace(dis, nodeId);
394 if (results.size() > hnsw.efConstruction) {
409 void greedy_update_nearest(
const HNSW& hnsw,
410 DistanceComputer& qdis,
412 storage_idx_t& nearest,
416 storage_idx_t prev_nearest = nearest;
419 hnsw.neighbor_range(nearest, level, &begin, &end);
420 for(
size_t i = begin; i < end; i++) {
421 storage_idx_t v = hnsw.neighbors[i];
424 if (dis < d_nearest) {
429 if (nearest == prev_nearest) {
449 std::priority_queue<NodeDistCloser> link_targets;
451 search_neighbors_to_add(*
this, ptdis, link_targets, nearest, d_nearest,
457 ::faiss::shrink_neighbor_list(ptdis, link_targets, M);
459 while (!link_targets.empty()) {
460 int other_id = link_targets.top().id;
462 omp_set_lock(&locks[other_id]);
463 add_link(*
this, ptdis, other_id, pt_id, level);
464 omp_unset_lock(&locks[other_id]);
466 add_link(*
this, ptdis, pt_id, other_id, level);
478 std::vector<omp_lock_t>& locks,
489 max_level = pt_level;
498 omp_set_lock(&locks[pt_id]);
501 float d_nearest = ptdis(nearest);
503 for(; level > pt_level; level--) {
504 greedy_update_nearest(*
this, ptdis, level, nearest, d_nearest);
507 for(; level >= 0; level--) {
509 level, locks.data(), vt);
512 omp_unset_lock(&locks[pt_id]);
514 if (pt_level > max_level) {
515 max_level = pt_level;
528 int level,
int nres_in)
const
532 for (
int i = 0; i < candidates.size(); i++) {
533 idx_t v1 = candidates.ids[i];
534 float d = candidates.dis[i];
535 FAISS_ASSERT(v1 >= 0);
537 faiss::maxheap_push(++nres, D, I, d, v1);
538 }
else if (d < D[0]) {
539 faiss::maxheap_pop(nres--, D, I);
540 faiss::maxheap_push(++nres, D, I, d, v1);
548 while (candidates.size() > 0) {
550 int v0 = candidates.pop_min(&d0);
557 int n_dis_below = candidates.count_below(d0);
566 for (
size_t j = begin; j < end; j++) {
576 faiss::maxheap_push(++nres, D, I, d, v1);
577 }
else if (d < D[0]) {
578 faiss::maxheap_pop(nres--, D, I);
579 faiss::maxheap_push(++nres, D, I, d, v1);
581 candidates.push(v1, d);
585 if (!do_dis_check && nstep >
efSearch) {
594 if (candidates.size() == 0) {
597 hnsw_stats.n3 += ndis;
609 std::priority_queue<HNSW::Node> HNSW::search_from_candidate_unbounded(
616 std::priority_queue<Node> top_candidates;
617 std::priority_queue<Node, std::vector<Node>, std::greater<Node>> candidates;
619 top_candidates.push(node);
620 candidates.push(node);
622 vt->
set(node.second);
624 while (!candidates.empty()) {
627 std::tie(d0, v0) = candidates.top();
629 if (d0 > top_candidates.top().first) {
638 for (
size_t j = begin; j < end; ++j) {
653 if (top_candidates.top().first > d1 || top_candidates.size() < ef) {
654 candidates.emplace(d1, v1);
655 top_candidates.emplace(d1, v1);
657 if (top_candidates.size() > ef) {
658 top_candidates.pop();
667 if (candidates.size() == 0) {
670 hnsw_stats.n3 += ndis;
673 return top_candidates;
684 float d_nearest = qdis(nearest);
686 for(
int level = max_level; level >= 1; level--) {
687 greedy_update_nearest(*
this, qdis, level, nearest, d_nearest);
694 candidates.push(nearest, d_nearest);
698 std::priority_queue<Node> top_candidates =
699 search_from_candidate_unbounded(Node(d_nearest, nearest),
702 while (top_candidates.size() > k) {
703 top_candidates.pop();
707 while (!top_candidates.empty()) {
710 std::tie(d, label) = top_candidates.top();
711 faiss::maxheap_push(++nres, D, I, d, label);
712 top_candidates.pop();
722 std::vector<idx_t> I_to_next(candidates_size);
723 std::vector<float> D_to_next(candidates_size);
727 D_to_next[0] = qdis(entry_point);
729 for(
int level = max_level; level >= 0; level--) {
735 for (
int i = 0; i < nres; i++) {
736 candidates.push(I_to_next[i], D_to_next[i]);
743 qdis, candidates_size,
744 I_to_next.data(), D_to_next.data(),
745 candidates, vt, level
754 void HNSW::MinimaxHeap::push(storage_idx_t i,
float v) {
756 if (v >= dis[0])
return;
757 faiss::heap_pop<HC> (k--, dis.data(), ids.data());
760 faiss::heap_push<HC> (++k, dis.data(), ids.data(), v, i);
764 float HNSW::MinimaxHeap::max()
const {
768 int HNSW::MinimaxHeap::size()
const {
772 void HNSW::MinimaxHeap::clear() {
776 int HNSW::MinimaxHeap::pop_min(
float *vmin_out) {
781 if (ids[i] != -1)
break;
784 if (i == -1)
return -1;
789 if (ids[i] != -1 && dis[i] < vmin) {
795 if (vmin_out) *vmin_out = vmin;
803 int HNSW::MinimaxHeap::count_below(
float thresh) {
805 for(
int i = 0; i < k; i++) {
806 if (dis[i] < thresh) {
random generator that can be used in multithreaded contexts
void add_with_locks(DistanceComputer &ptdis, int pt_level, int pt_id, std::vector< omp_lock_t > &locks, VisitedTable &vt)
virtual float symmetric_dis(idx_t i, idx_t j)=0
compute distance between two stored vectors
void neighbor_range(idx_t no, int layer_no, size_t *begin, size_t *end) const
range of entries in the neighbors table of vertex no at layer_no
bool get(int no) const
get flag #no
int nb_neighbors(int layer_no) const
nb of neighbors for this level
storage_idx_t entry_point
entry point in the search structure (one of the points with maximum level)
int cum_nb_neighbors(int layer_no) const
cumulative nb up to (and excluding) this level
Index::idx_t idx_t
Faiss results are 64-bit.
std::vector< double > assign_probas
assignment probability to each layer (sum=1)
float rand_float()
between 0 and 1
bool search_bounded_queue
use bounded queue during exploration
std::vector< int > cum_nneighbor_per_level
void advance()
reset all flags to false
void add_links_starting_from(DistanceComputer &ptdis, storage_idx_t pt_id, storage_idx_t nearest, float d_nearest, int level, omp_lock_t *locks, VisitedTable &vt)
long idx_t
all indices are this type
std::vector< size_t > offsets
set implementation optimized for fast access.
int rand_int()
random positive integer
int efSearch
expansion factor at search time
bool check_relative_distance
during search: do we check whether the next best distance is good enough?
HNSW(int M=32)
only mandatory parameter: nb of neighbors
to sort pairs of (id, distance) from nearest to farthest or the reverse
int upper_beam
number of entry points in levels > 0.
void set_nb_neighbors(int level_no, int n)
set nb of neighbors for this level (before adding anything)
int search_from_candidates(DistanceComputer &qdis, int k, idx_t *I, float *D, MinimaxHeap &candidates, VisitedTable &vt, int level, int nres_in=0) const
int random_level()
pick a random level for a new point
void set_default_probas(int M, float levelMult)
void search(DistanceComputer &qdis, int k, idx_t *I, float *D, VisitedTable &vt) const
search interface
void fill_with_random_links(size_t n)
add n random levels to table (for debugging...)
std::vector< storage_idx_t > neighbors
int efConstruction
expansion factor at construction time
int storage_idx_t
internal storage of vectors (32 bits: this is expensive)
void set(int no)
set flag #no to true
std::vector< int > levels
level of each vector (base level = 1), size = ntotal
int max_level
maximum level
static void shrink_neighbor_list(DistanceComputer &qdis, std::priority_queue< NodeDistFarther > &input, std::vector< NodeDistFarther > &output, int max_size)