faiss/impl/HNSW.h

276 lines
6.8 KiB
C++

/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <omp.h>

#include <algorithm>
#include <cstdint>
#include <queue>
#include <unordered_set>
#include <vector>

#include <faiss/Index.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/random.h>
#include <faiss/utils/Heap.h>
namespace faiss {
/** Implementation of the Hierarchical Navigable Small World
* datastructure.
*
* Efficient and robust approximate nearest neighbor search using
* Hierarchical Navigable Small World graphs
*
* Yu. A. Malkov, D. A. Yashunin, arXiv 2017
*
* This implementation is heavily influenced by the NMSlib
* implementation by Yury Malkov and Leonid Boytsov
* (https://github.com/searchivarius/nmslib)
*
* The HNSW object stores only the neighbor link structure, see
* IndexHNSW.h for the full index object.
*/
// Forward declarations: the HNSW interface below only takes these types by
// reference or pointer. VisitedTable is defined further down in this header.
struct VisitedTable;
struct DistanceComputer; // from AuxIndexStructures
/** Neighbor-link structure of an HNSW graph, together with its construction
* and search parameters.
*
* Only declarations appear in this struct; the member functions are defined
* out of line.
*/
struct HNSW {
/// internal storage of vectors (32 bits: this is expensive)
typedef int storage_idx_t;
/// Faiss results are 64-bit
typedef Index::idx_t idx_t;
/// (distance, id) pair; std::pair orders on the distance first, then the id
typedef std::pair<float, storage_idx_t> Node;
/** Container of (id, distance) entries that declares both max() and
* pop_min().
*
* The ids and dis arrays are allocated once by the constructor, with n
* slots each.
* NOTE(review): the methods are defined elsewhere; k and nvalid presumably
* track the used slots and the entries that are still valid -- confirm
* against the implementation.
*/
struct MinimaxHeap {
/// number of slots allocated in ids and dis
int n;
/// initialized to 0 by the constructor
int k;
/// initialized to 0 by the constructor
int nvalid;
/// ids and dis have the same size (n); presumably ids[i] pairs with dis[i]
std::vector<storage_idx_t> ids;
std::vector<float> dis;
/// comparator type over (float distance, storage_idx_t id)
typedef faiss::CMax<float, storage_idx_t> HC;
/// allocate n slots; k and nvalid start at 0
explicit MinimaxHeap(int n): n(n), k(0), nvalid(0), ids(n), dis(n) {}
/// add entry i with value v
void push(storage_idx_t i, float v);
float max() const;
int size() const;
void clear();
/// returns an int (presumably the id of the removed entry -- confirm);
/// vmin_out is optional and may be left as nullptr
int pop_min(float *vmin_out = nullptr);
/// NOTE(review): presumably the number of entries with value < thresh
int count_below(float thresh);
};
/// to sort pairs of (id, distance) from nearest to farthest or the reverse
///
/// NodeDistCloser compares by increasing distance, so in a
/// std::priority_queue its top() is the entry with the largest distance.
struct NodeDistCloser {
float d;
int id;
NodeDistCloser(float d, int id): d(d), id(id) {}
bool operator < (const NodeDistCloser &obj1) const { return d < obj1.d; }
};
/// Same fields as NodeDistCloser with the comparison reversed, so in a
/// std::priority_queue its top() is the entry with the smallest distance.
struct NodeDistFarther {
float d;
int id;
NodeDistFarther(float d, int id): d(d), id(id) {}
bool operator < (const NodeDistFarther &obj1) const { return d > obj1.d; }
};
/// assignment probability to each layer (sum=1)
std::vector<double> assign_probas;
/// number of neighbors stored per layer (cumulative), should not
/// be changed after first add
std::vector<int> cum_nneighbor_per_level;
/// level of each vector (base level = 1), size = ntotal
std::vector<int> levels;
/// offsets[i] is the offset in the neighbors array where vector i is stored
/// size ntotal + 1
std::vector<size_t> offsets;
/// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i
/// for all levels. this is where all storage goes.
std::vector<storage_idx_t> neighbors;
/// entry point in the search structure (one of the points with maximum level)
storage_idx_t entry_point;
/// random generator owned by the structure
/// (presumably drives random_level() -- confirm in the implementation)
faiss::RandomGenerator rng;
/// maximum level
int max_level;
/// expansion factor at construction time
int efConstruction;
/// expansion factor at search time
int efSearch;
/// during search: do we check whether the next best distance is good enough?
bool check_relative_distance = true;
/// number of entry points in levels > 0.
int upper_beam;
/// use bounded queue during exploration
bool search_bounded_queue = true;
// methods that initialize the tree sizes
/// initialize the assign_probas and cum_nneighbor_per_level to
/// have 2*M links on level 0 and M links on levels > 0
void set_default_probas(int M, float levelMult);
/// set nb of neighbors for this level (before adding anything)
void set_nb_neighbors(int level_no, int n);
// methods that access the tree sizes
/// nb of neighbors for this level
int nb_neighbors(int layer_no) const;
/// cumulative nb up to (and excluding) this level
int cum_nb_neighbors(int layer_no) const;
/// range of entries in the neighbors table of vertex no at layer_no;
/// the result is written through the begin and end pointers
void neighbor_range(idx_t no, int layer_no,
size_t * begin, size_t * end) const;
/// only mandatory parameter: nb of neighbors
explicit HNSW(int M = 32);
/// pick a random level for a new point
int random_level();
/// add n random levels to table (for debugging...)
void fill_with_random_links(size_t n);
/// NOTE(review): defined elsewhere. From the signature: links point pt_id
/// at the given level, starting from vertex `nearest` whose distance is
/// d_nearest. locks is a raw pointer to OpenMP locks, presumably one per
/// vector -- confirm.
void add_links_starting_from(DistanceComputer& ptdis,
storage_idx_t pt_id,
storage_idx_t nearest,
float d_nearest,
int level,
omp_lock_t *locks,
VisitedTable &vt);
/** add point pt_id on all levels <= pt_level and build the link
* structure for them. */
void add_with_locks(DistanceComputer& ptdis, int pt_level, int pt_id,
std::vector<omp_lock_t>& locks,
VisitedTable& vt);
/// NOTE(review): defined elsewhere. From the signature: explores the graph
/// at `level` from the entries in `candidates` and writes up to k results
/// to I (ids) and D (distances); nres_in presumably is the number of
/// results already present and the return value the updated count --
/// confirm.
int search_from_candidates(DistanceComputer& qdis, int k,
idx_t *I, float *D,
MinimaxHeap& candidates,
VisitedTable &vt,
int level, int nres_in = 0) const;
/// NOTE(review): variant that starts from a single node and returns its
/// results as a std::priority_queue of Node; presumably the path taken
/// when search_bounded_queue is false -- confirm.
std::priority_queue<Node> search_from_candidate_unbounded(
const Node& node,
DistanceComputer& qdis,
int ef,
VisitedTable *vt
) const;
/// search interface
void search(DistanceComputer& qdis, int k,
idx_t *I, float *D,
VisitedTable& vt) const;
void reset();
void clear_neighbor_tables(int level);
void print_neighbor_stats(int level) const;
/// NOTE(review): presumably extends levels/offsets/neighbors for n new
/// vectors; the meaning of the int result is not visible here -- confirm.
int prepare_level_tab(size_t n, bool preset_levels = false);
/// NOTE(review): static helper; from the signature it consumes `input` and
/// fills `output` with at most max_size entries -- confirm the selection
/// rule in the implementation.
static void shrink_neighbor_list(
DistanceComputer& qdis,
std::priority_queue<NodeDistFarther>& input,
std::vector<NodeDistFarther>& output,
int max_size);
};
/**************************************************************
* Auxiliary structures
**************************************************************/
/// set implementation optimized for fast access.
///
/// Each slot of `visited` holds the generation number (`visno`) at which the
/// corresponding id was last marked; an id belongs to the set iff its slot
/// equals the current generation. advance() empties the set by moving to the
/// next generation without touching the array, and only wipes the array when
/// the generation counter is wrapped back to 1.
struct VisitedTable {
    /// one generation stamp per id; 0 is never a live generation
    std::vector<uint8_t> visited;
    /// current generation: starts at 1, wrapped back to 1 by advance()
    /// instead of reaching 250
    int visno;

    /// @param size  number of ids tracked (valid ids are 0 .. size-1)
    explicit VisitedTable(int size)
            : visited(size), visno(1) {}

    /// set flag #no to true
    void set(int no) {
        // advance() keeps visno below 250, so the narrowing is lossless
        visited[no] = static_cast<uint8_t>(visno);
    }

    /// get flag #no
    bool get(int no) const {
        return visited[no] == visno;
    }

    /// reset all flags to false
    void advance() {
        visno++;
        if (visno == 250) {
            // 250 rather than 255 because sometimes we use visno and visno+1
            std::fill(visited.begin(), visited.end(), static_cast<uint8_t>(0));
            visno = 1;
        }
    }
};
/// Counters and a flag describing HNSW activity.
///
/// NOTE(review): the meaning of n1/n2/n3 is not visible in this header;
/// ndis and nreorder presumably count distance computations and reorderings.
/// Confirm against the code that updates hnsw_stats.
struct HNSWStats {
    size_t n1;
    size_t n2;
    size_t n3;
    size_t ndis;
    size_t nreorder;
    bool view;

    /// A new object starts with all counters at zero and `view` cleared.
    HNSWStats() {
        reset();
    }

    /// Zero every counter and clear the `view` flag.
    void reset() {
        n1 = 0;
        n2 = 0;
        n3 = 0;
        ndis = 0;
        nreorder = 0;
        view = false;
    }
};
// Global HNSWStats object that collects the statistics; only declared here,
// the definition lives in another translation unit.
// NOTE(review): shared mutable state with no synchronization visible in this
// header -- confirm how concurrent callers update it.
extern HNSWStats hnsw_stats;
} // namespace faiss