10 #include "IndexBinaryHNSW.h"
21 #include <unordered_set>
24 #include <sys/types.h>
31 #include "FaissAssert.h"
32 #include "IndexBinaryFlat.h"
34 #include "AuxIndexStructures.h"
// Add `n` binary vectors `x` into the HNSW graph of `index_hnsw`, on top of
// `n0` already-indexed vectors. Points are bucketed by their HNSW level and
// inserted from the highest level down, in parallel (OpenMP) with one lock
// per vertex.
//
// NOTE(review): this listing is a partial extraction — several original
// source lines are missing from view (the declarations of `n0` and `verbose`
// in the parameter list, the body of the level-histogram loop, the
// declaration/update of `i1`, the `pt_id` bindings inside the loops, the
// `#pragma omp parallel` region opener, and various closing braces).
// Comments below describe only what the visible lines establish.
46 void hnsw_add_vertices(IndexBinaryHNSW& index_hnsw,
48 size_t n,
const uint8_t *x,
50 bool preset_levels =
false) {
51 HNSW& hnsw = index_hnsw.hnsw;
// Total number of vertices after this call (n0 = pre-existing count).
52 size_t ntotal = n0 + n;
55 printf(
"hnsw_add_vertices: adding %ld elements on top of %ld "
56 "(preset_levels=%d)\n",
57 n, n0,
int(preset_levels));
// Allocate (or reuse, if preset) the per-point level table; returns the
// maximum level over the new points.
60 int max_level = hnsw.prepare_level_tab(n, preset_levels);
63 printf(
" max_level = %d\n", max_level);
// One OpenMP lock per vertex, so concurrent insertions can lock the
// neighbor lists they touch.
66 std::vector<omp_lock_t> locks(ntotal);
67 for(
int i = 0; i < ntotal; i++) {
68 omp_init_lock(&locks[i]);
// Histogram of points per level, and a permutation of point ids ordered
// by decreasing level (bucket sort via hist/offsets/order below).
72 std::vector<int> hist;
73 std::vector<int> order(n);
// Pass 1: build the per-level histogram. (The loop body that grows
// `hist` and the definition of `pt_id` are not visible in this extract.)
78 for (
int i = 0; i < n; i++) {
80 int pt_level = hnsw.levels[pt_id] - 1;
81 while (pt_level >= hist.size()) {
// Exclusive prefix sum of the histogram -> bucket start offsets.
88 std::vector<int> offsets(hist.size() + 1, 0);
89 for (
int i = 0; i < hist.size() - 1; i++) {
90 offsets[i + 1] = offsets[i] + hist[i];
// Pass 2: scatter point ids into `order`, bucketed by level.
94 for (
int i = 0; i < n; i++) {
96 int pt_level = hnsw.levels[pt_id] - 1;
97 order[offsets[pt_level]++] = pt_id;
// Fixed-seed RNG so the within-level shuffle is deterministic.
102 RandomGenerator rng2(789);
// Insert level by level, from the highest level down to level 0.
// `i0..i1` delimit the slice of `order` holding this level's points
// (the initialization/update of `i1` is not visible here).
106 for (
int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) {
107 int i0 = i1 - hist[pt_level];
110 printf(
"Adding %d elements at level %d\n",
// Random shuffle within the level (Fisher-Yates over order[i0..i1)).
115 for (
int j = i0; j < i1; j++) {
116 std::swap(order[j], order[j + rng2.rand_int(i1 - j)]);
// Per-thread scratch: visited-table and distance computer.
121 VisitedTable vt (ntotal);
123 std::unique_ptr<DistanceComputer> dis(
124 index_hnsw.get_distance_computer()
// Only thread 0 reports progress, and only when verbose.
126 int prev_display = verbose && omp_get_thread_num() == 0 ? 0 : -1;
128 #pragma omp for schedule(dynamic)
129 for (
int i = i0; i < i1; i++) {
// The query is the point's own code; `x` holds codes for ids >= n0,
// hence the `pt_id - n0` offset. The float* cast matches the
// DistanceComputer::set_query interface (bits, not floats).
131 dis->set_query((
float *)(x + (pt_id - n0) * index_hnsw.code_size));
133 hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt);
// Progress display every 10000 insertions.
135 if (prev_display >= 0 && i - i0 > prev_display + 10000) {
136 prev_display = i - i0;
137 printf(
" %d / %d\r", i - i0, i1 - i0);
// After the level loop all slices must have been consumed.
144 FAISS_ASSERT(i1 == 0);
// Release the per-vertex locks.
150 for(
int i = 0; i < ntotal; i++)
151 omp_destroy_lock(&locks[i]);
// Default constructor (initializer list and body not visible in this extract).
162 IndexBinaryHNSW::IndexBinaryHNSW()
// Construct with dimension `d` (bits per vector) and `M` — presumably the
// HNSW connectivity parameter; the `hnsw(M)` initializer is not visible here.
// Owns a freshly allocated IndexBinaryFlat as the code storage.
167 IndexBinaryHNSW::IndexBinaryHNSW(
int d,
int M)
171 storage(new IndexBinaryFlat(d))
// Construct on top of an existing storage index; the dimension is taken from
// the storage. (Remaining initializers and ownership semantics of `storage`
// are not visible in this extract — TODO confirm whether `own_fields` is set.)
176 IndexBinaryHNSW::IndexBinaryHNSW(IndexBinary *storage,
int M)
177 : IndexBinary(storage->d),
// Destructor (body not visible; presumably deletes `storage` when owned).
185 IndexBinaryHNSW::~IndexBinaryHNSW() {
// Delegates training to the underlying storage index (the enclosing
// `train(idx_t n, const uint8_t* x)` signature is not visible here).
194 storage->
train(n, x);
// k-NN search. The int32 `distances` buffer is reinterpreted as float while
// the HNSW traversal runs (the DistanceComputer produces floats), then a
// final parallel pass rounds the floats back into the int32 Hamming
// distances callers expect.
//
// NOTE(review): partial extraction — the signature opener, the surrounding
// `#pragma omp parallel` region, and the `VisitedTable vt` declaration are
// not visible in this view.
199 int32_t *distances,
idx_t *labels)
const
// Per-thread distance computer over the flat binary codes.
204 std::unique_ptr<DistanceComputer> dis(get_distance_computer());
207 for(
idx_t i = 0; i < n; i++) {
// Result slots for query i: k labels and k distances (viewed as float).
208 idx_t *idxi = labels + i * k;
209 float *simi = (
float *)(distances + i * k);
// The query is query i's binary code; float* cast matches the
// DistanceComputer::set_query interface (bits, not floats).
211 dis->set_query((
float *)(x + i *
code_size));
// Max-heap over the k result slots: init, fill via HNSW search,
// then sort into increasing-distance order.
213 maxheap_heapify(k, simi, idxi);
214 hnsw.search(*dis, k, idxi, simi, vt);
215 maxheap_reorder(k, simi, idxi);
// Convert the float distances back to int32 in place.
219 #pragma omp parallel for
220 for (
int i = 0; i < n * k; ++i) {
221 distances[i] = std::round(((
float *)distances)[i]);
// Inserts the new vectors into the graph via hnsw_add_vertices; levels are
// considered preset when a level entry already exists for every vector
// (hnsw.levels.size() == ntotal). (The enclosing `add` signature and the
// storage->add call are not visible in this extract.)
233 hnsw_add_vertices(*
this, n0, n, x, verbose,
234 hnsw.levels.size() ==
ntotal);
// DistanceComputer over an IndexBinaryFlat's codes, templated on the
// HammingComputer used for query-to-database distances. Non-owning: `b`
// borrows storage.xb, so the storage must outlive this object.
//
// NOTE(review): partial extraction — the struct header, the member
// declarations (code_size, b, ndis, hc), and several closing braces are not
// visible in this view.
253 template<
class HammingComputer>
// Distance from the current query (held in `hc`) to database vector i.
260 float operator () (idx_t i)
override {
262 return hc.hamming(b + i * code_size);
// Distance between two database vectors; always uses the generic
// HammingComputerDefault since neither side is the prepared query.
265 float symmetric_dis(idx_t i, idx_t j)
override {
266 return HammingComputerDefault(b + j * code_size, code_size)
267 .hamming(b + i * code_size);
271 explicit FlatHammingDis(
const IndexBinaryFlat& storage)
272 : code_size(storage.code_size),
273 b(storage.xb.data()),
// The float* is a reinterpreted binary code (see the set_query call
// sites), not actual floats.
279 void set_query(
const float *x)
override {
280 hc.set((uint8_t *)x, code_size);
// On destruction, fold this computer's distance count into the global
// HNSW statistics.
283 ~FlatHammingDis()
override {
286 hnsw_stats.ndis += ndis;
// Factory for a FlatHammingDis specialized to this index's code size.
// Requires the storage to be an IndexBinaryFlat. Caller owns the returned
// pointer.
//
// NOTE(review): partial extraction — the switch/if dispatch on `code_size`
// that selects among the specializations below (4/8/16/20/32/64 bytes,
// multiples of 8, multiples of 4, generic fallback) is not visible here;
// only the return statements survived the extraction.
295 DistanceComputer *IndexBinaryHNSW::get_distance_computer()
const {
296 IndexBinaryFlat *flat_storage =
dynamic_cast<IndexBinaryFlat *
>(storage);
// Only flat storage is supported: the computer borrows xb directly.
298 FAISS_ASSERT(flat_storage !=
nullptr);
302 return new FlatHammingDis<HammingComputer4>(*flat_storage);
304 return new FlatHammingDis<HammingComputer8>(*flat_storage);
306 return new FlatHammingDis<HammingComputer16>(*flat_storage);
308 return new FlatHammingDis<HammingComputer20>(*flat_storage);
310 return new FlatHammingDis<HammingComputer32>(*flat_storage);
312 return new FlatHammingDis<HammingComputer64>(*flat_storage);
315 return new FlatHammingDis<HammingComputerM8>(*flat_storage);
317 return new FlatHammingDis<HammingComputerM4>(*flat_storage);
// Generic fallback for arbitrary code sizes.
321 return new FlatHammingDis<HammingComputerDefault>(*flat_storage);
virtual void reset()=0
Removes all elements from the database.
bool is_trained
set if the Index does not require training, or if training is done already
virtual void train(idx_t n, const uint8_t *x)
int code_size
number of bytes per vector ( = d / 8 )
void add(idx_t n, const uint8_t *x) override
Index::idx_t idx_t
all indices are this type
void reconstruct(idx_t key, uint8_t *recons) const override
set implementation optimized for fast access.
virtual void reconstruct(idx_t key, uint8_t *recons) const
double getmillisecs()
ms elapsed since some arbitrary epoch
void train(idx_t n, const uint8_t *x) override
Trains the storage if needed.
idx_t ntotal
total nb of indexed vectors
virtual void add(idx_t n, const uint8_t *x)=0
void search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels) const override
entry point for search
int storage_idx_t
internal storage of vectors (32 bits: this is expensive)
void reset() override
Removes all elements from the database.