diff --git a/Clustering.h b/Clustering.h index 4215ed336..46410af79 100644 --- a/Clustering.h +++ b/Clustering.h @@ -34,7 +34,7 @@ struct ClusteringParameters { int seed; ///< seed for the random number generator - size_t decode_block_size; /// < how many vectors at a time to decode + size_t decode_block_size; ///< how many vectors at a time to decode /// sets reasonable defaults ClusteringParameters (); @@ -42,11 +42,11 @@ struct ClusteringParameters { struct ClusteringIterationStats { - float obj; /// objective values (sum of distances reported by index) - double time; /// seconds for iteration - double time_search; /// seconds for just search - double imbalance_factor; /// imbalance factor of iteration - int nsplit; /// number of cluster splits + float obj; ///< objective values (sum of distances reported by index) + double time; ///< seconds for iteration + double time_search; ///< seconds for just search + double imbalance_factor; ///< imbalance factor of iteration + int nsplit; ///< number of cluster splits }; diff --git a/Index.h b/Index.h index efd5fecee..6d413adff 100644 --- a/Index.h +++ b/Index.h @@ -18,7 +18,7 @@ #define FAISS_VERSION_MAJOR 1 #define FAISS_VERSION_MINOR 6 -#define FAISS_VERSION_PATCH 2 +#define FAISS_VERSION_PATCH 3 /** * @namespace faiss @@ -44,12 +44,10 @@ struct IDSelector; struct RangeSearchResult; struct DistanceComputer; -/** Abstract structure for an index +/** Abstract structure for an index, supports adding vectors and searching them. * - * Supports adding vertices and searching them. - * - * Currently only asymmetric queries are supported: - * database-to-database queries are not implemented. + * All vectors provided at add or search time are 32-bit float arrays, + * although the internal representation may vary. */ struct Index { using idx_t = int64_t; ///< all indices are this type diff --git a/IndexBinary.h b/IndexBinary.h index 88042002e..e1a16f79a 100644 --- a/IndexBinary.h +++ b/IndexBinary.h @@ -99,9 +99,13 @@ struct IndexBinary { /** Query n vectors of dimension d to the index. * - * return all vectors with distance < radius. Note that many - * indexes do not implement the range_search (only the k-NN search - * is mandatory). + * return all vectors with distance < radius. Note that many indexes + * do not implement the range_search (only the k-NN search is + * mandatory). The distances are converted to float to reuse the + * RangeSearchResult structure, but they are integer. By convention, + * only distances < radius (strict comparison) are returned, + * ie. radius = 0 does not return any result and 1 returns only + * exact same vectors. * * @param x input vectors to search, size n * d / 8 * @param radius search radius diff --git a/IndexBinaryFlat.cpp b/IndexBinaryFlat.cpp index a3de92d44..d7d8b4dd8 100644 --- a/IndexBinaryFlat.cpp +++ b/IndexBinaryFlat.cpp @@ -79,5 +79,10 @@ void IndexBinaryFlat::reconstruct(idx_t key, uint8_t *recons) const { memcpy(recons, &(xb[code_size * key]), sizeof(*recons) * code_size); } +void IndexBinaryFlat::range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result) const +{ + hamming_range_search (x, xb.data(), n, ntotal, radius, code_size, result); +} } // namespace faiss diff --git a/IndexBinaryFlat.h b/IndexBinaryFlat.h index 6f24aac5b..17d016498 100644 --- a/IndexBinaryFlat.h +++ b/IndexBinaryFlat.h @@ -38,6 +38,9 @@ struct IndexBinaryFlat : IndexBinary { void search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels) const override; + void range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result) const override; + void reconstruct(idx_t key, uint8_t *recons) const override; /** Remove some ids. Note that because of the indexing structure, diff --git a/IndexBinaryHash.cpp b/IndexBinaryHash.cpp new file mode 100644 index 000000000..4f9ccdbae --- /dev/null +++ b/IndexBinaryHash.cpp @@ -0,0 +1,492 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved +// -*- c++ -*- + +#include + +#include +#include + +#include +#include + +#include +#include + + +namespace faiss { + +void IndexBinaryHash::InvertedList::add ( + idx_t id, size_t code_size, const uint8_t *code) +{ + ids.push_back(id); + vecs.insert(vecs.end(), code, code + code_size); +} + +IndexBinaryHash::IndexBinaryHash(int d, int b): + IndexBinary(d), b(b), nflip(0) +{ + is_trained = true; +} + +IndexBinaryHash::IndexBinaryHash(): b(0), nflip(0) +{ + is_trained = true; +} + +void IndexBinaryHash::reset() +{ + invlists.clear(); + ntotal = 0; +} + + +void IndexBinaryHash::add(idx_t n, const uint8_t *x) +{ + add_with_ids(n, x, nullptr); +} + +void IndexBinaryHash::add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) +{ + uint64_t mask = ((uint64_t)1 << b) - 1; + // simplistic add function. Cannot really be parallelized. + + for (idx_t i = 0; i < n; i++) { + idx_t id = xids ? xids[i] : ntotal + i; + const uint8_t * xi = x + i * code_size; + idx_t hash = *((uint64_t*)xi) & mask; + invlists[hash].add(id, code_size, xi); + } + ntotal += n; +} + +namespace { + + +/** Enumerate all bit vectors of size nbit with up to maxflip 1s + * test in P127257851 P127258235 + */ +struct FlipEnumerator { + int nbit, nflip, maxflip; + uint64_t mask, x; + + FlipEnumerator (int nbit, int maxflip): nbit(nbit), maxflip(maxflip) { + nflip = 0; + mask = 0; + x = 0; + } + + bool next() { + if (x == mask) { + if (nflip == maxflip) { + return false; + } + // increase Hamming radius + nflip++; + mask = (((uint64_t)1 << nflip) - 1); + x = mask << (nbit - nflip); + return true; + } + + int i = __builtin_ctzll(x); + + if (i > 0) { + x ^= (uint64_t)3 << (i - 1); + } else { + // nb of LSB 1s + int n1 = __builtin_ctzll(~x); + // clear them + x &= ((uint64_t)(-1) << n1); + int n2 = __builtin_ctzll(x); + x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1); + } + return true; + } + +}; + +using idx_t = Index::idx_t; + + +struct RangeSearchResults { + int radius; + RangeQueryResult &qres; + + inline void add (float dis, idx_t id) { + if (dis < radius) { + qres.add (dis, id); + } + } + +}; + +struct KnnSearchResults { + // heap params + idx_t k; + int32_t * heap_sim; + idx_t * heap_ids; + + using C = CMax; + + inline void add (float dis, idx_t id) { + if (dis < heap_sim[0]) { + heap_pop (k, heap_sim, heap_ids); + heap_push (k, heap_sim, heap_ids, dis, id); + } + } + +}; + +template +void +search_single_query_template(const IndexBinaryHash & index, const uint8_t *q, + SearchResults &res, + size_t &n0, size_t &nlist, size_t &ndis) +{ + size_t code_size = index.code_size; + uint64_t mask = ((uint64_t)1 << index.b) - 1; + uint64_t qhash = *((uint64_t*)q) & mask; + HammingComputer hc (q, code_size); + FlipEnumerator fe(index.b, index.nflip); + + // loop over neighbors that are at most at nflip bits + do { + uint64_t hash = qhash ^ fe.x; + auto it = index.invlists.find (hash); + + if (it == index.invlists.end()) { + continue; + } + + const IndexBinaryHash::InvertedList &il = it->second; + + size_t nv = il.ids.size(); + + if (nv == 0) { + n0++; + } else { + const uint8_t *codes = il.vecs.data(); + for (size_t i = 0; i < nv; i++) { + int dis = hc.hamming (codes); + res.add(dis, il.ids[i]); + codes += code_size; + } + ndis += nv; + nlist++; + } + } while(fe.next()); +} + +template +void +search_single_query(const IndexBinaryHash & index, const uint8_t *q, + SearchResults &res, + size_t &n0, size_t &nlist, size_t &ndis) +{ +#define HC(name) search_single_query_template(index, q, res, n0, nlist, ndis); + switch(index.code_size) { + case 4: HC(HammingComputer4); break; + case 8: HC(HammingComputer8); break; + case 16: HC(HammingComputer16); break; + case 20: HC(HammingComputer20); break; + case 32: HC(HammingComputer32); break; + default: + if (index.code_size % 8 == 0) { + HC(HammingComputerM8); + } else { + HC(HammingComputerDefault); + } + } +#undef HC +} + + +} // anonymous namespace + + + +void IndexBinaryHash::range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result) const +{ + + size_t nlist = 0, ndis = 0, n0 = 0; + +#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist) + { + RangeSearchPartialResult pres (result); + +#pragma omp for + for (size_t i = 0; i < n; i++) { // loop queries + RangeQueryResult & qres = pres.new_result (i); + RangeSearchResults res = {radius, qres}; + const uint8_t *q = x + i * code_size; + + search_single_query (*this, q, res, n0, nlist, ndis); + + } + pres.finalize (); + } + indexBinaryHash_stats.nq += n; + indexBinaryHash_stats.n0 += n0; + indexBinaryHash_stats.nlist += nlist; + indexBinaryHash_stats.ndis += ndis; +} + +void IndexBinaryHash::search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const +{ + + using HeapForL2 = CMax; + size_t nlist = 0, ndis = 0, n0 = 0; + +#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0) + for (size_t i = 0; i < n; i++) { + int32_t * simi = distances + k * i; + idx_t * idxi = labels + k * i; + + heap_heapify (k, simi, idxi); + KnnSearchResults res = {k, simi, idxi}; + const uint8_t *q = x + i * code_size; + + search_single_query (*this, q, res, n0, nlist, ndis); + + } + indexBinaryHash_stats.nq += n; + indexBinaryHash_stats.n0 += n0; + indexBinaryHash_stats.nlist += nlist; + indexBinaryHash_stats.ndis += ndis; +} + +size_t IndexBinaryHash::hashtable_size() const +{ + return invlists.size(); +} + + +void IndexBinaryHash::display() const +{ + for (auto it = invlists.begin(); it != invlists.end(); ++it) { + printf("%ld: [", it->first); + const std::vector & v = it->second.ids; + for (auto x: v) { + printf("%ld ", 0 + x); + } + printf("]\n"); + + } +} + + +void IndexBinaryHashStats::reset() +{ + memset ((void*)this, 0, sizeof (*this)); +} + +IndexBinaryHashStats indexBinaryHash_stats; + +/******************************************************* + * IndexBinaryMultiHash implementation + ******************************************************/ + + +IndexBinaryMultiHash::IndexBinaryMultiHash(int d, int nhash, int b): + IndexBinary(d), + storage(new IndexBinaryFlat(d)), own_fields(true), + maps(nhash), nhash(nhash), b(b), nflip(0) +{ + FAISS_THROW_IF_NOT(nhash * b <= d); +} + +IndexBinaryMultiHash::IndexBinaryMultiHash(): + storage(nullptr), own_fields(true), + nhash(0), b(0), nflip(0) +{} + +IndexBinaryMultiHash::~IndexBinaryMultiHash() +{ + if (own_fields) { + delete storage; + } +} + + +void IndexBinaryMultiHash::reset() +{ + storage->reset(); + ntotal = 0; + for(auto map: maps) { + map.clear(); + } +} + +void IndexBinaryMultiHash::add(idx_t n, const uint8_t *x) +{ + storage->add(n, x); + // populate maps + uint64_t mask = ((uint64_t)1 << b) - 1; + + for(idx_t i = 0; i < n; i++) { + const uint8_t *xi = x + i * code_size; + int ho = 0; + for(int h = 0; h < nhash; h++) { + uint64_t hash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7); + hash &= mask; + maps[h][hash].push_back(i + ntotal); + ho += b; + } + } + ntotal += n; +} + + +namespace { + +template +static +void verify_shortlist( + const IndexBinaryFlat & index, + const uint8_t * q, + const std::unordered_set & shortlist, + SearchResults &res) +{ + size_t code_size = index.code_size; + size_t nlist = 0, ndis = 0, n0 = 0; + + HammingComputer hc (q, code_size); + const uint8_t *codes = index.xb.data(); + + for (auto i: shortlist) { + int dis = hc.hamming (codes + i * code_size); + res.add(dis, i); + } +} + +template +void +search_1_query_multihash(const IndexBinaryMultiHash & index, const uint8_t *xi, + SearchResults &res, + size_t &n0, size_t &nlist, size_t &ndis) +{ + + std::unordered_set shortlist; + int b = index.b; + uint64_t mask = ((uint64_t)1 << b) - 1; + + int ho = 0; + for(int h = 0; h < index.nhash; h++) { + uint64_t qhash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7); + qhash &= mask; + const IndexBinaryMultiHash::Map & map = index.maps[h]; + + FlipEnumerator fe(index.b, index.nflip); + // loop over neighbors that are at most at nflip bits + do { + uint64_t hash = qhash ^ fe.x; + auto it = map.find (hash); + + if (it != map.end()) { + const std::vector & v = it->second; + for (auto i: v) { + shortlist.insert(i); + } + nlist++; + } else { + n0++; + } + } while(fe.next()); + + ho += b; + } + ndis += shortlist.size(); + + // verify shortlist + +#define HC(name) verify_shortlist (*index.storage, xi, shortlist, res) + switch(index.code_size) { + case 4: HC(HammingComputer4); break; + case 8: HC(HammingComputer8); break; + case 16: HC(HammingComputer16); break; + case 20: HC(HammingComputer20); break; + case 32: HC(HammingComputer32); break; + default: + if (index.code_size % 8 == 0) { + HC(HammingComputerM8); + } else { + HC(HammingComputerDefault); + } + } +#undef HC +} + +} // anonymous namespace + +void IndexBinaryMultiHash::range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result) const +{ + + size_t nlist = 0, ndis = 0, n0 = 0; + +#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist) + { + RangeSearchPartialResult pres (result); + +#pragma omp for + for (size_t i = 0; i < n; i++) { // loop queries + RangeQueryResult & qres = pres.new_result (i); + RangeSearchResults res = {radius, qres}; + const uint8_t *q = x + i * code_size; + + search_1_query_multihash (*this, q, res, n0, nlist, ndis); + + } + pres.finalize (); + } + indexBinaryHash_stats.nq += n; + indexBinaryHash_stats.n0 += n0; + indexBinaryHash_stats.nlist += nlist; + indexBinaryHash_stats.ndis += ndis; +} + +void IndexBinaryMultiHash::search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const +{ + + using HeapForL2 = CMax; + size_t nlist = 0, ndis = 0, n0 = 0; + +#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0) + for (size_t i = 0; i < n; i++) { + int32_t * simi = distances + k * i; + idx_t * idxi = labels + k * i; + + heap_heapify (k, simi, idxi); + KnnSearchResults res = {k, simi, idxi}; + const uint8_t *q = x + i * code_size; + + search_1_query_multihash (*this, q, res, n0, nlist, ndis); + + } + indexBinaryHash_stats.nq += n; + indexBinaryHash_stats.n0 += n0; + indexBinaryHash_stats.nlist += nlist; + indexBinaryHash_stats.ndis += ndis; +} + +size_t IndexBinaryMultiHash::hashtable_size() const +{ + size_t tot = 0; + for (auto map: maps) { + tot += map.size(); + } + + return tot; +} + + +} diff --git a/IndexBinaryHash.h b/IndexBinaryHash.h new file mode 100644 index 000000000..6e8e82467 --- /dev/null +++ b/IndexBinaryHash.h @@ -0,0 +1,116 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_BINARY_HASH_H +#define FAISS_BINARY_HASH_H + + + +#include +#include + +#include +#include +#include + + +namespace faiss { + +struct RangeSearchResult; + + +/** just uses the b first bits as a hash value */ +struct IndexBinaryHash : IndexBinary { + + struct InvertedList { + std::vector ids; + std::vector vecs; + + void add (idx_t id, size_t code_size, const uint8_t *code); + }; + + using InvertedListMap = std::unordered_map; + InvertedListMap invlists; + + int b, nflip; + + IndexBinaryHash(int d, int b); + + IndexBinaryHash(); + + void reset() override; + + void add(idx_t n, const uint8_t *x) override; + + void add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) override; + + void range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result) const override; + + void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const override; + + void display() const; + size_t hashtable_size() const; + +}; + +struct IndexBinaryHashStats { + size_t nq; // nb of queries run + size_t n0; // nb of empty lists + size_t nlist; // nb of non-empty inverted lists scanned + size_t ndis; // nb of distancs computed + + IndexBinaryHashStats () {reset (); } + void reset (); +}; + +extern IndexBinaryHashStats indexBinaryHash_stats; + + +/** just uses the b first bits as a hash value */ +struct IndexBinaryMultiHash: IndexBinary { + + // where the vectors are actually stored + IndexBinaryFlat *storage; + bool own_fields; + + // maps hash values to the ids that hash to them + using Map = std::unordered_map >; + + // the different hashes, size nhash + std::vector maps; + + int nhash; ///< nb of hash maps + int b; ///< nb bits per hash map + int nflip; ///< nb bit flips to use at search time + + IndexBinaryMultiHash(int d, int nhash, int b); + + IndexBinaryMultiHash(); + + ~IndexBinaryMultiHash(); + + void reset() override; + + void add(idx_t n, const uint8_t *x) override; + + void range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result) const override; + + void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const override; + + size_t hashtable_size() const; + +}; + +} + +#endif diff --git a/IndexBinaryIVF.cpp b/IndexBinaryIVF.cpp index 735598393..524fc60f6 100644 --- a/IndexBinaryIVF.cpp +++ b/IndexBinaryIVF.cpp @@ -11,11 +11,13 @@ #include #include +#include + #include + #include #include - #include #include #include @@ -281,13 +283,15 @@ namespace { using idx_t = Index::idx_t; -template +template struct IVFBinaryScannerL2: BinaryInvertedListScanner { HammingComputer hc; size_t code_size; + bool store_pairs; - IVFBinaryScannerL2 (size_t code_size): code_size (code_size) + IVFBinaryScannerL2 (size_t code_size, bool store_pairs): + code_size (code_size), store_pairs(store_pairs) {} void set_query (const uint8_t *query_vector) override { @@ -316,7 +320,7 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner { uint32_t dis = hc.hamming (codes); if (dis < simi[0]) { heap_pop (k, simi, idxi); - idx_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + idx_t id = store_pairs ? lo_build(list_no, j) : ids[j]; heap_push (k, simi, idxi, dis, id); nup++; } @@ -325,6 +329,24 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner { return nup; } + void scan_codes_range (size_t n, + const uint8_t *codes, + const idx_t *ids, + int radius, + RangeQueryResult &result) const + { + size_t nup = 0; + for (size_t j = 0; j < n; j++) { + uint32_t dis = hc.hamming (codes); + if (dis < radius) { + int64_t id = store_pairs ? lo_build (list_no, j) : ids[j]; + result.add (dis, id); + } + codes += code_size; + } + + } + }; @@ -332,29 +354,6 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner { template BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) { - switch (code_size) { -#define HANDLE_CS(cs) \ - case cs: \ - return new IVFBinaryScannerL2 (cs); - HANDLE_CS(4); - HANDLE_CS(8); - HANDLE_CS(16); - HANDLE_CS(20); - HANDLE_CS(32); - HANDLE_CS(64); -#undef HANDLE_CS - default: - if (code_size % 8 == 0) { - return new IVFBinaryScannerL2 (code_size); - } else if (code_size % 4 == 0) { - return new IVFBinaryScannerL2 (code_size); - } else { - return new IVFBinaryScannerL2 (code_size); - } - } } @@ -425,8 +424,10 @@ void search_knn_hamming_heap(const IndexBinaryIVF& ivf, ids = sids->get(); } - nheap += scanner->scan_codes (list_size, scodes.get(), - ids, simi, idxi, k); + nheap += scanner->scan_codes ( + list_size, scodes.get(), + ids, simi, idxi, k + ); nscan += list_size; if (max_codes && nscan >= max_codes) @@ -586,11 +587,26 @@ void search_knn_hamming_count_1 ( BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner (bool store_pairs) const { - if (store_pairs) { - return select_IVFBinaryScannerL2 (code_size); - } else { - return select_IVFBinaryScannerL2 (code_size); + +#define HC(name) return new IVFBinaryScannerL2 (code_size, store_pairs) + switch (code_size) { + case 4: HC(HammingComputer4); + case 8: HC(HammingComputer8); + case 16: HC(HammingComputer16); + case 20: HC(HammingComputer20); + case 32: HC(HammingComputer32); + case 64: HC(HammingComputer64); + default: + if (code_size % 8 == 0) { + HC(HammingComputerM8); + } else if (code_size % 4 == 0) { + HC(HammingComputerM4); + } else { + HC(HammingComputerDefault); + } } +#undef HC + } void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k, @@ -616,6 +632,84 @@ void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k, } } + +void IndexBinaryIVF::range_search( + idx_t n, const uint8_t *x, int radius, + RangeSearchResult *res) const +{ + + std::unique_ptr idx(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new int32_t[n * nprobe]); + + double t0 = getmillisecs(); + quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); + indexIVF_stats.quantization_time += getmillisecs() - t0; + + t0 = getmillisecs(); + invlists->prefetch_lists(idx.get(), n * nprobe); + + bool store_pairs = false; + size_t nlistv = 0, ndis = 0; + + std::vector all_pres (omp_get_max_threads()); + +#pragma omp parallel reduction(+: nlistv, ndis) + { + RangeSearchPartialResult pres(res); + std::unique_ptr scanner + (get_InvertedListScanner(store_pairs)); + FAISS_THROW_IF_NOT (scanner.get ()); + + all_pres[omp_get_thread_num()] = &pres; + + auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult &qres) + { + + idx_t key = idx[i * nprobe + ik]; /* select the list */ + if (key < 0) return; + FAISS_THROW_IF_NOT_FMT ( + key < (idx_t) nlist, + "Invalid key=%ld at ik=%ld nlist=%ld\n", + key, ik, nlist); + const size_t list_size = invlists->list_size(key); + + if (list_size == 0) return; + + InvertedLists::ScopedCodes scodes (invlists, key); + InvertedLists::ScopedIds ids (invlists, key); + + scanner->set_list (key, coarse_dis[i * nprobe + ik]); + nlistv++; + ndis += list_size; + scanner->scan_codes_range (list_size, scodes.get(), + ids.get(), radius, qres); + }; + +#pragma omp for + for (size_t i = 0; i < n; i++) { + scanner->set_query (x + i * code_size); + + RangeQueryResult & qres = pres.new_result (i); + + for (size_t ik = 0; ik < nprobe; ik++) { + scan_list_func (i, ik, qres); + } + + } + + pres.finalize(); + + } + indexIVF_stats.nq += n; + indexIVF_stats.nlist += nlistv; + indexIVF_stats.ndis += ndis; + indexIVF_stats.search_time += getmillisecs() - t0; + +} + + + + IndexBinaryIVF::~IndexBinaryIVF() { if (own_invlists) { delete invlists; diff --git a/IndexBinaryIVF.h b/IndexBinaryIVF.h index 40635ddbf..18918f61f 100644 --- a/IndexBinaryIVF.h +++ b/IndexBinaryIVF.h @@ -109,8 +109,11 @@ struct IndexBinaryIVF : IndexBinary { bool store_pairs=false) const; /** assign the vectors, then call search_preassign */ - virtual void search(idx_t n, const uint8_t *x, idx_t k, - int32_t *distances, idx_t *labels) const override; + void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels) const override; + + void range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result) const override; void reconstruct(idx_t key, uint8_t *recons) const override; @@ -202,6 +205,12 @@ struct BinaryInvertedListScanner { int32_t *distances, idx_t *labels, size_t k) const = 0; + virtual void scan_codes_range (size_t n, + const uint8_t *codes, + const idx_t *ids, + int radius, + RangeQueryResult &result) const = 0; + virtual ~BinaryInvertedListScanner () {} }; diff --git a/IndexFlat.h b/IndexFlat.h index 7b1345121..ef928de09 100644 --- a/IndexFlat.h +++ b/IndexFlat.h @@ -19,6 +19,7 @@ namespace faiss { /** Index that stores the full vectors and performs exhaustive search */ struct IndexFlat: Index { + /// database vectors, size ntotal * d std::vector xb; @@ -144,7 +145,7 @@ struct IndexRefineFlat: Index { }; -/// optimized version for 1D "vectors" +/// optimized version for 1D "vectors". struct IndexFlat1D:IndexFlatL2 { bool continuous_update; ///< is the permutation updated continuously? diff --git a/IndexIVF.cpp b/IndexIVF.cpp index 02c79dd3a..21cfff314 100644 --- a/IndexIVF.cpp +++ b/IndexIVF.cpp @@ -612,7 +612,6 @@ InvertedListScanner *IndexIVF::get_InvertedListScanner ( void IndexIVF::reconstruct (idx_t key, float* recons) const { idx_t lo = direct_map.get (key); - reconstruct_from_offset (lo_listno(lo), lo_offset(lo), recons); } diff --git a/IndexIVFPQ.h b/IndexIVFPQ.h index 0a6adcb3d..4ca04e9ef 100644 --- a/IndexIVFPQ.h +++ b/IndexIVFPQ.h @@ -42,14 +42,14 @@ struct IndexIVFPQ: IndexIVF { int polysemous_ht; ///< Hamming thresh for polysemous filtering /** Precompute table that speed up query preprocessing at some - * memory cost + * memory cost (used only for by_residual with L2 metric) * =-1: force disable * =0: decide heuristically (default: use tables only if they are * < precomputed_tables_max_bytes) * =1: tables that work for all quantizers (size 256 * nlist * M) * =2: specific version for MultiIndexQuantizer (much more compact) */ - int use_precomputed_table; ///< if by_residual, build precompute tables + int use_precomputed_table; static size_t precomputed_table_max_bytes; /// if use_precompute_table @@ -93,9 +93,9 @@ struct IndexIVFPQ: IndexIVF { * the duplicates are returned in pre-allocated arrays (see the * max sizes). * - * @params lims limits between groups of duplicates + * @param lims limits between groups of duplicates * (max size ntotal / 2 + 1) - * @params ids ids[lims[i]] : ids[lims[i+1]-1] is a group of + * @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of * duplicates (max size ntotal) * @return n number of groups found */ @@ -135,15 +135,14 @@ struct IndexIVFPQ: IndexIVF { /// statistics are robust to internal threading, but not if /// IndexIVFPQ::search_preassigned is called by multiple threads struct IndexIVFPQStats { - size_t nrefine; // nb of refines (IVFPQR) + size_t nrefine; ///< nb of refines (IVFPQR) size_t n_hamming_pass; - // nb of passed Hamming distance tests (for polysemous) + ///< nb of passed Hamming distance tests (for polysemous) - // timings measured with the CPU RTC - // on all threads + // timings measured with the CPU RTC on all threads size_t search_cycles; - size_t refine_cycles; // only for IVFPQR + size_t refine_cycles; ///< only for IVFPQR IndexIVFPQStats () {reset (); } void reset (); diff --git a/demos/demo_weighted_kmeans.cpp b/demos/demo_weighted_kmeans.cpp new file mode 100644 index 000000000..eee188e4b --- /dev/null +++ b/demos/demo_weighted_kmeans.cpp @@ -0,0 +1,185 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include +#include +#include +#include + + +namespace { + + +enum WeightedKMeansType { + WKMT_FlatL2, + WKMT_FlatIP, + WKMT_FlatIP_spherical, + WKMT_HNSW, +}; + + +float weighted_kmeans_clustering (size_t d, size_t n, size_t k, + const float *input, + const float *weights, + float *centroids, + WeightedKMeansType index_num) +{ + using namespace faiss; + Clustering clus (d, k); + clus.verbose = true; + + std::unique_ptr index; + + switch (index_num) { + case WKMT_FlatL2: + index.reset(new IndexFlatL2 (d)); + break; + case WKMT_FlatIP: + index.reset(new IndexFlatIP (d)); + break; + case WKMT_FlatIP_spherical: + index.reset(new IndexFlatIP (d)); + clus.spherical = true; + break; + case WKMT_HNSW: + IndexHNSWFlat *ihnsw = new IndexHNSWFlat (d, 32); + ihnsw->hnsw.efSearch = 128; + index.reset(ihnsw); + break; + } + + clus.train(n, input, *index.get(), weights); + // on output the index contains the centroids. + memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k); + return clus.iteration_stats.back().obj; +} + + +int d = 32; +float sigma = 0.1; + +#define BIGTEST + +#ifdef BIGTEST +// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs +int nc = 200000; +int n_big = 4; +int n_small = 2; +#else +int nc = 5; +int n_big = 100; +int n_small = 10; +#endif + +int n; // number of training points + +void generate_trainset (std::vector & ccent, + std::vector & x, + std::vector & weights) +{ + // same sampling as test_build_blocks.py test_weighted + + ccent.resize (d * 2 * nc); + faiss::float_randn (ccent.data(), d * 2 * nc, 123); + faiss::fvec_renorm_L2 (d, 2 * nc, ccent.data()); + n = nc * n_big + nc * n_small; + x.resize(d * n); + weights.resize(n); + faiss::float_randn (x.data(), x.size(), 1234); + + float *xi = x.data(); + float *w = weights.data(); + for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids + int np = ci < nc ? n_big : n_small; // nb of points around this centroid + for (int i = 0; i < np; i++) { + for (int j = 0; j < d; j++) { + xi[j] = xi[j] * sigma + ccent[ci * d + j]; + } + *w++ = ci < nc ? 0.1 : 10; + xi += d; + } + } +} + +} + + +int main(int argc, char **argv) { + std::vector ccent; + std::vector x; + std::vector weights; + + printf("generate training set\n"); + generate_trainset(ccent, x, weights); + + std::vector centroids; + centroids.resize(nc * d); + + int the_index_num = -1; + int the_with_weights = -1; + + if (argc == 3) { + the_index_num = atoi(argv[1]); + the_with_weights = atoi(argv[2]); + } + + + for (int index_num = WKMT_FlatL2; + index_num <= WKMT_HNSW; + index_num++) { + + if (the_index_num >= 0 && index_num != the_index_num) { + continue; + } + + for (int with_weights = 0; with_weights <= 1; with_weights++) { + if (the_with_weights >= 0 && with_weights != the_with_weights) { + continue; + } + + printf("=================== index_num=%d Run %s weights\n", + index_num, with_weights ? "with" : "without"); + + weighted_kmeans_clustering ( + d, n, nc, x.data(), + with_weights ? weights.data() : nullptr, + centroids.data(), (WeightedKMeansType)index_num + ); + + { // compute distance of points to centroids + faiss::IndexFlatL2 cent_index(d); + cent_index.add(nc, centroids.data()); + std::vector dis (n); + std::vector idx (n); + + cent_index.search (nc * 2, ccent.data(), 1, + dis.data(), idx.data()); + + float dis1 = 0, dis2 = 0; + for (int i = 0; i < nc ; i++) { + dis1 += dis[i]; + } + printf("average distance of points from big clusters: %g\n", + dis1 / nc); + + for (int i = 0; i < nc ; i++) { + dis2 += dis[i + nc]; + } + + printf("average distance of points from small clusters: %g\n", + dis2 / nc); + + } + + } + } + return 0; +} diff --git a/gpu/GpuIndexFlat.cu b/gpu/GpuIndexFlat.cu index 6ad556060..bbfabb275 100644 --- a/gpu/GpuIndexFlat.cu +++ b/gpu/GpuIndexFlat.cu @@ -29,8 +29,6 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources, config), config_(std::move(config)), data_(nullptr) { - verifySettings_(); - // Flat index doesn't need training this->is_trained = true; @@ -44,8 +42,6 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources, GpuIndex(resources, dims, metric, 0, config), config_(std::move(config)), data_(nullptr) { - verifySettings_(); - // Flat index doesn't need training this->is_trained = true; @@ -298,21 +294,6 @@ GpuIndexFlat::compute_residual_n(faiss::Index::idx_t n, fromDevice(residualDevice, residuals, stream); } -void -GpuIndexFlat::verifySettings_() const { - // If we want Hgemm, ensure that it is supported on this device - if (config_.useFloat16Accumulator) { - FAISS_THROW_IF_NOT_MSG(config_.useFloat16, - "useFloat16Accumulator can only be enabled " - "with useFloat16"); - - FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device), - "Device %d does not support Hgemm " - "(useFloat16Accumulator)", - config_.device); - } -} - // // GpuIndexFlatL2 // diff --git a/gpu/GpuIndexFlat.h b/gpu/GpuIndexFlat.h index 89491eb1d..782792674 100644 --- a/gpu/GpuIndexFlat.h +++ b/gpu/GpuIndexFlat.h @@ -25,17 +25,12 @@ struct FlatIndex; struct GpuIndexFlatConfig : public GpuIndexConfig { inline GpuIndexFlatConfig() : useFloat16(false), - useFloat16Accumulator(false), storeTransposed(false) { } /// Whether or not data is stored as float16 bool useFloat16; - /// This option is now deprecated and doesn't do anything. All accumulation of - /// float16 or float32 data is now done in float32. - bool useFloat16Accumulator; - /// Whether or not data is stored (transparently) in a transposed /// layout, enabling use of the NN GEMM call, which is ~10% faster. /// This will improve the speed of the flat index, but will @@ -123,10 +118,6 @@ class GpuIndexFlat : public GpuIndex { float* distances, faiss::Index::idx_t* labels) const override; - private: - /// Checks user settings for consistency - void verifySettings_() const; - protected: /// Our config object const GpuIndexFlatConfig config_; diff --git a/gpu/impl/FlatIndex.cu b/gpu/impl/FlatIndex.cu index eb58505eb..1e300795f 100644 --- a/gpu/impl/FlatIndex.cu +++ b/gpu/impl/FlatIndex.cu @@ -62,6 +62,22 @@ FlatIndex::reserve(size_t numVecs, cudaStream_t stream) { } } +template <> +Tensor& +FlatIndex::getVectorsRef() { + // Should not call this unless we are in float32 mode + FAISS_ASSERT(!useFloat16_); + return getVectorsFloat32Ref(); +} + +template <> +Tensor& +FlatIndex::getVectorsRef() { + // Should not call this unless we are in float16 mode + FAISS_ASSERT(useFloat16_); + return getVectorsFloat16Ref(); +} + Tensor& FlatIndex::getVectorsFloat32Ref() { // Should not call this unless we are in float32 mode diff --git a/gpu/impl/FlatIndex.cuh b/gpu/impl/FlatIndex.cuh index e48391210..3b8181868 100644 --- a/gpu/impl/FlatIndex.cuh +++ b/gpu/impl/FlatIndex.cuh @@ -26,16 +26,23 @@ class FlatIndex { bool storeTransposed, MemorySpace space); + /// Whether or not this flat index primarily stores data in float16 bool getUseFloat16() const; /// Returns the number of vectors we contain int getSize() const; + /// Returns the dimensionality of the vectors int getDim() const; /// Reserve storage that can contain at least this many vectors void reserve(size_t numVecs, cudaStream_t stream); + /// Returns the vectors based on the type desired; the FlatIndex must be of + /// the same type (float16 or float32) to not assert + template + Tensor& getVectorsRef(); + /// Returns a reference to our vectors currently in use Tensor& getVectorsFloat32Ref(); diff --git a/gpu/impl/IVFPQ.cu b/gpu/impl/IVFPQ.cu index 402bda0d9..ac9b47aec 100644 --- a/gpu/impl/IVFPQ.cu +++ b/gpu/impl/IVFPQ.cu @@ -123,8 +123,6 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0)); FAISS_ASSERT(vecs.getSize(1) == dim_); - FAISS_ASSERT(!quantizer_->getUseFloat16()); - auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); auto& mem = resources_->getMemoryManagerCurrentDevice(); auto stream = resources_->getDefaultStreamCurrentDevice(); @@ -155,7 +153,13 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, DeviceTensor residuals( mem, {vecs.getSize(0), vecs.getSize(1)}, stream); - runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream); + if (quantizer_->getUseFloat16()) { + auto& coarseCentroids = quantizer_->getVectorsFloat16Ref(); + runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream); + } else { + auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); + runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream); + } // Residuals are in the form // (vec x numSubQuantizer x dimPerSubQuantizer) @@ -437,8 +441,9 @@ IVFPQ::setPQCentroids_(float* data) { pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode); } +template void -IVFPQ::precomputeCodes_() { +IVFPQ::precomputeCodesT_() { FAISS_ASSERT(metric_ == MetricType::METRIC_L2); // @@ -449,8 +454,6 @@ IVFPQ::precomputeCodes_() { // Terms 1 and 3 are available only at query time. We compute term 2 // here. - FAISS_ASSERT(!quantizer_->getUseFloat16()); - auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); // Compute ||y_R||^2 by treating // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim) @@ -473,9 +476,10 @@ IVFPQ::precomputeCodes_() { // (centroid id)(sub q)(dim) // Transpose (centroid id)(sub q)(sub dim) to // (sub q)(centroid id)(sub dim) - auto centroidView = coarseCentroids.view<3>( + auto& coarseCentroids = quantizer_->template getVectorsRef(); + auto centroidView = coarseCentroids.template view<3>( {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_}); - DeviceTensor centroidsTransposed( + DeviceTensor centroidsTransposed( {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_}); runTransposeAny(centroidView, 0, 1, centroidsTransposed, @@ -521,6 +525,15 @@ IVFPQ::precomputeCodes_() { } } +void +IVFPQ::precomputeCodes_() { + if (quantizer_->getUseFloat16()) { + precomputeCodesT_(); + } else { + precomputeCodesT_(); + } +} + void IVFPQ::query(Tensor& queries, int nprobe, @@ -688,16 +701,16 @@ IVFPQ::runPQPrecomputedCodes_( resources_); } +template void -IVFPQ::runPQNoPrecomputedCodes_( +IVFPQ::runPQNoPrecomputedCodesT_( Tensor& queries, DeviceTensor& coarseDistances, DeviceTensor& coarseIndices, int k, Tensor& outDistances, Tensor& outIndices) { - FAISS_ASSERT(!quantizer_->getUseFloat16()); - auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); + auto& coarseCentroids = quantizer_->template getVectorsRef(); runPQScanMultiPassNoPrecomputed(queries, coarseCentroids, @@ -719,4 +732,29 @@ IVFPQ::runPQNoPrecomputedCodes_( resources_); } +void +IVFPQ::runPQNoPrecomputedCodes_( + Tensor& queries, + DeviceTensor& coarseDistances, + DeviceTensor& coarseIndices, + int k, + Tensor& outDistances, + Tensor& outIndices) { + if (quantizer_->getUseFloat16()) { + runPQNoPrecomputedCodesT_(queries, + coarseDistances, + coarseIndices, + k, + outDistances, + outIndices); + } else { + runPQNoPrecomputedCodesT_(queries, + coarseDistances, + coarseIndices, + k, + outDistances, + outIndices); + } +} + } } // namespace diff --git a/gpu/impl/IVFPQ.cuh b/gpu/impl/IVFPQ.cuh index d071705c8..ce6c7acb1 100644 --- a/gpu/impl/IVFPQ.cuh +++ b/gpu/impl/IVFPQ.cuh @@ -83,6 +83,11 @@ class IVFPQ : public IVFBase { /// Calculate precomputed residual distance information void precomputeCodes_(); + /// Calculate precomputed residual distance information (for different coarse + /// centroid type) + template + void precomputeCodesT_(); + /// Runs kernels for scanning inverted lists with precomputed codes void runPQPrecomputedCodes_(Tensor& queries, DeviceTensor& coarseDistances, @@ -99,6 +104,16 @@ class IVFPQ : public IVFBase { Tensor& outDistances, Tensor& outIndices); + /// Runs kernels for scanning inverted lists without precomputed codes (for + /// different coarse centroid type) + template + void runPQNoPrecomputedCodesT_(Tensor& queries, + DeviceTensor& coarseDistances, + DeviceTensor& coarseIndices, + int k, + Tensor& outDistances, + Tensor& outIndices); + private: /// Number of sub-quantizers per vector const int numSubQuantizers_; diff --git a/gpu/impl/PQCodeDistances-inl.cuh b/gpu/impl/PQCodeDistances-inl.cuh new file mode 100644 index 000000000..c3ef87f2e --- /dev/null +++ b/gpu/impl/PQCodeDistances-inl.cuh @@ -0,0 +1,561 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// Kernel responsible for calculating distance from residual vector to +// each product quantizer code centroid +template +__global__ void +__launch_bounds__(288, 4) +pqCodeDistances(Tensor queries, + int queriesPerBlock, + Tensor coarseCentroids, + Tensor pqCentroids, + Tensor topQueryToCentroid, + // (query id)(coarse)(subquantizer)(code) -> dist + Tensor outCodeDistances) { + const auto numSubQuantizers = pqCentroids.getSize(0); + const auto dimsPerSubQuantizer = pqCentroids.getSize(1); + assert(DimsPerSubQuantizer == dimsPerSubQuantizer); + const auto codesPerSubQuantizer = pqCentroids.getSize(2); + + bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer; + int loadingThreadId = threadIdx.x - codesPerSubQuantizer; + + extern __shared__ float smem[]; + + // Each thread calculates a single code + float subQuantizerData[DimsPerSubQuantizer]; + + auto code = threadIdx.x; + auto subQuantizer = blockIdx.y; + + // Each thread will load the pq centroid data for the code that it + // is processing +#pragma unroll + for (int i = 0; i < DimsPerSubQuantizer; ++i) { + subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg(); + } + + // Where we store our query vector + float* smemQuery = smem; + + // Where we store our residual vector; this is double buffered so we + // can be loading the next one while processing the current one + float* smemResidual1 = &smemQuery[DimsPerSubQuantizer]; + float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer]; + + // Where we pre-load the coarse centroid IDs + int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer]; + + // Each thread is calculating the distance for a single code, + // performing the reductions locally + + // Handle multiple queries per block + auto startQueryId = blockIdx.x * queriesPerBlock; + auto numQueries = queries.getSize(0) - startQueryId; + if (numQueries > queriesPerBlock) { + numQueries = queriesPerBlock; + } + + for (int query = 0; query < numQueries; ++query) { + auto queryId = startQueryId + query; + + auto querySubQuantizer = + queries[queryId][subQuantizer * DimsPerSubQuantizer].data(); + + // Load current query vector + for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) { + smemQuery[i] = querySubQuantizer[i]; + } + + // Load list of coarse centroids found + for (int i = threadIdx.x; + i < topQueryToCentroid.getSize(1); i += blockDim.x) { + coarseIds[i] = topQueryToCentroid[queryId][i]; + } + + // We need coarseIds below + // FIXME: investigate loading separately, so we don't need this + __syncthreads(); + + // Preload first buffer of residual data + if (isLoadingThread) { + for (int i = loadingThreadId; + i < DimsPerSubQuantizer; + i += blockDim.x - codesPerSubQuantizer) { + auto coarseId = coarseIds[0]; + // In case NaNs were in the original query data + coarseId = coarseId == -1 ? 0 : coarseId; + auto coarseCentroidSubQuantizer = + coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data(); + + if (L2Distance) { + smemResidual1[i] = smemQuery[i] - + ConvertTo::to(coarseCentroidSubQuantizer[i]); + } else { + smemResidual1[i] = + ConvertTo::to(coarseCentroidSubQuantizer[i]); + } + } + } + + // The block walks the list for a single query + for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) { + // Wait for smemResidual1 to be loaded + __syncthreads(); + + if (isLoadingThread) { + // Preload second buffer of residual data + for (int i = loadingThreadId; + i < DimsPerSubQuantizer; + i += blockDim.x - codesPerSubQuantizer) { + // FIXME: try always making this centroid id 0 so we can + // terminate + if (coarse != (topQueryToCentroid.getSize(1) - 1)) { + auto coarseId = coarseIds[coarse + 1]; + // In case NaNs were in the original query data + coarseId = coarseId == -1 ? 0 : coarseId; + + auto coarseCentroidSubQuantizer = + coarseCentroids[coarseId] + [subQuantizer * dimsPerSubQuantizer].data(); + + if (L2Distance) { + smemResidual2[i] = smemQuery[i] - + ConvertTo::to(coarseCentroidSubQuantizer[i]); + } else { + smemResidual2[i] = + ConvertTo::to(coarseCentroidSubQuantizer[i]); + } + } + } + } else { + // These are the processing threads + float dist = 0.0f; + + constexpr int kUnroll = 4; + constexpr int kRemainder = DimsPerSubQuantizer % kUnroll; + constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder; + float vals[kUnroll]; + + // Calculate residual - pqCentroid for each dim that we're + // processing + + // Unrolled loop + if (L2Distance) { +#pragma unroll + for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = smemResidual1[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] -= subQuantizerData[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] *= vals[j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + dist += vals[j]; + } + } + } else { + // Inner product: query slice against the reconstructed sub-quantizer + // for this coarse cell (query o (centroid + subQCentroid)) +#pragma unroll + for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = smemResidual1[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] += subQuantizerData[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] *= smemQuery[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + dist += vals[j]; + } + } + } + + // Remainder loop + if (L2Distance) { +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] = smemResidual1[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] -= subQuantizerData[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] *= vals[j]; + } + } else { + // Inner product + // Inner product: query slice against the reconstructed sub-quantizer + // for this coarse cell (query o (centroid + subQCentroid)) +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] = smemResidual1[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] += subQuantizerData[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] *= smemQuery[kRemainderBase + j]; + } + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + dist += vals[j]; + } + + // We have the distance for our code; write it out + outCodeDistances[queryId][coarse][subQuantizer][code] = + ConvertTo::to(dist); + } // !isLoadingThread + + // Swap residual buffers + float* tmp = smemResidual1; + smemResidual1 = smemResidual2; + smemResidual2 = tmp; + } + } +} + +template +__global__ void +residualVector(Tensor queries, + Tensor coarseCentroids, + Tensor topQueryToCentroid, + int numSubDim, + // output is transposed: + // (sub q)(query id)(centroid id)(sub dim) + Tensor residual) { + // block x is query id + // block y is centroid id + // thread x is dim + auto queryId = blockIdx.x; + auto centroidId = blockIdx.y; + + int realCentroidId = topQueryToCentroid[queryId][centroidId]; + + for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) { + float q = queries[queryId][dim]; + float c = ConvertTo::to(coarseCentroids[realCentroidId][dim]); + + residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] = q - c; + } +} + +template +void +runResidualVector(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + Tensor& residual, + cudaStream_t stream) { + auto grid = + dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1)); + auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice())); + + residualVector<<>>( + queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1), + residual); + + CUDA_TEST_ERROR(); +} + +template +void +runPQCodeDistancesMM(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + NoTypeTensor<4, true>& outCodeDistances, + bool useFloat16Lookup, + DeviceMemory& mem, + cublasHandle_t handle, + cudaStream_t stream) { + // Calculate (q - c) residual vector + // (sub q)(query id)(centroid id)(sub dim) + DeviceTensor residual( + mem, + {pqCentroids.getSize(0), + topQueryToCentroid.getSize(0), + topQueryToCentroid.getSize(1), + pqCentroids.getSize(1)}, + stream); + + runResidualVector(pqCentroids, queries, + coarseCentroids, topQueryToCentroid, + residual, stream); + + // Calculate ||q - c||^2 + DeviceTensor residualNorms( + mem, + {pqCentroids.getSize(0) * + topQueryToCentroid.getSize(0) * + topQueryToCentroid.getSize(1)}, + stream); + + auto residualView2 = residual.view<2>( + {pqCentroids.getSize(0) * + topQueryToCentroid.getSize(0) * + topQueryToCentroid.getSize(1), + pqCentroids.getSize(1)}); + + runL2Norm(residualView2, true, residualNorms, true, stream); + + // Perform a batch MM: + // (sub q) x {(q * c)(sub dim) x (sub dim)(code)} => + // (sub q) x {(q * c)(code)} + auto residualView3 = residual.view<3>( + {pqCentroids.getSize(0), + topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + pqCentroids.getSize(1)}); + + DeviceTensor residualDistance( + mem, + {pqCentroids.getSize(0), + topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + pqCentroids.getSize(2)}, + stream); + + runIteratedMatrixMult(residualDistance, false, + residualView3, false, + pqCentroids, false, + -2.0f, 0.0f, + handle, + stream); + + // Sum ||q - c||^2 along rows + auto residualDistanceView2 = residualDistance.view<2>( + {pqCentroids.getSize(0) * + topQueryToCentroid.getSize(0) * + topQueryToCentroid.getSize(1), + pqCentroids.getSize(2)}); + + runSumAlongRows(residualNorms, residualDistanceView2, false, stream); + + Tensor outCodeDistancesF; + DeviceTensor outCodeDistancesFloatMem; + + if (useFloat16Lookup) { + outCodeDistancesFloatMem = DeviceTensor( + mem, {outCodeDistances.getSize(0), + outCodeDistances.getSize(1), + outCodeDistances.getSize(2), + outCodeDistances.getSize(3)}, + stream); + + outCodeDistancesF = outCodeDistancesFloatMem; + } else { + outCodeDistancesF = outCodeDistances.toTensor(); + } + + // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which + // is where we build our output distances) + auto outCodeDistancesView = outCodeDistancesF.view<3>( + {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + outCodeDistances.getSize(2), + outCodeDistances.getSize(3)}); + + runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream); + + // Calculate code norms per each sub-dim + // (sub q)(sub dim)(code) is pqCentroids + // transpose to (sub q)(code)(sub dim) + DeviceTensor pqCentroidsTranspose( + mem, + {pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)}, + stream); + + runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream); + + auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>( + {pqCentroids.getSize(0) * pqCentroids.getSize(2), + pqCentroids.getSize(1)}); + + DeviceTensor pqCentroidsNorm( + mem, + {pqCentroids.getSize(0) * pqCentroids.getSize(2)}, + stream); + + runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream); + + // View output as (q * c)(sub q * code), and add centroid norm to + // each row + auto outDistancesCodeViewCols = outCodeDistancesView.view<2>( + {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + outCodeDistances.getSize(2) * outCodeDistances.getSize(3)}); + + runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream); + + if (useFloat16Lookup) { + // Need to convert back + auto outCodeDistancesH = outCodeDistances.toTensor(); + convertTensor(stream, + outCodeDistancesF, + outCodeDistancesH); + } +} + +template +void +runPQCodeDistances(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + NoTypeTensor<4, true>& outCodeDistances, + bool l2Distance, + bool useFloat16Lookup, + cudaStream_t stream) { + const auto numSubQuantizers = pqCentroids.getSize(0); + const auto dimsPerSubQuantizer = pqCentroids.getSize(1); + const auto codesPerSubQuantizer = pqCentroids.getSize(2); + + // FIXME: tune + // Reuse of pq centroid data is based on both # of queries * nprobe, + // and we should really be tiling in both dimensions + constexpr int kQueriesPerBlock = 8; + + auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock), + numSubQuantizers); + + // Reserve one block of threads for double buffering + // FIXME: probably impractical for large # of dims? + auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize); + auto block = dim3(codesPerSubQuantizer + loadingThreads); + + auto smem = (3 * dimsPerSubQuantizer) * sizeof(float) + + topQueryToCentroid.getSize(1) * sizeof(int); + +#define RUN_CODE(DIMS, L2) \ + do { \ + if (useFloat16Lookup) { \ + auto outCodeDistancesT = outCodeDistances.toTensor(); \ + \ + pqCodeDistances<<>>( \ + queries, kQueriesPerBlock, \ + coarseCentroids, pqCentroids, \ + topQueryToCentroid, outCodeDistancesT); \ + } else { \ + auto outCodeDistancesT = outCodeDistances.toTensor(); \ + \ + pqCodeDistances<<>>( \ + queries, kQueriesPerBlock, \ + coarseCentroids, pqCentroids, \ + topQueryToCentroid, outCodeDistancesT); \ + } \ + } while (0) + +#define CODE_L2(DIMS) \ + do { \ + if (l2Distance) { \ + RUN_CODE(DIMS, true); \ + } else { \ + RUN_CODE(DIMS, false); \ + } \ + } while (0) + + switch (dimsPerSubQuantizer) { + case 1: + CODE_L2(1); + break; + case 2: + CODE_L2(2); + break; + case 3: + CODE_L2(3); + break; + case 4: + CODE_L2(4); + break; + case 6: + CODE_L2(6); + break; + case 8: + CODE_L2(8); + break; + case 10: + CODE_L2(10); + break; + case 12: + CODE_L2(12); + break; + case 16: + CODE_L2(16); + break; + case 20: + CODE_L2(20); + break; + case 24: + CODE_L2(24); + break; + case 28: + CODE_L2(28); + break; + case 32: + CODE_L2(32); + break; + // FIXME: larger sizes require too many registers - we need the + // MM implementation working + default: + FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer " + "not currently supported"); + } + +#undef RUN_CODE +#undef CODE_L2 + + CUDA_TEST_ERROR(); +} + +} } // namespace diff --git a/gpu/impl/PQCodeDistances.cuh b/gpu/impl/PQCodeDistances.cuh index b68694715..0add947f2 100644 --- a/gpu/impl/PQCodeDistances.cuh +++ b/gpu/impl/PQCodeDistances.cuh @@ -20,18 +20,20 @@ class DeviceMemory; /// Calculates the distance from the (query - centroid) residual to /// each sub-code vector, for the given list of query results in /// topQueryToCentroid +template void runPQCodeDistances(Tensor& pqCentroids, Tensor& queries, - Tensor& coarseCentroids, + Tensor& coarseCentroids, Tensor& topQueryToCentroid, NoTypeTensor<4, true>& outCodeDistances, bool l2Distance, bool useFloat16Lookup, cudaStream_t stream); +template void runPQCodeDistancesMM(Tensor& pqCentroids, Tensor& queries, - Tensor& coarseCentroids, + Tensor& coarseCentroids, Tensor& topQueryToCentroid, NoTypeTensor<4, true>& outCodeDistances, bool useFloat16Lookup, @@ -40,3 +42,5 @@ void runPQCodeDistancesMM(Tensor& pqCentroids, cudaStream_t stream); } } // namespace + +#include diff --git a/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh b/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh new file mode 100644 index 000000000..95a4b586d --- /dev/null +++ b/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh @@ -0,0 +1,599 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace faiss { namespace gpu { + +// This must be kept in sync with PQCodeDistances.cu +inline bool isSupportedNoPrecomputedSubDimSize(int dims) { + switch (dims) { + case 1: + case 2: + case 3: + case 4: + case 6: + case 8: + case 10: + case 12: + case 16: + case 20: + case 24: + case 28: + case 32: + return true; + default: + // FIXME: larger sizes require too many registers - we need the + // MM implementation working + return false; + } +} + +template +struct LoadCodeDistances { + static inline __device__ void load(LookupT* smem, + LookupT* codes, + int numCodes) { + constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT); + + // We can only use the vector type if the data is guaranteed to be + // aligned. The codes are innermost, so if it is evenly divisible, + // then any slice will be aligned. + if (numCodes % kWordSize == 0) { + // Load the data by float4 for efficiency, and then handle any remainder + // limitVec is the number of whole vec words we can load, in terms + // of whole blocks performing the load + constexpr int kUnroll = 2; + int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); + limitVec *= kUnroll * blockDim.x; + + LookupVecT* smemV = (LookupVecT*) smem; + LookupVecT* codesV = (LookupVecT*) codes; + + for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { + LookupVecT vals[kUnroll]; + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = + LoadStore::load(&codesV[i + j * blockDim.x]); + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + LoadStore::store(&smemV[i + j * blockDim.x], vals[j]); + } + } + + // This is where we start loading the remainder that does not evenly + // fit into kUnroll x blockDim.x + int remainder = limitVec * kWordSize; + + for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { + smem[i] = codes[i]; + } + } else { + // Potential unaligned load + constexpr int kUnroll = 4; + + int limit = utils::roundDown(numCodes, kUnroll * blockDim.x); + + int i = threadIdx.x; + for (; i < limit; i += kUnroll * blockDim.x) { + LookupT vals[kUnroll]; + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = codes[i + j * blockDim.x]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + smem[i + j * blockDim.x] = vals[j]; + } + } + + for (; i < numCodes; i += blockDim.x) { + smem[i] = codes[i]; + } + } + } +}; + +template +__global__ void +pqScanNoPrecomputedMultiPass(Tensor queries, + Tensor pqCentroids, + Tensor topQueryToCentroid, + Tensor codeDistances, + void** listCodes, + int* listLengths, + Tensor prefixSumOffsets, + Tensor distance) { + const auto codesPerSubQuantizer = pqCentroids.getSize(2); + + // Where the pq code -> residual distance is stored + extern __shared__ char smemCodeDistances[]; + LookupT* codeDist = (LookupT*) smemCodeDistances; + + // Each block handles a single query + auto queryId = blockIdx.y; + auto probeId = blockIdx.x; + + // This is where we start writing out data + // We ensure that before the array (at offset -1), there is a 0 value + int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); + float* distanceOut = distance[outBase].data(); + + auto listId = topQueryToCentroid[queryId][probeId]; + // Safety guard in case NaNs in input cause no list ID to be generated + if (listId == -1) { + return; + } + + unsigned char* codeList = (unsigned char*) listCodes[listId]; + int limit = listLengths[listId]; + + constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 : + (NumSubQuantizers / 4); + unsigned int code32[kNumCode32]; + unsigned int nextCode32[kNumCode32]; + + // We double-buffer the code loading, which improves memory utilization + if (threadIdx.x < limit) { + LoadCode32::load(code32, codeList, threadIdx.x); + } + + LoadCodeDistances::load( + codeDist, + codeDistances[queryId][probeId].data(), + codeDistances.getSize(2) * codeDistances.getSize(3)); + + // Prevent WAR dependencies + __syncthreads(); + + // Each thread handles one code element in the list, with a + // block-wide stride + for (int codeIndex = threadIdx.x; + codeIndex < limit; + codeIndex += blockDim.x) { + // Prefetch next codes + if (codeIndex + blockDim.x < limit) { + LoadCode32::load( + nextCode32, codeList, codeIndex + blockDim.x); + } + + float dist = 0.0f; + +#pragma unroll + for (int word = 0; word < kNumCode32; ++word) { + constexpr int kBytesPerCode32 = + NumSubQuantizers < 4 ? NumSubQuantizers : 4; + + if (kBytesPerCode32 == 1) { + auto code = code32[0]; + dist = ConvertTo::to(codeDist[code]); + + } else { +#pragma unroll + for (int byte = 0; byte < kBytesPerCode32; ++byte) { + auto code = getByte(code32[word], byte * 8, 8); + + auto offset = + codesPerSubQuantizer * (word * kBytesPerCode32 + byte); + + dist += ConvertTo::to(codeDist[offset + code]); + } + } + } + + // Write out intermediate distance result + // We do not maintain indices here, in order to reduce global + // memory traffic. Those are recovered in the final selection step. + distanceOut[codeIndex] = dist; + + // Rotate buffers +#pragma unroll + for (int word = 0; word < kNumCode32; ++word) { + code32[word] = nextCode32[word]; + } + } +} + +template +void +runMultiPassTile(Tensor& queries, + Tensor& centroids, + Tensor& pqCentroidsInnermostCode, + NoTypeTensor<4, true>& codeDistances, + Tensor& topQueryToCentroid, + bool useFloat16Lookup, + int bytesPerCode, + int numSubQuantizers, + int numSubQuantizerCodes, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + Tensor& thrustMem, + Tensor& prefixSumOffsets, + Tensor& allDistances, + Tensor& heapDistances, + Tensor& heapIndices, + int k, + faiss::MetricType metric, + Tensor& outDistances, + Tensor& outIndices, + cudaStream_t stream) { + // We only support two metrics at the moment + FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT || + metric == MetricType::METRIC_L2); + + bool l2Distance = metric == MetricType::METRIC_L2; + + // Calculate offset lengths, so we know where to write out + // intermediate results + runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets, + thrustMem, stream); + + // Calculate residual code distances, since this is without + // precomputed codes + runPQCodeDistances(pqCentroidsInnermostCode, + queries, + centroids, + topQueryToCentroid, + codeDistances, + l2Distance, + useFloat16Lookup, + stream); + + // Convert all codes to a distance, and write out (distance, + // index) values for all intermediate results + { + auto kThreadsPerBlock = 256; + + auto grid = dim3(topQueryToCentroid.getSize(1), + topQueryToCentroid.getSize(0)); + auto block = dim3(kThreadsPerBlock); + + // pq centroid distances + auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float); + + smem *= numSubQuantizers * numSubQuantizerCodes; + FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice()); + +#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \ + do { \ + auto codeDistancesT = codeDistances.toTensor(); \ + \ + pqScanNoPrecomputedMultiPass \ + <<>>( \ + queries, \ + pqCentroidsInnermostCode, \ + topQueryToCentroid, \ + codeDistancesT, \ + listCodes.data().get(), \ + listLengths.data().get(), \ + prefixSumOffsets, \ + allDistances); \ + } while (0) + +#define RUN_PQ(NUM_SUB_Q) \ + do { \ + if (useFloat16Lookup) { \ + RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \ + } else { \ + RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ + } \ + } while (0) + + switch (bytesPerCode) { + case 1: + RUN_PQ(1); + break; + case 2: + RUN_PQ(2); + break; + case 3: + RUN_PQ(3); + break; + case 4: + RUN_PQ(4); + break; + case 8: + RUN_PQ(8); + break; + case 12: + RUN_PQ(12); + break; + case 16: + RUN_PQ(16); + break; + case 20: + RUN_PQ(20); + break; + case 24: + RUN_PQ(24); + break; + case 28: + RUN_PQ(28); + break; + case 32: + RUN_PQ(32); + break; + case 40: + RUN_PQ(40); + break; + case 48: + RUN_PQ(48); + break; + case 56: + RUN_PQ(56); + break; + case 64: + RUN_PQ(64); + break; + case 96: + RUN_PQ(96); + break; + default: + FAISS_ASSERT(false); + break; + } + +#undef RUN_PQ +#undef RUN_PQ_OPT + } + + CUDA_TEST_ERROR(); + + // k-select the output in chunks, to increase parallelism + runPass1SelectLists(prefixSumOffsets, + allDistances, + topQueryToCentroid.getSize(1), + k, + !l2Distance, // L2 distance chooses smallest + heapDistances, + heapIndices, + stream); + + // k-select final output + auto flatHeapDistances = heapDistances.downcastInner<2>(); + auto flatHeapIndices = heapIndices.downcastInner<2>(); + + runPass2SelectLists(flatHeapDistances, + flatHeapIndices, + listIndices, + indicesOptions, + prefixSumOffsets, + topQueryToCentroid, + k, + !l2Distance, // L2 distance chooses smallest + outDistances, + outIndices, + stream); +} + +template +void +runPQScanMultiPassNoPrecomputed(Tensor& queries, + Tensor& centroids, + Tensor& pqCentroidsInnermostCode, + Tensor& topQueryToCentroid, + bool useFloat16Lookup, + int bytesPerCode, + int numSubQuantizers, + int numSubQuantizerCodes, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + int maxListLength, + int k, + faiss::MetricType metric, + // output + Tensor& outDistances, + // output + Tensor& outIndices, + GpuResources* res) { + constexpr int kMinQueryTileSize = 8; + constexpr int kMaxQueryTileSize = 128; + constexpr int kThrustMemSize = 16384; + + int nprobe = topQueryToCentroid.getSize(1); + + auto& mem = res->getMemoryManagerCurrentDevice(); + auto stream = res->getDefaultStreamCurrentDevice(); + + // Make a reservation for Thrust to do its dirty work (global memory + // cross-block reduction space); hopefully this is large enough. + DeviceTensor thrustMem1( + mem, {kThrustMemSize}, stream); + DeviceTensor thrustMem2( + mem, {kThrustMemSize}, stream); + DeviceTensor* thrustMem[2] = + {&thrustMem1, &thrustMem2}; + + // How much temporary storage is available? + // If possible, we'd like to fit within the space available. + size_t sizeAvailable = mem.getSizeAvailable(); + + // We run two passes of heap selection + // This is the size of the first-level heap passes + constexpr int kNProbeSplit = 8; + int pass2Chunks = std::min(nprobe, kNProbeSplit); + + size_t sizeForFirstSelectPass = + pass2Chunks * k * (sizeof(float) + sizeof(int)); + + // How much temporary storage we need per each query + size_t sizePerQuery = + 2 * // streams + ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets + nprobe * maxListLength * sizeof(float) + // allDistances + // residual distances + nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) + + sizeForFirstSelectPass); + + int queryTileSize = (int) (sizeAvailable / sizePerQuery); + + if (queryTileSize < kMinQueryTileSize) { + queryTileSize = kMinQueryTileSize; + } else if (queryTileSize > kMaxQueryTileSize) { + queryTileSize = kMaxQueryTileSize; + } + + // FIXME: we should adjust queryTileSize to deal with this, since + // indexing is in int32 + FAISS_ASSERT(queryTileSize * nprobe * maxListLength < + std::numeric_limits::max()); + + // Temporary memory buffers + // Make sure there is space prior to the start which will be 0, and + // will handle the boundary condition without branches + DeviceTensor prefixSumOffsetSpace1( + mem, {queryTileSize * nprobe + 1}, stream); + DeviceTensor prefixSumOffsetSpace2( + mem, {queryTileSize * nprobe + 1}, stream); + + DeviceTensor prefixSumOffsets1( + prefixSumOffsetSpace1[1].data(), + {queryTileSize, nprobe}); + DeviceTensor prefixSumOffsets2( + prefixSumOffsetSpace2[1].data(), + {queryTileSize, nprobe}); + DeviceTensor* prefixSumOffsets[2] = + {&prefixSumOffsets1, &prefixSumOffsets2}; + + // Make sure the element before prefixSumOffsets is 0, since we + // depend upon simple, boundary-less indexing to get proper results + CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(), + 0, + sizeof(int), + stream)); + CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(), + 0, + sizeof(int), + stream)); + + int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float); + + int totalCodeDistancesSize = + queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes * + codeDistanceTypeSize; + + DeviceTensor codeDistances1Mem( + mem, {totalCodeDistancesSize}, stream); + NoTypeTensor<4, true> codeDistances1( + codeDistances1Mem.data(), + codeDistanceTypeSize, + {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); + + DeviceTensor codeDistances2Mem( + mem, {totalCodeDistancesSize}, stream); + NoTypeTensor<4, true> codeDistances2( + codeDistances2Mem.data(), + codeDistanceTypeSize, + {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); + + NoTypeTensor<4, true>* codeDistances[2] = + {&codeDistances1, &codeDistances2}; + + DeviceTensor allDistances1( + mem, {queryTileSize * nprobe * maxListLength}, stream); + DeviceTensor allDistances2( + mem, {queryTileSize * nprobe * maxListLength}, stream); + DeviceTensor* allDistances[2] = + {&allDistances1, &allDistances2}; + + DeviceTensor heapDistances1( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor heapDistances2( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor* heapDistances[2] = + {&heapDistances1, &heapDistances2}; + + DeviceTensor heapIndices1( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor heapIndices2( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor* heapIndices[2] = + {&heapIndices1, &heapIndices2}; + + auto streams = res->getAlternateStreamsCurrentDevice(); + streamWait(streams, {stream}); + + int curStream = 0; + + for (int query = 0; query < queries.getSize(0); query += queryTileSize) { + int numQueriesInTile = + std::min(queryTileSize, queries.getSize(0) - query); + + auto prefixSumOffsetsView = + prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile); + + auto codeDistancesView = + codeDistances[curStream]->narrowOutermost(0, numQueriesInTile); + auto coarseIndicesView = + topQueryToCentroid.narrowOutermost(query, numQueriesInTile); + auto queryView = + queries.narrowOutermost(query, numQueriesInTile); + + auto heapDistancesView = + heapDistances[curStream]->narrowOutermost(0, numQueriesInTile); + auto heapIndicesView = + heapIndices[curStream]->narrowOutermost(0, numQueriesInTile); + + auto outDistanceView = + outDistances.narrowOutermost(query, numQueriesInTile); + auto outIndicesView = + outIndices.narrowOutermost(query, numQueriesInTile); + + runMultiPassTile(queryView, + centroids, + pqCentroidsInnermostCode, + codeDistancesView, + coarseIndicesView, + useFloat16Lookup, + bytesPerCode, + numSubQuantizers, + numSubQuantizerCodes, + listCodes, + listIndices, + indicesOptions, + listLengths, + *thrustMem[curStream], + prefixSumOffsetsView, + *allDistances[curStream], + heapDistancesView, + heapIndicesView, + k, + metric, + outDistanceView, + outIndicesView, + streams[curStream]); + + curStream = (curStream + 1) % 2; + } + + streamWait({stream}, streams); +} + +} } // namespace diff --git a/gpu/impl/PQScanMultiPassNoPrecomputed.cuh b/gpu/impl/PQScanMultiPassNoPrecomputed.cuh index eae3869ce..5712eb675 100644 --- a/gpu/impl/PQScanMultiPassNoPrecomputed.cuh +++ b/gpu/impl/PQScanMultiPassNoPrecomputed.cuh @@ -21,8 +21,9 @@ class GpuResources; /// per subquantizer? bool isSupportedNoPrecomputedSubDimSize(int dims); +template void runPQScanMultiPassNoPrecomputed(Tensor& queries, - Tensor& centroids, + Tensor& centroids, Tensor& pqCentroidsInnermostCode, Tensor& topQueryToCentroid, bool useFloat16Lookup, @@ -43,3 +44,5 @@ void runPQScanMultiPassNoPrecomputed(Tensor& queries, GpuResources* res); } } // namespace + +#include diff --git a/gpu/perf/PerfFlat.cu b/gpu/perf/PerfFlat.cu index 3b0e36ba1..20a16382f 100644 --- a/gpu/perf/PerfFlat.cu +++ b/gpu/perf/PerfFlat.cu @@ -76,7 +76,6 @@ int main(int argc, char** argv) { GpuIndexFlatConfig config; config.device = dev; config.useFloat16 = FLAGS_use_float16; - config.useFloat16Accumulator = FLAGS_use_float16_math; config.storeTransposed = FLAGS_transposed; config.memorySpace = FLAGS_use_unified_mem ? MemorySpace::Unified : MemorySpace::Device; diff --git a/gpu/test/TestGpuIndexIVFPQ.cpp b/gpu/test/TestGpuIndexIVFPQ.cpp index f5834f9a6..1bee6b4bb 100644 --- a/gpu/test/TestGpuIndexIVFPQ.cpp +++ b/gpu/test/TestGpuIndexIVFPQ.cpp @@ -187,6 +187,41 @@ TEST(TestGpuIndexIVFPQ, Query_IP) { } } +TEST(TestGpuIndexIVFPQ, Float16Coarse) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 coarseQuantizer(opt.dim); + faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids, + opt.codes, opt.bitsPerCode); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.flatConfig.useFloat16 = true; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.setNumProbes(opt.nprobe); + + gpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); +} + TEST(TestGpuIndexIVFPQ, Add_L2) { for (int tries = 0; tries < 2; ++tries) { Options opt; diff --git a/gpu/test/test_gpu_index.py b/gpu/test/test_gpu_index.py index 02ede949b..8b17b4801 100644 --- a/gpu/test/test_gpu_index.py +++ b/gpu/test/test_gpu_index.py @@ -98,7 +98,9 @@ class EvalIVFPQAccuracy(unittest.TestCase): D, Inew = gpu_index.search(xq, 10) - self.assertGreaterEqual((Iref == Inew).sum(), Iref.size) + # 0.99: allow some tolerance in results otherwise test + # fails occasionally (not reproducible) + self.assertGreaterEqual((Iref == Inew).sum(), Iref.size * 0.99) def test_cpu_to_gpu_IVFPQ(self): self.do_cpu_to_gpu('IVF128,PQ4') @@ -267,6 +269,45 @@ class TestGPUKmeans(unittest.TestCase): assert np.allclose(obj1, obj2) +class TestAlternativeDistances(unittest.TestCase): + + def do_test(self, metric, metric_arg=0): + res = faiss.StandardGpuResources() + d = 32 + nb = 1000 + nq = 100 + + rs = np.random.RandomState(123) + xb = rs.rand(nb, d).astype('float32') + xq = rs.rand(nq, d).astype('float32') + + index_ref = faiss.IndexFlat(d, metric) + index_ref.metric_arg = metric_arg + index_ref.add(xb) + Dref, Iref = index_ref.search(xq, 10) + + # build from other index + index = faiss.GpuIndexFlat(res, index_ref) + Dnew, Inew = index.search(xq, 10) + np.testing.assert_array_equal(Inew, Iref) + np.testing.assert_allclose(Dnew, Dref, rtol=1e-6) + + # build from scratch + index = faiss.GpuIndexFlat(res, d, metric) + index.metric_arg = metric_arg + index.add(xb) + + Dnew, Inew = index.search(xq, 10) + np.testing.assert_array_equal(Inew, Iref) + + def test_L1(self): + self.do_test(faiss.METRIC_L1) + + def test_Linf(self): + self.do_test(faiss.METRIC_Linf) + + def test_Lp(self): + self.do_test(faiss.METRIC_Lp, 0.7) if __name__ == '__main__': diff --git a/gpu/utils/MatrixMult-inl.cuh b/gpu/utils/MatrixMult-inl.cuh new file mode 100644 index 000000000..ede225e03 --- /dev/null +++ b/gpu/utils/MatrixMult-inl.cuh @@ -0,0 +1,160 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +class DeviceMemory; + +template +struct GetCudaType; + +template <> +struct GetCudaType { + static constexpr cudaDataType_t Type = CUDA_R_32F; +}; + +template <> +struct GetCudaType { + static constexpr cudaDataType_t Type = CUDA_R_16F; +}; + +template +cublasStatus_t +rawGemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float fAlpha, + const AT *A, + int lda, + const BT *B, + int ldb, + const float fBeta, + float *C, + int ldc) { + auto cAT = GetCudaType::Type; + auto cBT = GetCudaType::Type; + + // Always accumulate in f32 + return cublasSgemmEx(handle, transa, transb, m, n, k, + &fAlpha, A, cAT, lda, + B, cBT, ldb, + &fBeta, + C, CUDA_R_32F, ldc); +} + +template +void +runMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + cublasHandle_t handle, + cudaStream_t stream) { + cublasSetStream(handle, stream); + + // Check that we have (m x k) * (k x n) = (m x n) + // using the input row-major layout + int aM = transA ? a.getSize(1) : a.getSize(0); + int aK = transA ? a.getSize(0) : a.getSize(1); + + int bK = transB ? b.getSize(1) : b.getSize(0); + int bN = transB ? b.getSize(0) : b.getSize(1); + + int cM = transC ? c.getSize(1) : c.getSize(0); + int cN = transC ? c.getSize(0) : c.getSize(1); + + FAISS_ASSERT(aM == cM); + FAISS_ASSERT(aK == bK); + FAISS_ASSERT(bN == cN); + + FAISS_ASSERT(a.getStride(1) == 1); + FAISS_ASSERT(b.getStride(1) == 1); + FAISS_ASSERT(c.getStride(1) == 1); + + // Now, we have to represent the matrix multiplication in + // column-major layout + float* pC = c.data(); + + int m = c.getSize(1); // stride 1 size + int n = c.getSize(0); // other size + int k = transA ? a.getSize(0) : a.getSize(1); + + int lda = transC ? a.getStride(0) : b.getStride(0); + int ldb = transC ? b.getStride(0) : a.getStride(0); + int ldc = c.getStride(0); + + auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + + if (transC) { + gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T; + gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T; + } + + cublasStatus_t err; + + if (transC) { + err = rawGemm(handle, + gemmTrA, gemmTrB, + m, n, k, alpha, + a.data(), lda, b.data(), ldb, beta, + pC, ldc); + } else { + err = rawGemm(handle, + gemmTrA, gemmTrB, + m, n, k, alpha, + b.data(), lda, a.data(), ldb, beta, + pC, ldc); + } + + FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS, + "cublas failed (%d): " + "(%d, %d)%s x (%d, %d)%s = (%d, %d)%s", + (int) err, + a.getSize(0), a.getSize(1), transA ? "'" : "", + b.getSize(0), b.getSize(1), transB ? "'" : "", + c.getSize(0), c.getSize(1), transC ? "'" : ""); + CUDA_TEST_ERROR(); +} + +template +void runIteratedMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + cublasHandle_t handle, + cudaStream_t stream) { + FAISS_ASSERT(c.getSize(0) == a.getSize(0)); + FAISS_ASSERT(a.getSize(0) == b.getSize(0)); + + for (int i = 0; i < a.getSize(0); ++i) { + auto cView = c[i].view(); + auto aView = a[i].view(); + auto bView = b[i].view(); + + runMatrixMult(cView, transC, + aView, transA, + bView, transB, + alpha, beta, handle, stream); + } +} + +} } // namespace diff --git a/gpu/utils/MatrixMult.cu b/gpu/utils/MatrixMult.cu index 31e6b2c2c..2afb5017b 100644 --- a/gpu/utils/MatrixMult.cu +++ b/gpu/utils/MatrixMult.cu @@ -8,176 +8,9 @@ #include #include -#include -#include -#include -#include namespace faiss { namespace gpu { -template -struct CublasGemm { -}; - -template <> -struct CublasGemm { - static cublasStatus_t gemm(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - float fAlpha, - const float *A, - int lda, - const float *B, - int ldb, - float fBeta, - float *C, - int ldc) { - return cublasSgemm(handle, transa, transb, m, n, k, - &fAlpha, A, lda, B, ldb, &fBeta, C, ldc); - } -}; - -template <> -struct CublasGemm { - static cublasStatus_t gemm(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float fAlpha, - const half *A, - int lda, - const half *B, - int ldb, - const float fBeta, - float *C, - int ldc) { - // Always accumulate in f32 - return cublasSgemmEx(handle, transa, transb, m, n, k, - &fAlpha, A, CUDA_R_16F, lda, - B, CUDA_R_16F, ldb, - &fBeta, - C, CUDA_R_32F, ldc); - } -}; - -template -void -runMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - cublasHandle_t handle, - cudaStream_t stream) { - cublasSetStream(handle, stream); - - // Check that we have (m x k) * (k x n) = (m x n) - // using the input row-major layout - int aM = transA ? a.getSize(1) : a.getSize(0); - int aK = transA ? a.getSize(0) : a.getSize(1); - - int bK = transB ? b.getSize(1) : b.getSize(0); - int bN = transB ? b.getSize(0) : b.getSize(1); - - int cM = transC ? c.getSize(1) : c.getSize(0); - int cN = transC ? c.getSize(0) : c.getSize(1); - - FAISS_ASSERT(aM == cM); - FAISS_ASSERT(aK == bK); - FAISS_ASSERT(bN == cN); - - FAISS_ASSERT(a.getStride(1) == 1); - FAISS_ASSERT(b.getStride(1) == 1); - FAISS_ASSERT(c.getStride(1) == 1); - - // Now, we have to represent the matrix multiplication in - // column-major layout - T* pA = transC ? a.data() : b.data(); - T* pB = transC ? b.data() : a.data(); - float* pC = c.data(); - - int m = c.getSize(1); // stride 1 size - int n = c.getSize(0); // other size - int k = transA ? a.getSize(0) : a.getSize(1); - - int lda = transC ? a.getStride(0) : b.getStride(0); - int ldb = transC ? b.getStride(0) : a.getStride(0); - int ldc = c.getStride(0); - - auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - - if (transC) { - gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T; - gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T; - } - - auto err = CublasGemm::gemm(handle, - gemmTrA, gemmTrB, - m, n, k, alpha, - pA, lda, pB, ldb, beta, - pC, ldc); - - FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS, - "cublas failed (%d): " - "(%d, %d)%s x (%d, %d)%s = (%d, %d)%s", - (int) err, - a.getSize(0), a.getSize(1), transA ? "'" : "", - b.getSize(0), b.getSize(1), transB ? "'" : "", - c.getSize(0), c.getSize(1), transC ? "'" : ""); - CUDA_TEST_ERROR(); -} - -void runMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - cublasHandle_t handle, - cudaStream_t stream) { - return runMatrixMult(c, transC, a, transA, b, transB, - alpha, beta, handle, stream); -} - -void runMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - cublasHandle_t handle, - cudaStream_t stream) { - return runMatrixMult(c, transC, a, transA, b, transB, - alpha, beta, handle, stream); -} - -void -runIteratedMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - cublasHandle_t handle, - cudaStream_t stream) { - FAISS_ASSERT(c.getSize(0) == a.getSize(0)); - FAISS_ASSERT(a.getSize(0) == b.getSize(0)); - - for (int i = 0; i < a.getSize(0); ++i) { - auto cView = c[i].view(); - auto aView = a[i].view(); - auto bView = b[i].view(); - - runMatrixMult(cView, transC, - aView, transA, - bView, transB, - alpha, beta, handle, stream); - } -} - void runBatchMatrixMult(Tensor& c, bool transC, Tensor& a, bool transA, diff --git a/gpu/utils/MatrixMult.cuh b/gpu/utils/MatrixMult.cuh index 548457f2c..eeb11ccc5 100644 --- a/gpu/utils/MatrixMult.cuh +++ b/gpu/utils/MatrixMult.cuh @@ -10,6 +10,9 @@ #include #include +#include +#include +#include namespace faiss { namespace gpu { @@ -17,30 +20,23 @@ class DeviceMemory; /// C = alpha * A * B + beta * C /// Expects row major layout, not fortran/blas column major! -void runMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - cublasHandle_t handle, - cudaStream_t stream); - -/// C = alpha * A * B + beta * C -/// Expects row major layout, not fortran/blas column major! -void runMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - cublasHandle_t handle, - cudaStream_t stream); +template +void +runMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + cublasHandle_t handle, + cudaStream_t stream); /// C_i = alpha * A_i * B_i + beta * C_i /// where `i` is the outermost dimension, via iterated gemm /// Expects row major layout, not fortran/blas column major! +template void runIteratedMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, + Tensor& a, bool transA, + Tensor& b, bool transB, float alpha, float beta, cublasHandle_t handle, @@ -59,3 +55,5 @@ void runBatchMatrixMult(Tensor& c, bool transC, cudaStream_t stream); } } // namespace + +#include diff --git a/impl/AuxIndexStructures.h b/impl/AuxIndexStructures.h index 840e191ce..c82b9ed56 100644 --- a/impl/AuxIndexStructures.h +++ b/impl/AuxIndexStructures.h @@ -51,9 +51,7 @@ struct RangeSearchResult { }; -/** - - Encapsulates a set of ids to remove. */ +/** Encapsulates a set of ids to remove. */ struct IDSelector { typedef Index::idx_t idx_t; virtual bool is_member (idx_t id) const = 0; diff --git a/impl/PolysemousTraining.h b/impl/PolysemousTraining.h index cf511a74c..c27a48c99 100644 --- a/impl/PolysemousTraining.h +++ b/impl/PolysemousTraining.h @@ -123,15 +123,15 @@ struct PolysemousTraining: SimulatedAnnealingParameters { enum Optimization_type_t { OT_None, OT_ReproduceDistances_affine, ///< default - OT_Ranking_weighted_diff /// same as _2, but use rank of y+ - rank of y- + OT_Ranking_weighted_diff ///< same as _2, but use rank of y+ - rank of y- }; Optimization_type_t optimization_type; - // use 1/4 of the training points for the optimization, with - // max. ntrain_permutation. If ntrain_permutation == 0: train on - // centroids + /** use 1/4 of the training points for the optimization, with + * max. ntrain_permutation. If ntrain_permutation == 0: train on + * centroids */ int ntrain_permutation; - double dis_weight_factor; // decay of exp that weights distance loss + double dis_weight_factor; ///< decay of exp that weights distance loss // filename pattern for the logging of iterations std::string log_pattern; diff --git a/impl/index_read.cpp b/impl/index_read.cpp index 8e582e2df..cab9dc642 100644 --- a/impl/index_read.cpp +++ b/impl/index_read.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -41,6 +42,7 @@ #include #include #include +#include @@ -752,6 +754,56 @@ static void read_binary_ivf_header ( read_direct_map (&ivf->direct_map, f); } +static void read_binary_hash_invlists ( + IndexBinaryHash::InvertedListMap &invlists, + int b, IOReader *f) +{ + size_t sz; + READ1 (sz); + int il_nbit = 0; + READ1 (il_nbit); + // buffer for bitstrings + std::vector buf((b + il_nbit) * sz); + READVECTOR (buf); + BitstringReader rd (buf.data(), buf.size()); + invlists.reserve (sz); + for (size_t i = 0; i < sz; i++) { + uint64_t hash = rd.read(b); + uint64_t ilsz = rd.read(il_nbit); + auto & il = invlists[hash]; + READVECTOR (il.ids); + FAISS_THROW_IF_NOT (il.ids.size() == ilsz); + READVECTOR (il.vecs); + } +} + +static void read_binary_multi_hash_map( + IndexBinaryMultiHash::Map &map, + int b, size_t ntotal, + IOReader *f) +{ + int id_bits; + size_t sz; + READ1 (id_bits); + READ1 (sz); + std::vector buf; + READVECTOR (buf); + size_t nbit = (b + id_bits) * sz + ntotal * id_bits; + FAISS_THROW_IF_NOT (buf.size() == (nbit + 7) / 8); + BitstringReader rd (buf.data(), buf.size()); + map.reserve (sz); + for (size_t i = 0; i < sz; i++) { + uint64_t hash = rd.read(b); + uint64_t ilsz = rd.read(id_bits); + auto & il = map[hash]; + for (size_t j = 0; j < ilsz; j++) { + il.push_back (rd.read (id_bits)); + } + } +} + + + IndexBinary *read_index_binary (IOReader *f, int io_flags) { IndexBinary * idx = nullptr; uint32_t h; @@ -793,6 +845,28 @@ IndexBinary *read_index_binary (IOReader *f, int io_flags) { static_cast(idxmap)->construct_rev_map (); } idx = idxmap; + } else if(h == fourcc("IBHh")) { + IndexBinaryHash *idxh = new IndexBinaryHash (); + read_index_binary_header (idxh, f); + READ1 (idxh->b); + READ1 (idxh->nflip); + read_binary_hash_invlists(idxh->invlists, idxh->b, f); + idx = idxh; + } else if(h == fourcc("IBHm")) { + IndexBinaryMultiHash* idxmh = new IndexBinaryMultiHash (); + read_index_binary_header (idxmh, f); + idxmh->storage = dynamic_cast (read_index_binary (f)); + FAISS_THROW_IF_NOT(idxmh->storage && idxmh->storage->ntotal == idxmh->ntotal); + idxmh->own_fields = true; + READ1 (idxmh->b); + READ1 (idxmh->nhash); + READ1 (idxmh->nflip); + idxmh->maps.resize (idxmh->nhash); + for (int i = 0; i < idxmh->nhash; i++) { + read_binary_multi_hash_map( + idxmh->maps[i], idxmh->b, idxmh->ntotal, f); + } + idx = idxmh; } else { FAISS_THROW_FMT("Index type 0x%08x not supported\n", h); idx = nullptr; diff --git a/impl/index_write.cpp b/impl/index_write.cpp index 4818f13c4..46cc2a68c 100644 --- a/impl/index_write.cpp +++ b/impl/index_write.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -41,6 +42,7 @@ #include #include #include +#include @@ -515,6 +517,67 @@ static void write_binary_ivf_header (const IndexBinaryIVF *ivf, IOWriter *f) { write_direct_map (&ivf->direct_map, f); } +static void write_binary_hash_invlists ( + const IndexBinaryHash::InvertedListMap &invlists, + int b, IOWriter *f) +{ + size_t sz = invlists.size(); + WRITE1 (sz); + size_t maxil = 0; + for (auto it = invlists.begin(); it != invlists.end(); ++it) { + if(it->second.ids.size() > maxil) { + maxil = it->second.ids.size(); + } + } + int il_nbit = 0; + while(maxil >= ((uint64_t)1 << il_nbit)) { + il_nbit++; + } + WRITE1(il_nbit); + + // first write sizes then data, may be useful if we want to + // memmap it at some point + + // buffer for bitstrings + std::vector buf (((b + il_nbit) * sz + 7) / 8); + BitstringWriter wr (buf.data(), buf.size()); + for (auto it = invlists.begin(); it != invlists.end(); ++it) { + wr.write (it->first, b); + wr.write (it->second.ids.size(), il_nbit); + } + WRITEVECTOR (buf); + + for (auto it = invlists.begin(); it != invlists.end(); ++it) { + WRITEVECTOR (it->second.ids); + WRITEVECTOR (it->second.vecs); + } +} + +static void write_binary_multi_hash_map( + const IndexBinaryMultiHash::Map &map, + int b, size_t ntotal, + IOWriter *f) +{ + int id_bits = 0; + while ((ntotal > ((Index::idx_t)1 << id_bits))) { + id_bits++; + } + WRITE1(id_bits); + size_t sz = map.size(); + WRITE1(sz); + size_t nbit = (b + id_bits) * sz + ntotal * id_bits; + std::vector buf((nbit + 7) / 8); + BitstringWriter wr (buf.data(), buf.size()); + for (auto it = map.begin(); it != map.end(); ++it) { + wr.write(it->first, b); + wr.write(it->second.size(), id_bits); + for (auto id : it->second) { + wr.write(id, id_bits); + } + } + WRITEVECTOR (buf); +} + void write_index_binary (const IndexBinary *idx, IOWriter *f) { if (const IndexBinaryFlat *idxf = dynamic_cast (idx)) { @@ -551,6 +614,27 @@ void write_index_binary (const IndexBinary *idx, IOWriter *f) { write_index_binary_header (idxmap, f); write_index_binary (idxmap->index, f); WRITEVECTOR (idxmap->id_map); + } else if (const IndexBinaryHash *idxh = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IBHh"); + WRITE1 (h); + write_index_binary_header (idxh, f); + WRITE1 (idxh->b); + WRITE1 (idxh->nflip); + write_binary_hash_invlists(idxh->invlists, idxh->b, f); + } else if (const IndexBinaryMultiHash *idxmh = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IBHm"); + WRITE1 (h); + write_index_binary_header (idxmh, f); + write_index_binary (idxmh->storage, f); + WRITE1 (idxmh->b); + WRITE1 (idxmh->nhash); + WRITE1 (idxmh->nflip); + for (int i = 0; i < idxmh->nhash; i++) { + write_binary_multi_hash_map( + idxmh->maps[i], idxmh->b, idxmh->ntotal, f); + } } else { FAISS_THROW_MSG ("don't know how to serialize this type of index"); } diff --git a/impl/io.cpp b/impl/io.cpp index e8ffca6bc..0954f3c1f 100644 --- a/impl/io.cpp +++ b/impl/io.cpp @@ -37,7 +37,6 @@ int IOWriter::fileno () ***********************************************************************/ - size_t VectorIOWriter::operator()( const void *ptr, size_t size, size_t nitems) { @@ -132,6 +131,117 @@ int FileIOWriter::fileno() { return ::fileno (f); } +/*********************************************************************** + * IO buffer + ***********************************************************************/ + +BufferedIOReader::BufferedIOReader(IOReader *reader, size_t bsz, size_t totsz): + reader(reader), bsz(bsz), totsz(totsz), ofs(0), b0(0), b1(0), buffer(bsz) +{ +} + + +size_t BufferedIOReader::operator()(void *ptr, size_t unitsize, size_t nitems) +{ + size_t size = unitsize * nitems; + if (size == 0) return 0; + char * dst = (char*)ptr; + size_t nb; + + { // first copy available bytes + nb = std::min(b1 - b0, size); + memcpy (dst, buffer.data() + b0, nb); + b0 += nb; + dst += nb; + size -= nb; + } + + if (size > totsz - ofs) { + size = totsz - ofs; + } + // while we would like to have more data + while (size > 0) { + assert (b0 == b1); // buffer empty on input + // try to read from main reader + b0 = 0; + b1 = (*reader)(buffer.data(), 1, std::min(bsz, size)); + + if (b1 == 0) { + // no more bytes available + break; + } + ofs += b1; + + // copy remaining bytes + size_t nb2 = std::min(b1, size); + memcpy (dst, buffer.data(), nb2); + b0 = nb2; + nb += nb2; + dst += nb2; + size -= nb2; + } + return nb / unitsize; +} + + +BufferedIOWriter::BufferedIOWriter(IOWriter *writer, size_t bsz): + writer(writer), bsz(bsz), b0(0), buffer(bsz) +{ +} + +size_t BufferedIOWriter::operator()(const void *ptr, size_t unitsize, size_t nitems) +{ + size_t size = unitsize * nitems; + if (size == 0) return 0; + const char * src = (const char*)ptr; + size_t nb; + + { // copy as many bytes as possible to buffer + nb = std::min(bsz - b0, size); + memcpy (buffer.data() + b0, src, nb); + b0 += nb; + src += nb; + size -= nb; + } + while (size > 0) { + assert(b0 == bsz); + // now we need to flush to add more bytes + size_t ofs = 0; + do { + assert (ofs < 10000000); + size_t written = (*writer)(buffer.data() + ofs, 1, bsz - ofs); + FAISS_THROW_IF_NOT(written > 0); + ofs += written; + } while(ofs != bsz); + + // copy src to buffer + size_t nb1 = std::min(bsz, size); + memcpy (buffer.data(), src, nb1); + b0 = nb1; + nb += nb1; + src += nb1; + size -= nb1; + } + + return nb / unitsize; +} + +BufferedIOWriter::~BufferedIOWriter() +{ + size_t ofs = 0; + while(ofs != b0) { + printf("Destructor write %ld \n", b0 - ofs); + size_t written = (*writer)(buffer.data() + ofs, 1, b0 - ofs); + FAISS_THROW_IF_NOT(written > 0); + ofs += written; + } + +} + + + + + uint32_t fourcc (const char sx[4]) { assert(4 == strlen(sx)); const unsigned char *x = (unsigned char*)sx; diff --git a/impl/io.h b/impl/io.h index 173d87da6..a3a565af2 100644 --- a/impl/io.h +++ b/impl/io.h @@ -9,6 +9,9 @@ /*********************************************************** * Abstract I/O objects + * + * I/O is always sequential, seek does not need to be supported + * (indexes could be read or written to a pipe). ***********************************************************/ #pragma once @@ -92,6 +95,41 @@ struct FileIOWriter: IOWriter { int fileno() override; }; +/******************************************************* + * Buffered reader + writer + *******************************************************/ + + + +/** wraps an ioreader to make buffered reads to avoid too small reads */ +struct BufferedIOReader: IOReader { + + IOReader *reader; + size_t bsz, totsz, ofs; + size_t b0, b1; ///< range of available bytes in the buffer + std::vector buffer; + + BufferedIOReader(IOReader *reader, size_t bsz, + size_t totsz=(size_t)(-1)); + + size_t operator()(void *ptr, size_t size, size_t nitems) override; +}; + +struct BufferedIOWriter: IOWriter { + + IOWriter *writer; + size_t bsz, ofs; + size_t b0; ///< amount of data in buffer + std::vector buffer; + + BufferedIOWriter(IOWriter *writer, size_t bsz); + + size_t operator()(const void *ptr, size_t size, size_t nitems) override; + + // flushes + ~BufferedIOWriter(); +}; + /// cast a 4-character string to a uint32_t that can be written and read easily uint32_t fourcc (const char sx[4]); diff --git a/python/faiss.py b/python/faiss.py index 163e2cb96..2d58b7f70 100644 --- a/python/faiss.py +++ b/python/faiss.py @@ -283,6 +283,18 @@ def handle_IndexBinary(the_class): swig_ptr(labels)) return distances, labels + def replacement_range_search(self, x, thresh): + n, d = x.shape + assert d * 8 == self.d + res = RangeSearchResult(n) + self.range_search_c(n, swig_ptr(x), thresh, res) + # get pointers and copy them + lims = rev_swig_ptr(res.lims, n + 1).copy() + nd = int(lims[-1]) + D = rev_swig_ptr(res.distances, nd).copy() + I = rev_swig_ptr(res.labels, nd).copy() + return lims, D, I + def replacement_remove_ids(self, x): if isinstance(x, IDSelector): sel = x @@ -295,6 +307,7 @@ def handle_IndexBinary(the_class): replace_method(the_class, 'add_with_ids', replacement_add_with_ids) replace_method(the_class, 'train', replacement_train) replace_method(the_class, 'search', replacement_search) + replace_method(the_class, 'range_search', replacement_range_search) replace_method(the_class, 'reconstruct', replacement_reconstruct) replace_method(the_class, 'remove_ids', replacement_remove_ids) @@ -461,6 +474,9 @@ add_ref_in_constructor(IndexBinaryIDMap2, 0) add_ref_in_method(IndexReplicas, 'addIndex', 0) add_ref_in_method(IndexBinaryReplicas, 'addIndex', 0) +add_ref_in_constructor(BufferedIOWriter, 0) +add_ref_in_constructor(BufferedIOReader, 0) + # seems really marginal... # remove_ref_from_method(IndexReplicas, 'removeIndex', 0) @@ -751,9 +767,24 @@ def deserialize_index(data): copy_array_to_vector(data, reader.data) return read_index(reader) +def serialize_index_binary(index): + """ convert an index to a numpy uint8 array """ + writer = VectorIOWriter() + write_index_binary(index, writer) + return vector_to_array(writer.data) + +def deserialize_index_binary(data): + reader = VectorIOReader() + copy_array_to_vector(data, reader.data) + return read_index_binary(reader) + + +########################################### +# ResultHeap +########################################### class ResultHeap: - """Combine query results from a sliced dataset. The final result will + """Accumulate query results from a sliced dataset. The final result will be in self.D, self.I.""" def __init__(self, nq, k): diff --git a/python/setup.py b/python/setup.py index a5ebdbc84..89b6d398c 100644 --- a/python/setup.py +++ b/python/setup.py @@ -32,7 +32,7 @@ are implemented on the GPU. It is developed by Facebook AI Research. """ setup( name='faiss', - version='1.6.2', + version='1.6.3', description='A library for efficient similarity search and clustering of dense vectors', long_description=long_description, url='https://github.com/facebookresearch/faiss', diff --git a/python/swigfaiss.swig b/python/swigfaiss.swig index 9d09b3720..b0d8b8173 100644 --- a/python/swigfaiss.swig +++ b/python/swigfaiss.swig @@ -93,6 +93,7 @@ extern "C" { #include #include #include +#include #include #include @@ -359,6 +360,7 @@ void gpu_sync_all_devices() %include %include %include +%include @@ -979,6 +981,124 @@ struct MapLong2Long { %} +/******************************************************************* + * Support I/O to arbitrary functions + *******************************************************************/ + + +%inline %{ + +#ifdef SWIGPYTHON + + +struct PyCallbackIOWriter: faiss::IOWriter { + + PyObject * callback; + size_t bs; // maximum write size + + PyCallbackIOWriter(PyObject *callback, + size_t bs = 1024 * 1024): + callback(callback), bs(bs) { + Py_INCREF(callback); + name = "PyCallbackIOWriter"; + } + + size_t operator()(const void *ptrv, size_t size, size_t nitems) override { + size_t ws = size * nitems; + const char *ptr = (const char*)ptrv; + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + while(ws > 0) { + size_t wi = ws > bs ? bs : ws; + PyObject* bo = PyBytes_FromStringAndSize(ptr, wi); + PyObject *arglist = Py_BuildValue("(N)", bo); + if(!arglist) { + PyGILState_Release(gstate); + return 0; + } + ptr += wi; + ws -= wi; + PyObject * result = PyObject_CallObject(callback, arglist); + Py_DECREF(arglist); + if (result == NULL) { + PyGILState_Release(gstate); + return 0; + } + Py_DECREF(result); + } + PyGILState_Release(gstate); + return nitems; + } + + ~PyCallbackIOWriter() { + Py_DECREF(callback); + } + +}; + +struct PyCallbackIOReader: faiss::IOReader { + + PyObject * callback; + size_t bs; // maximum buffer size + + PyCallbackIOReader(PyObject *callback, + size_t bs = 1024 * 1024): + callback(callback), bs(bs) { + Py_INCREF(callback); + name = "PyCallbackIOReader"; + } + + size_t operator()(void *ptrv, size_t size, size_t nitems) override { + size_t rs = size * nitems; + char *ptr = (char*)ptrv; + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + while(rs > 0) { + size_t ri = rs > bs ? bs : rs; + PyObject *arglist = Py_BuildValue("(n)", ri); + PyObject * result = PyObject_CallObject(callback, arglist); + Py_DECREF(arglist); + if (result == NULL) { + PyGILState_Release(gstate); + return 0; + } + if(!PyBytes_Check(result)) { + Py_DECREF(result); + PyErr_SetString(PyExc_RuntimeError, + "read callback did not return a bytes object"); + PyGILState_Release(gstate); + throw faiss::FaissException("reader error"); + } + size_t sz = PyBytes_Size(result); + if (sz == 0 || sz > rs) { + Py_DECREF(result); + PyErr_Format(PyExc_RuntimeError, + "read callback returned %ld bytes (asked %ld)", + sz, rs); + PyGILState_Release(gstate); + throw faiss::FaissException("reader error"); + } + memcpy(ptr, PyBytes_AsString(result), sz); + Py_DECREF(result); + ptr += sz; + rs -= sz; + } + PyGILState_Release(gstate); + return nitems; + } + + ~PyCallbackIOReader() { + Py_DECREF(callback); + } + +}; + +#endif + +%} + + + %inline %{ void wait() { // in gdb, use return to get out of this function diff --git a/tests/common.py b/tests/common.py index 290e317c3..8621dd822 100644 --- a/tests/common.py +++ b/tests/common.py @@ -97,3 +97,32 @@ def get_dataset_2(d, nt, nb, nq): x = np.sin(x) x = x.astype('float32') return x[:nt], x[nt:nt + nb], x[nt + nb:] + + +def make_binary_dataset(d, nt, nb, nq): + assert d % 8 == 0 + rs = np.random.RandomState(123) + x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8') + return x[:nt], x[nt:-nq], x[-nq:] + + +def compare_binary_result_lists(D1, I1, D2, I2): + """comparing result lists is difficult because there are many + ties. Here we sort by (distance, index) pairs and ignore the largest + distance of each result. Compatible result lists should pass this.""" + assert D1.shape == I1.shape == D2.shape == I2.shape + n, k = D1.shape + ndiff = (D1 != D2).sum() + assert ndiff == 0, '%d differences in distance matrix %s' % ( + ndiff, D1.shape) + + def normalize_DI(D, I): + norm = I.max() + 1.0 + Dr = D.astype('float64') + I / norm + # ignore -1s and elements on last column + Dr[I1 == -1] = 1e20 + Dr[D == D[:, -1:]] = 1e20 + Dr.sort(axis=1) + return Dr + ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum() + assert ndiff == 0, '%d differences in normalized D matrix' % ndiff diff --git a/tests/test_binary_hashindex.py b/tests/test_binary_hashindex.py new file mode 100644 index 000000000..1ee5a5f7d --- /dev/null +++ b/tests/test_binary_hashindex.py @@ -0,0 +1,183 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#!/usr/bin/env python3 + +import unittest +import numpy as np +import faiss + +from common import make_binary_dataset + + +def bitvec_shuffle(a, order): + n, d = a.shape + db, = order.shape + b = np.empty((n, db // 8), dtype='uint8') + faiss.bitvec_shuffle( + n, d * 8, db, + faiss.swig_ptr(order), + faiss.swig_ptr(a), faiss.swig_ptr(b)) + return b + + +class TestSmallFuncs(unittest.TestCase): + + def test_shuffle(self): + d = 256 + n = 1000 + rs = np.random.RandomState(123) + o = rs.permutation(d).astype('int32') + + x = rs.randint(256, size=(n, d // 8)).astype('uint8') + + y1 = bitvec_shuffle(x, o[:128]) + y2 = bitvec_shuffle(x, o[128:]) + y = np.hstack((y1, y2)) + + oinv = np.empty(d, dtype='int32') + oinv[o] = np.arange(d) + z = bitvec_shuffle(y, oinv) + + np.testing.assert_array_equal(x, z) + + +class TestRange(unittest.TestCase): + + def test_hash(self): + d = 128 + nq = 100 + nb = 2000 + + (_, xb, xq) = make_binary_dataset(d, 0, nb, nq) + + index_ref = faiss.IndexBinaryFlat(d) + index_ref.add(xb) + + radius = 55 + + Lref, Dref, Iref = index_ref.range_search(xq, radius) + + print("nb res: ", Lref[-1]) + + index = faiss.IndexBinaryHash(d, 10) + index.add(xb) + # index.display() + nfound = [] + ndis = [] + stats = faiss.cvar.indexBinaryHash_stats + for n_bitflips in range(index.b + 1): + index.nflip = n_bitflips + stats.reset() + Lnew, Dnew, Inew = index.range_search(xq, radius) + for i in range(nq): + ref = Iref[Lref[i]:Lref[i + 1]] + new = Inew[Lnew[i]:Lnew[i + 1]] + snew = set(new) + # no duplicates + self.assertTrue(len(new) == len(snew)) + # subset of real results + self.assertTrue(snew <= set(ref)) + nfound.append(Lnew[-1]) + ndis.append(stats.ndis) + print('nfound=', nfound) + print('ndis=', ndis) + nfound = np.array(nfound) + self.assertTrue(nfound[-1] == Lref[-1]) + self.assertTrue(np.all(nfound[1:] >= nfound[:-1])) + + def test_multihash(self): + d = 128 + nq = 100 + nb = 2000 + + (_, xb, xq) = make_binary_dataset(d, 0, nb, nq) + + index_ref = faiss.IndexBinaryFlat(d) + index_ref.add(xb) + + radius = 55 + + Lref, Dref, Iref = index_ref.range_search(xq, radius) + + print("nb res: ", Lref[-1]) + + nfound = [] + ndis = [] + + for nh in 1, 3, 5: + index = faiss.IndexBinaryMultiHash(d, nh, 10) + index.add(xb) + # index.display() + stats = faiss.cvar.indexBinaryHash_stats + index.nflip = 2 + stats.reset() + Lnew, Dnew, Inew = index.range_search(xq, radius) + for i in range(nq): + ref = Iref[Lref[i]:Lref[i + 1]] + new = Inew[Lnew[i]:Lnew[i + 1]] + snew = set(new) + # no duplicates + self.assertTrue(len(new) == len(snew)) + # subset of real results + self.assertTrue(snew <= set(ref)) + nfound.append(Lnew[-1]) + ndis.append(stats.ndis) + print('nfound=', nfound) + print('ndis=', ndis) + nfound = np.array(nfound) + # self.assertTrue(nfound[-1] == Lref[-1]) + self.assertTrue(np.all(nfound[1:] >= nfound[:-1])) + + +class TestKnn(unittest.TestCase): + + def test_hash_and_multihash(self): + d = 128 + nq = 100 + nb = 2000 + + (_, xb, xq) = make_binary_dataset(d, 0, nb, nq) + + index_ref = faiss.IndexBinaryFlat(d) + index_ref.add(xb) + k = 10 + Dref, Iref = index_ref.search(xq, k) + + nfound = {} + for nh in 0, 1, 3, 5: + + for nbit in 4, 7: + if nh == 0: + index = faiss.IndexBinaryHash(d, nbit) + else: + index = faiss.IndexBinaryMultiHash(d, nh, nbit) + index.add(xb) + index.nflip = 2 + Dnew, Inew = index.search(xq, k) + nf = 0 + for i in range(nq): + ref = Iref[i] + new = Inew[i] + snew = set(new) + # no duplicates + self.assertTrue(len(new) == len(snew)) + nf += len(set(ref) & snew) + print('nfound', nh, nbit, nf) + nfound[(nh, nbit)] = nf + self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)]) + + # test serialization + index2 = faiss.deserialize_index_binary( + faiss.serialize_index_binary(index)) + + D2, I2 = index2.search(xq, k) + np.testing.assert_array_equal(Inew, I2) + np.testing.assert_array_equal(Dnew, D2) + + print('nfound=', nfound) + self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)])) + self.assertGreater(nfound[(3, 7)], nfound[(1, 7)]) + self.assertGreater(nfound[(5, 7)], nfound[(3, 7)]) diff --git a/tests/test_index.py b/tests/test_index.py index 54494dcbb..c41f7f8c0 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -13,7 +13,7 @@ import faiss import tempfile import os import re - +import warnings from common import get_dataset, get_dataset_2 @@ -24,7 +24,6 @@ class TestModuleInterface(unittest.TestCase): assert re.match('^\\d+\\.\\d+\\.\\d+$', faiss.__version__) - class EvalIVFPQAccuracy(unittest.TestCase): def test_IndexIVFPQ(self): @@ -506,37 +505,6 @@ class TestHNSW(unittest.TestCase): assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0]) -class TestIOError(unittest.TestCase): - - def test_io_error(self): - d, n = 32, 1000 - x = np.random.uniform(size=(n, d)).astype('float32') - index = faiss.IndexFlatL2(d) - index.add(x) - _, fname = tempfile.mkstemp() - try: - faiss.write_index(index, fname) - - # should be fine - faiss.read_index(fname) - - # now damage file - data = open(fname, 'rb').read() - data = data[:int(len(data) / 2)] - open(fname, 'wb').write(data) - - # should make a nice readable exception that mentions the - try: - faiss.read_index(fname) - except RuntimeError as e: - if fname not in str(e): - raise - else: - raise - - finally: - if os.path.exists(fname): - os.unlink(fname) class TestDistancesPositive(unittest.TestCase): diff --git a/tests/test_index_binary.py b/tests/test_index_binary.py index 064b1cfbe..c61e2fa5d 100644 --- a/tests/test_index_binary.py +++ b/tests/test_index_binary.py @@ -10,12 +10,8 @@ import numpy as np import unittest import faiss +from common import compare_binary_result_lists, make_binary_dataset -def make_binary_dataset(d, nt, nb, nq): - assert d % 8 == 0 - rs = np.random.RandomState(123) - x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8') - return x[:nt], x[nt:-nq], x[-nq:] def binary_to_float(x): @@ -124,6 +120,29 @@ class TestBinaryFlat(unittest.TestCase): assert(np.all(Iflat == -1)) assert(np.all(Dflat == 2147483647)) # NOTE(hoss): int32_t max + def test_range_search(self): + d = self.xq.shape[1] * 8 + + index = faiss.IndexBinaryFlat(d) + index.add(self.xb) + D, I = index.search(self.xq, 10) + thresh = int(np.median(D[:, -1])) + + lims, D2, I2 = index.range_search(self.xq, thresh) + nt1 = nt2 = 0 + for i in range(len(self.xq)): + range_res = I2[lims[i]:lims[i + 1]] + if thresh > D[i, -1]: + self.assertTrue(set(I[i]) <= set(range_res)) + nt1 += 1 + elif thresh < D[i, -1]: + self.assertTrue(set(range_res) <= set(I[i])) + nt2 += 1 + # in case of equality we have a problem with ties + print('nb tests', nt1, nt2) + # nb tests is actually low... + self.assertTrue(nt1 > 19 and nt2 > 19) + class TestBinaryIVF(unittest.TestCase): @@ -166,6 +185,29 @@ class TestBinaryIVF(unittest.TestCase): self.assertEqual((self.Dref == Divfflat).sum(), 4122) + def test_ivf_range(self): + d = self.xq.shape[1] * 8 + + quantizer = faiss.IndexBinaryFlat(d) + index = faiss.IndexBinaryIVF(quantizer, d, 8) + index.cp.min_points_per_centroid = 5 # quiet warning + index.nprobe = 4 + index.train(self.xt) + index.add(self.xb) + D, I = index.search(self.xq, 10) + + radius = int(np.median(D[:, -1]) + 1) + Lr, Dr, Ir = index.range_search(self.xq, radius) + + for i in range(len(self.xq)): + res = Ir[Lr[i]:Lr[i + 1]] + if D[i, -1] < radius: + self.assertTrue(set(I[i]) <= set(res)) + else: + subset = I[i, D[i, :] < radius] + self.assertTrue(set(subset) == set(res)) + + def test_ivf_flat_empty(self): d = self.xq.shape[1] * 8 @@ -257,27 +299,6 @@ class TestHNSW(unittest.TestCase): self.assertTrue((Dref == Dbin).all()) -def compare_binary_result_lists(D1, I1, D2, I2): - """comparing result lists is difficult because there are many - ties. Here we sort by (distance, index) pairs and ignore the largest - distance of each result. Compatible result lists should pass this.""" - assert D1.shape == I1.shape == D2.shape == I2.shape - n, k = D1.shape - ndiff = (D1 != D2).sum() - assert ndiff == 0, '%d differences in distance matrix %s' % ( - ndiff, D1.shape) - - def normalize_DI(D, I): - norm = I.max() + 1.0 - Dr = D.astype('float64') + I / norm - # ignore -1s and elements on last column - Dr[I1 == -1] = 1e20 - Dr[D == D[:, -1:]] = 1e20 - Dr.sort(axis=1) - return Dr - ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum() - assert ndiff == 0, '%d differences in normalized D matrix' % ndiff - class TestReplicasAndShards(unittest.TestCase): diff --git a/tests/test_io.py b/tests/test_io.py new file mode 100644 index 000000000..7e3d6edf5 --- /dev/null +++ b/tests/test_io.py @@ -0,0 +1,220 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#!/usr/bin/env python3 + +import numpy as np +import unittest +import faiss +import tempfile +import os +import io +import sys +import warnings +from multiprocessing.dummy import Pool as ThreadPool + +from common import get_dataset, get_dataset_2 + + +class TestIOVariants(unittest.TestCase): + + def test_io_error(self): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + _, fname = tempfile.mkstemp() + try: + faiss.write_index(index, fname) + + # should be fine + faiss.read_index(fname) + + # now damage file + data = open(fname, 'rb').read() + data = data[:int(len(data) / 2)] + open(fname, 'wb').write(data) + + # should make a nice readable exception that mentions the filename + try: + faiss.read_index(fname) + except RuntimeError as e: + if fname not in str(e): + raise + else: + raise + + finally: + if os.path.exists(fname): + os.unlink(fname) + + +class TestCallbacks(unittest.TestCase): + + def do_write_callback(self, bsz): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + + f = io.BytesIO() + # test with small block size + writer = faiss.PyCallbackIOWriter(f.write, 1234) + + if bsz > 0: + writer = faiss.BufferedIOWriter(writer, bsz) + + faiss.write_index(index, writer) + del writer # make sure all writes committed + + if sys.version_info[0] < 3: + buf = f.getvalue() + else: + buf = f.getbuffer() + + index2 = faiss.deserialize_index(np.frombuffer(buf, dtype='uint8')) + + self.assertEqual(index.d, index2.d) + self.assertTrue(np.all( + faiss.vector_to_array(index.xb) == faiss.vector_to_array(index2.xb) + )) + + # This is not a callable function: shoudl raise an exception + writer = faiss.PyCallbackIOWriter("blabla") + self.assertRaises( + Exception, + faiss.write_index, index, writer + ) + + def test_buf_read(self): + x = np.random.uniform(size=20) + + _, fname = tempfile.mkstemp() + try: + x.tofile(fname) + + f = open(fname, 'rb') + reader = faiss.PyCallbackIOReader(f.read, 1234) + + bsz = 123 + reader = faiss.BufferedIOReader(reader, bsz) + + y = np.zeros_like(x) + print('nbytes=', y.nbytes) + reader(faiss.swig_ptr(y), y.nbytes, 1) + + np.testing.assert_array_equal(x, y) + finally: + if os.path.exists(fname): + os.unlink(fname) + + def do_read_callback(self, bsz): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + + _, fname = tempfile.mkstemp() + try: + faiss.write_index(index, fname) + + f = open(fname, 'rb') + + reader = faiss.PyCallbackIOReader(f.read, 1234) + + if bsz > 0: + reader = faiss.BufferedIOReader(reader, bsz) + + index2 = faiss.read_index(reader) + + self.assertEqual(index.d, index2.d) + np.testing.assert_array_equal( + faiss.vector_to_array(index.xb), + faiss.vector_to_array(index2.xb) + ) + + # This is not a callable function: should raise an exception + reader = faiss.PyCallbackIOReader("blabla") + self.assertRaises( + Exception, + faiss.read_index, reader + ) + finally: + if os.path.exists(fname): + os.unlink(fname) + + def test_write_callback(self): + self.do_write_callback(0) + + def test_write_buffer(self): + self.do_write_callback(123) + self.do_write_callback(2345) + + def test_read_callback(self): + self.do_read_callback(0) + + def test_read_callback_buffered(self): + self.do_read_callback(123) + self.do_read_callback(12345) + + def test_read_buffer(self): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + + _, fname = tempfile.mkstemp() + try: + faiss.write_index(index, fname) + + reader = faiss.BufferedIOReader( + faiss.FileIOReader(fname), 1234) + + index2 = faiss.read_index(reader) + + self.assertEqual(index.d, index2.d) + np.testing.assert_array_equal( + faiss.vector_to_array(index.xb), + faiss.vector_to_array(index2.xb) + ) + + finally: + if os.path.exists(fname): + os.unlink(fname) + + + def test_transfer_pipe(self): + """ transfer an index through a Unix pipe """ + + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + Dref, Iref = index.search(x, 10) + + rf, wf = os.pipe() + + # start thread that will decompress the index + + def index_from_pipe(): + reader = faiss.PyCallbackIOReader(lambda size: os.read(rf, size)) + return faiss.read_index(reader) + + fut = ThreadPool(1).apply_async(index_from_pipe, ()) + + # write to pipe + writer = faiss.PyCallbackIOWriter(lambda b: os.write(wf, b)) + faiss.write_index(index, writer) + + index2 = fut.get() + + # closing is not really useful but it does not hurt + os.close(wf) + os.close(rf) + + Dnew, Inew = index2.search(x, 10) + + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_array_equal(Dref, Dnew) diff --git a/utils/hamming.cpp b/utils/hamming.cpp index 67a441e9e..f80c994a0 100644 --- a/utils/hamming.cpp +++ b/utils/hamming.cpp @@ -34,6 +34,7 @@ #include #include #include +#include static const size_t BLOCKSIZE_QUERY = 8192; @@ -484,6 +485,30 @@ void bitvec_print (const uint8_t * b, size_t d) } +void bitvec_shuffle (size_t n, size_t da, size_t db, + const int *order, + const uint8_t *a, + uint8_t *b) +{ + for(size_t i = 0; i < db; i++) { + FAISS_THROW_IF_NOT (order[i] >= 0 && order[i] < da); + } + size_t lda = (da + 7) / 8; + size_t ldb = (db + 7) / 8; + +#pragma omp parallel for if(n > 10000) + for (size_t i = 0; i < n; i++) { + const uint8_t *ai = a + i * lda; + uint8_t *bi = b + i * ldb; + memset (bi, 0, ldb); + for(size_t i = 0; i < db; i++) { + int o = order[i]; + uint8_t the_bit = (ai[o >> 3] >> (o & 7)) & 1; + bi[i >> 3] |= the_bit << (i & 7); + } + } + +} @@ -527,6 +552,7 @@ void hammings_knn( { hammings_knn_hc(ha, a, b, nb, ncodes, order); } + void hammings_knn_hc ( int_maxheap_array_t * ha, const uint8_t * a, @@ -610,7 +636,66 @@ void hammings_knn_mc( } } } +template +static +void hamming_range_search_template ( + const uint8_t * a, + const uint8_t * b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult *res) +{ +#pragma omp parallel + { + RangeSearchPartialResult pres (res); + +#pragma omp for + for (size_t i = 0; i < na; i++) { + HammingComputer hc (a + i * code_size, code_size); + const uint8_t * yi = b; + RangeQueryResult & qres = pres.new_result (i); + + for (size_t j = 0; j < nb; j++) { + int dis = hc.hamming (yi); + if (dis < radius) { + qres.add(dis, j); + } + yi += code_size; + } + } + pres.finalize (); + } +} + +void hamming_range_search ( + const uint8_t * a, + const uint8_t * b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult *result) +{ + +#define HC(name) hamming_range_search_template (a, b, na, nb, radius, code_size, result) + + switch(code_size) { + case 4: HC(HammingComputer4); break; + case 8: HC(HammingComputer8); break; + case 16: HC(HammingComputer16); break; + case 32: HC(HammingComputer32); break; + default: + if (code_size % 8 == 0) { + HC(HammingComputerM8); + } else { + HC(HammingComputerDefault); + } + } +#undef HC +} diff --git a/utils/hamming.h b/utils/hamming.h index 1ddbd5c01..2d04613ef 100644 --- a/utils/hamming.h +++ b/utils/hamming.h @@ -39,6 +39,7 @@ namespace faiss { * General bit vector functions **************************************************/ +struct RangeSearchResult; void bitvec_print (const uint8_t * b, size_t d); @@ -65,6 +66,14 @@ void bitvecs2fvecs ( void fvec2bitvec (const float * x, uint8_t * b, size_t d); +/** Shuffle the bits from b(i, j) := a(i, order[j]) + */ +void bitvec_shuffle (size_t n, size_t da, size_t db, + const int *order, + const uint8_t *a, + uint8_t *b); + + /*********************************************** * Generic reader/writer for bit strings ***********************************************/ @@ -171,6 +180,17 @@ void hammings_knn_mc ( int32_t *distances, int64_t *labels); +/** same as hammings_knn except we are doing a range search with radius */ +void hamming_range_search ( + const uint8_t * a, + const uint8_t * b, + size_t na, + size_t nb, + int radius, + size_t ncodes, + RangeSearchResult *result); + + /* Counting the number of matches or of cross-matches (without returning them) For use with function that assume pre-allocated memory */ void hamming_count_thres (