Sync 20200323. (#1157)

* Sync 20200323.

* Bump version.

* Remove warning filter.
Lucas Hosseini 2020-03-24 14:06:48 +01:00 committed by GitHub
parent fc2a1c1775
commit a17a631dc3
45 changed files with 3528 additions and 361 deletions


@@ -34,7 +34,7 @@ struct ClusteringParameters {
int seed; ///< seed for the random number generator
size_t decode_block_size; /// < how many vectors at a time to decode
size_t decode_block_size; ///< how many vectors at a time to decode
/// sets reasonable defaults
ClusteringParameters ();
@@ -42,11 +42,11 @@ struct ClusteringParameters {
struct ClusteringIterationStats {
float obj; /// objective values (sum of distances reported by index)
double time; /// seconds for iteration
double time_search; /// seconds for just search
double imbalance_factor; /// imbalance factor of iteration
int nsplit; /// number of cluster splits
float obj; ///< objective values (sum of distances reported by index)
double time; ///< seconds for iteration
double time_search; ///< seconds for just search
double imbalance_factor; ///< imbalance factor of iteration
int nsplit; ///< number of cluster splits
};
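The per-iteration stats can be read back after training. A hedged sketch (d, k, n, x and index are placeholder inputs; the demo further down in this commit reads clus.iteration_stats.back().obj the same way):

faiss::Clustering clus (d, k);
clus.train (n, x, index);
for (const faiss::ClusteringIterationStats & st : clus.iteration_stats) {
    printf ("obj=%g time=%gs imbalance=%g nsplit=%d\n",
            st.obj, st.time, st.imbalance_factor, st.nsplit);
}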

Index.h (10 changed lines)

@@ -18,7 +18,7 @@
#define FAISS_VERSION_MAJOR 1
#define FAISS_VERSION_MINOR 6
#define FAISS_VERSION_PATCH 2
#define FAISS_VERSION_PATCH 3
/**
* @namespace faiss
@@ -44,12 +44,10 @@ struct IDSelector;
struct RangeSearchResult;
struct DistanceComputer;
/** Abstract structure for an index
/** Abstract structure for an index, supports adding vectors and searching them.
*
* Supports adding vertices and searching them.
*
* Currently only asymmetric queries are supported:
* database-to-database queries are not implemented.
* All vectors provided at add or search time are 32-bit float arrays,
* although the internal representation may vary.
*/
struct Index {
using idx_t = int64_t; ///< all indices are this type


@@ -99,9 +99,13 @@ struct IndexBinary {
/** Query n vectors of dimension d to the index.
*
* return all vectors with distance < radius. Note that many
* indexes do not implement the range_search (only the k-NN search
* is mandatory).
* return all vectors with distance < radius. Note that many indexes
* do not implement range_search (only the k-NN search is
* mandatory). The distances are converted to float to reuse the
* RangeSearchResult structure, but they are integers. By convention,
* only distances < radius (strict comparison) are returned, i.e.
* radius = 0 does not return any result and radius = 1 returns only
* identical vectors.
*
* @param x input vectors to search, size n * d / 8
* @param radius search radius
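A hedged usage sketch (nq and xq are placeholder query data): matches come back in a RangeSearchResult whose lims array delimits each query's results:

faiss::RangeSearchResult rres (nq);
index.range_search (nq, xq, /*radius=*/3, &rres);  // Hamming distances 0..2
for (size_t q = 0; q < nq; q++) {
    for (size_t j = rres.lims[q]; j < rres.lims[q + 1]; j++) {
        // an integer Hamming distance, stored as float
        printf ("query %zu -> id %ld, distance %g\n",
                q, (long) rres.labels[j], rres.distances[j]);
    }
}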


@@ -79,5 +79,10 @@ void IndexBinaryFlat::reconstruct(idx_t key, uint8_t *recons) const {
memcpy(recons, &(xb[code_size * key]), sizeof(*recons) * code_size);
}
void IndexBinaryFlat::range_search(idx_t n, const uint8_t *x, int radius,
RangeSearchResult *result) const
{
hamming_range_search (x, xb.data(), n, ntotal, radius, code_size, result);
}
} // namespace faiss


@@ -38,6 +38,9 @@ struct IndexBinaryFlat : IndexBinary {
void search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const override;
void range_search(idx_t n, const uint8_t *x, int radius,
RangeSearchResult *result) const override;
void reconstruct(idx_t key, uint8_t *recons) const override;
/** Remove some ids. Note that because of the indexing structure,

IndexBinaryHash.cpp (new file, 492 lines, mode 100644)

@@ -0,0 +1,492 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved
// -*- c++ -*-
#include <faiss/IndexBinaryHash.h>
#include <cstdio>
#include <memory>
#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
void IndexBinaryHash::InvertedList::add (
idx_t id, size_t code_size, const uint8_t *code)
{
ids.push_back(id);
vecs.insert(vecs.end(), code, code + code_size);
}
IndexBinaryHash::IndexBinaryHash(int d, int b):
IndexBinary(d), b(b), nflip(0)
{
is_trained = true;
}
IndexBinaryHash::IndexBinaryHash(): b(0), nflip(0)
{
is_trained = true;
}
void IndexBinaryHash::reset()
{
invlists.clear();
ntotal = 0;
}
void IndexBinaryHash::add(idx_t n, const uint8_t *x)
{
add_with_ids(n, x, nullptr);
}
void IndexBinaryHash::add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids)
{
uint64_t mask = ((uint64_t)1 << b) - 1;
// simplistic add function. Cannot really be parallelized.
for (idx_t i = 0; i < n; i++) {
idx_t id = xids ? xids[i] : ntotal + i;
const uint8_t * xi = x + i * code_size;
idx_t hash = *((uint64_t*)xi) & mask;
invlists[hash].add(id, code_size, xi);
}
ntotal += n;
}
namespace {
/** Enumerate all bit vectors of size nbit with up to maxflip 1s
* test in P127257851 P127258235
*/
struct FlipEnumerator {
int nbit, nflip, maxflip;
uint64_t mask, x;
FlipEnumerator (int nbit, int maxflip): nbit(nbit), maxflip(maxflip) {
nflip = 0;
mask = 0;
x = 0;
}
bool next() {
if (x == mask) {
if (nflip == maxflip) {
return false;
}
// increase Hamming radius
nflip++;
mask = (((uint64_t)1 << nflip) - 1);
x = mask << (nbit - nflip);
return true;
}
int i = __builtin_ctzll(x);
if (i > 0) {
x ^= (uint64_t)3 << (i - 1);
} else {
// nb of LSB 1s
int n1 = __builtin_ctzll(~x);
// clear them
x &= ((uint64_t)(-1) << n1);
int n2 = __builtin_ctzll(x);
x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1);
}
return true;
}
};
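// Example (hedged illustration): FlipEnumerator(4, 2) visits x in the order
//   0000, 1000, 0100, 0010, 0001, 1100, 1010, 1001, 0110, 0101, 0011
// (binary, MSB first): the all-zero mask, then the C(4,1) one-bit masks,
// then the C(4,2) two-bit masks. Callers therefore probe hash buckets in
// order of increasing Hamming distance from the query hash.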
using idx_t = Index::idx_t;
struct RangeSearchResults {
int radius;
RangeQueryResult &qres;
inline void add (float dis, idx_t id) {
if (dis < radius) {
qres.add (dis, id);
}
}
};
struct KnnSearchResults {
// heap params
idx_t k;
int32_t * heap_sim;
idx_t * heap_ids;
using C = CMax<int, idx_t>;
inline void add (float dis, idx_t id) {
if (dis < heap_sim[0]) {
heap_pop<C> (k, heap_sim, heap_ids);
heap_push<C> (k, heap_sim, heap_ids, dis, id);
}
}
};
template<class HammingComputer, class SearchResults>
void
search_single_query_template(const IndexBinaryHash & index, const uint8_t *q,
SearchResults &res,
size_t &n0, size_t &nlist, size_t &ndis)
{
size_t code_size = index.code_size;
uint64_t mask = ((uint64_t)1 << index.b) - 1;
uint64_t qhash = *((uint64_t*)q) & mask;
HammingComputer hc (q, code_size);
FlipEnumerator fe(index.b, index.nflip);
// loop over hash buckets at most nflip bit flips away from the query hash
do {
uint64_t hash = qhash ^ fe.x;
auto it = index.invlists.find (hash);
if (it == index.invlists.end()) {
continue;
}
const IndexBinaryHash::InvertedList &il = it->second;
size_t nv = il.ids.size();
if (nv == 0) {
n0++;
} else {
const uint8_t *codes = il.vecs.data();
for (size_t i = 0; i < nv; i++) {
int dis = hc.hamming (codes);
res.add(dis, il.ids[i]);
codes += code_size;
}
ndis += nv;
nlist++;
}
} while(fe.next());
}
template<class SearchResults>
void
search_single_query(const IndexBinaryHash & index, const uint8_t *q,
SearchResults &res,
size_t &n0, size_t &nlist, size_t &ndis)
{
#define HC(name) search_single_query_template<name>(index, q, res, n0, nlist, ndis);
switch(index.code_size) {
case 4: HC(HammingComputer4); break;
case 8: HC(HammingComputer8); break;
case 16: HC(HammingComputer16); break;
case 20: HC(HammingComputer20); break;
case 32: HC(HammingComputer32); break;
default:
if (index.code_size % 8 == 0) {
HC(HammingComputerM8);
} else {
HC(HammingComputerDefault);
}
}
#undef HC
}
} // anonymous namespace
void IndexBinaryHash::range_search(idx_t n, const uint8_t *x, int radius,
RangeSearchResult *result) const
{
size_t nlist = 0, ndis = 0, n0 = 0;
#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist)
{
RangeSearchPartialResult pres (result);
#pragma omp for
for (size_t i = 0; i < n; i++) { // loop queries
RangeQueryResult & qres = pres.new_result (i);
RangeSearchResults res = {radius, qres};
const uint8_t *q = x + i * code_size;
search_single_query (*this, q, res, n0, nlist, ndis);
}
pres.finalize ();
}
indexBinaryHash_stats.nq += n;
indexBinaryHash_stats.n0 += n0;
indexBinaryHash_stats.nlist += nlist;
indexBinaryHash_stats.ndis += ndis;
}
void IndexBinaryHash::search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const
{
using HeapForL2 = CMax<int32_t, idx_t>;
size_t nlist = 0, ndis = 0, n0 = 0;
#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0)
for (size_t i = 0; i < n; i++) {
int32_t * simi = distances + k * i;
idx_t * idxi = labels + k * i;
heap_heapify<HeapForL2> (k, simi, idxi);
KnnSearchResults res = {k, simi, idxi};
const uint8_t *q = x + i * code_size;
search_single_query (*this, q, res, n0, nlist, ndis);
}
indexBinaryHash_stats.nq += n;
indexBinaryHash_stats.n0 += n0;
indexBinaryHash_stats.nlist += nlist;
indexBinaryHash_stats.ndis += ndis;
}
size_t IndexBinaryHash::hashtable_size() const
{
return invlists.size();
}
void IndexBinaryHash::display() const
{
for (auto it = invlists.begin(); it != invlists.end(); ++it) {
printf("%ld: [", it->first);
const std::vector<idx_t> & v = it->second.ids;
for (auto x: v) {
printf("%ld ", 0 + x);
}
printf("]\n");
}
}
void IndexBinaryHashStats::reset()
{
memset ((void*)this, 0, sizeof (*this));
}
IndexBinaryHashStats indexBinaryHash_stats;
/*******************************************************
* IndexBinaryMultiHash implementation
******************************************************/
IndexBinaryMultiHash::IndexBinaryMultiHash(int d, int nhash, int b):
IndexBinary(d),
storage(new IndexBinaryFlat(d)), own_fields(true),
maps(nhash), nhash(nhash), b(b), nflip(0)
{
FAISS_THROW_IF_NOT(nhash * b <= d);
}
IndexBinaryMultiHash::IndexBinaryMultiHash():
storage(nullptr), own_fields(true),
nhash(0), b(0), nflip(0)
{}
IndexBinaryMultiHash::~IndexBinaryMultiHash()
{
if (own_fields) {
delete storage;
}
}
void IndexBinaryMultiHash::reset()
{
storage->reset();
ntotal = 0;
for(auto map: maps) {
map.clear();
}
}
void IndexBinaryMultiHash::add(idx_t n, const uint8_t *x)
{
storage->add(n, x);
// populate maps
uint64_t mask = ((uint64_t)1 << b) - 1;
for(idx_t i = 0; i < n; i++) {
const uint8_t *xi = x + i * code_size;
int ho = 0;
for(int h = 0; h < nhash; h++) {
uint64_t hash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7);
hash &= mask;
maps[h][hash].push_back(i + ntotal);
ho += b;
}
}
ntotal += n;
}
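// Illustration of the slicing above (hedged note): hash h covers bits
// [h*b, (h+1)*b) of the code, so with ho = h * b,
//   hash = (*(uint64_t *)(xi + ho / 8) >> (ho % 8)) & (((uint64_t)1 << b) - 1);
// the 8-byte load stays in bounds only while ho % 8 + b <= 64, i.e.
// b <= 57 in the worst case.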
namespace {
template <class HammingComputer, class SearchResults>
static
void verify_shortlist(
const IndexBinaryFlat & index,
const uint8_t * q,
const std::unordered_set<Index::idx_t> & shortlist,
SearchResults &res)
{
size_t code_size = index.code_size;
size_t nlist = 0, ndis = 0, n0 = 0;
HammingComputer hc (q, code_size);
const uint8_t *codes = index.xb.data();
for (auto i: shortlist) {
int dis = hc.hamming (codes + i * code_size);
res.add(dis, i);
}
}
template<class SearchResults>
void
search_1_query_multihash(const IndexBinaryMultiHash & index, const uint8_t *xi,
SearchResults &res,
size_t &n0, size_t &nlist, size_t &ndis)
{
std::unordered_set<idx_t> shortlist;
int b = index.b;
uint64_t mask = ((uint64_t)1 << b) - 1;
int ho = 0;
for(int h = 0; h < index.nhash; h++) {
uint64_t qhash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7);
qhash &= mask;
const IndexBinaryMultiHash::Map & map = index.maps[h];
FlipEnumerator fe(index.b, index.nflip);
// loop over hash buckets at most nflip bit flips away from the query hash
do {
uint64_t hash = qhash ^ fe.x;
auto it = map.find (hash);
if (it != map.end()) {
const std::vector<idx_t> & v = it->second;
for (auto i: v) {
shortlist.insert(i);
}
nlist++;
} else {
n0++;
}
} while(fe.next());
ho += b;
}
ndis += shortlist.size();
// verify shortlist
#define HC(name) verify_shortlist<name> (*index.storage, xi, shortlist, res)
switch(index.code_size) {
case 4: HC(HammingComputer4); break;
case 8: HC(HammingComputer8); break;
case 16: HC(HammingComputer16); break;
case 20: HC(HammingComputer20); break;
case 32: HC(HammingComputer32); break;
default:
if (index.code_size % 8 == 0) {
HC(HammingComputerM8);
} else {
HC(HammingComputerDefault);
}
}
#undef HC
}
} // anonymous namespace
void IndexBinaryMultiHash::range_search(idx_t n, const uint8_t *x, int radius,
RangeSearchResult *result) const
{
size_t nlist = 0, ndis = 0, n0 = 0;
#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist)
{
RangeSearchPartialResult pres (result);
#pragma omp for
for (size_t i = 0; i < n; i++) { // loop queries
RangeQueryResult & qres = pres.new_result (i);
RangeSearchResults res = {radius, qres};
const uint8_t *q = x + i * code_size;
search_1_query_multihash (*this, q, res, n0, nlist, ndis);
}
pres.finalize ();
}
indexBinaryHash_stats.nq += n;
indexBinaryHash_stats.n0 += n0;
indexBinaryHash_stats.nlist += nlist;
indexBinaryHash_stats.ndis += ndis;
}
void IndexBinaryMultiHash::search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const
{
using HeapForL2 = CMax<int32_t, idx_t>;
size_t nlist = 0, ndis = 0, n0 = 0;
#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0)
for (size_t i = 0; i < n; i++) {
int32_t * simi = distances + k * i;
idx_t * idxi = labels + k * i;
heap_heapify<HeapForL2> (k, simi, idxi);
KnnSearchResults res = {k, simi, idxi};
const uint8_t *q = x + i * code_size;
search_1_query_multihash (*this, q, res, n0, nlist, ndis);
}
indexBinaryHash_stats.nq += n;
indexBinaryHash_stats.n0 += n0;
indexBinaryHash_stats.nlist += nlist;
indexBinaryHash_stats.ndis += ndis;
}
size_t IndexBinaryMultiHash::hashtable_size() const
{
size_t tot = 0;
for (auto map: maps) {
tot += map.size();
}
return tot;
}
}

IndexBinaryHash.h (new file, 116 lines, mode 100644)

@@ -0,0 +1,116 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_BINARY_HASH_H
#define FAISS_BINARY_HASH_H
#include <vector>
#include <unordered_map>
#include <faiss/IndexBinary.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/utils/Heap.h>
namespace faiss {
struct RangeSearchResult;
/** uses the first b bits of the binary vectors as a hash value */
struct IndexBinaryHash : IndexBinary {
struct InvertedList {
std::vector<idx_t> ids;
std::vector<uint8_t> vecs;
void add (idx_t id, size_t code_size, const uint8_t *code);
};
using InvertedListMap = std::unordered_map<idx_t, InvertedList>;
InvertedListMap invlists;
int b, nflip;
IndexBinaryHash(int d, int b);
IndexBinaryHash();
void reset() override;
void add(idx_t n, const uint8_t *x) override;
void add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) override;
void range_search(idx_t n, const uint8_t *x, int radius,
RangeSearchResult *result) const override;
void search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const override;
void display() const;
size_t hashtable_size() const;
};
struct IndexBinaryHashStats {
size_t nq; // nb of queries run
size_t n0; // nb of empty lists
size_t nlist; // nb of non-empty inverted lists scanned
size_t ndis; // nb of distances computed
IndexBinaryHashStats () {reset (); }
void reset ();
};
extern IndexBinaryHashStats indexBinaryHash_stats;
/** uses nhash hash maps, the h-th one hashing the h-th disjoint b-bit slice of the binary vectors */
struct IndexBinaryMultiHash: IndexBinary {
// where the vectors are actually stored
IndexBinaryFlat *storage;
bool own_fields;
// maps hash values to the ids that hash to them
using Map = std::unordered_map<idx_t, std::vector<idx_t> >;
// the different hashes, size nhash
std::vector<Map> maps;
int nhash; ///< nb of hash maps
int b; ///< nb bits per hash map
int nflip; ///< nb bit flips to use at search time
IndexBinaryMultiHash(int d, int nhash, int b);
IndexBinaryMultiHash();
~IndexBinaryMultiHash();
void reset() override;
void add(idx_t n, const uint8_t *x) override;
void range_search(idx_t n, const uint8_t *x, int radius,
RangeSearchResult *result) const override;
void search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const override;
size_t hashtable_size() const;
};
}
#endif
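A hedged usage sketch of the new index (d is in bits, codes are packed d / 8 bytes per vector; nb, nq, xb and xq are placeholders):

#include <vector>
#include <faiss/IndexBinaryHash.h>

void demo (size_t nb, size_t nq, const uint8_t *xb, const uint8_t *xq) {
    int d = 64;                            // dimension in bits, code_size = 8
    faiss::IndexBinaryHash index (d, 16);  // hash = first 16 bits of each code
    index.nflip = 2;                       // probe buckets up to 2 bit flips away
    index.add (nb, xb);
    std::vector<int32_t> distances (nq * 10);
    std::vector<faiss::Index::idx_t> labels (nq * 10);
    index.search (nq, xq, 10, distances.data (), labels.data ());
}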


@@ -11,11 +11,13 @@
#include <faiss/IndexBinaryIVF.h>
#include <cstdio>
#include <omp.h>
#include <memory>
#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/IndexFlat.h>
@@ -281,13 +283,15 @@ namespace {
using idx_t = Index::idx_t;
template<class HammingComputer, bool store_pairs>
template<class HammingComputer>
struct IVFBinaryScannerL2: BinaryInvertedListScanner {
HammingComputer hc;
size_t code_size;
bool store_pairs;
IVFBinaryScannerL2 (size_t code_size): code_size (code_size)
IVFBinaryScannerL2 (size_t code_size, bool store_pairs):
code_size (code_size), store_pairs(store_pairs)
{}
void set_query (const uint8_t *query_vector) override {
@@ -316,7 +320,7 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
uint32_t dis = hc.hamming (codes);
if (dis < simi[0]) {
heap_pop<C> (k, simi, idxi);
idx_t id = store_pairs ? (list_no << 32 | j) : ids[j];
idx_t id = store_pairs ? lo_build(list_no, j) : ids[j];
heap_push<C> (k, simi, idxi, dis, id);
nup++;
}
@@ -325,6 +329,24 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
return nup;
}
void scan_codes_range (size_t n,
const uint8_t *codes,
const idx_t *ids,
int radius,
RangeQueryResult &result) const
{
size_t nup = 0;
for (size_t j = 0; j < n; j++) {
uint32_t dis = hc.hamming (codes);
if (dis < radius) {
int64_t id = store_pairs ? lo_build (list_no, j) : ids[j];
result.add (dis, id);
}
codes += code_size;
}
}
};
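// When store_pairs is set, the returned ids encode (inverted list, offset)
// pairs rather than user ids. A hedged sketch of the convention, using the
// lo_* helpers that appear elsewhere in faiss:
//   idx_t  lo      = lo_build (list_no, j);  // list_no << 32 | j
//   idx_t  list_no = lo_listno (lo);         // lo >> 32
//   size_t offset  = lo_offset (lo);         // lo & 0xffffffff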
@@ -332,29 +354,6 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
template <bool store_pairs>
BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) {
switch (code_size) {
#define HANDLE_CS(cs) \
case cs: \
return new IVFBinaryScannerL2<HammingComputer ## cs, store_pairs> (cs);
HANDLE_CS(4);
HANDLE_CS(8);
HANDLE_CS(16);
HANDLE_CS(20);
HANDLE_CS(32);
HANDLE_CS(64);
#undef HANDLE_CS
default:
if (code_size % 8 == 0) {
return new IVFBinaryScannerL2<HammingComputerM8,
store_pairs> (code_size);
} else if (code_size % 4 == 0) {
return new IVFBinaryScannerL2<HammingComputerM4,
store_pairs> (code_size);
} else {
return new IVFBinaryScannerL2<HammingComputerDefault,
store_pairs> (code_size);
}
}
}
@@ -425,8 +424,10 @@ void search_knn_hamming_heap(const IndexBinaryIVF& ivf,
ids = sids->get();
}
nheap += scanner->scan_codes (list_size, scodes.get(),
ids, simi, idxi, k);
nheap += scanner->scan_codes (
list_size, scodes.get(),
ids, simi, idxi, k
);
nscan += list_size;
if (max_codes && nscan >= max_codes)
@@ -586,11 +587,26 @@ void search_knn_hamming_count_1 (
BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner
(bool store_pairs) const
{
if (store_pairs) {
return select_IVFBinaryScannerL2<true> (code_size);
} else {
return select_IVFBinaryScannerL2<false> (code_size);
#define HC(name) return new IVFBinaryScannerL2<name> (code_size, store_pairs)
switch (code_size) {
case 4: HC(HammingComputer4);
case 8: HC(HammingComputer8);
case 16: HC(HammingComputer16);
case 20: HC(HammingComputer20);
case 32: HC(HammingComputer32);
case 64: HC(HammingComputer64);
default:
if (code_size % 8 == 0) {
HC(HammingComputerM8);
} else if (code_size % 4 == 0) {
HC(HammingComputerM4);
} else {
HC(HammingComputerDefault);
}
}
#undef HC
}
void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
@@ -616,6 +632,84 @@ void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
}
}
void IndexBinaryIVF::range_search(
idx_t n, const uint8_t *x, int radius,
RangeSearchResult *res) const
{
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
double t0 = getmillisecs();
quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
indexIVF_stats.quantization_time += getmillisecs() - t0;
t0 = getmillisecs();
invlists->prefetch_lists(idx.get(), n * nprobe);
bool store_pairs = false;
size_t nlistv = 0, ndis = 0;
std::vector<RangeSearchPartialResult *> all_pres (omp_get_max_threads());
#pragma omp parallel reduction(+: nlistv, ndis)
{
RangeSearchPartialResult pres(res);
std::unique_ptr<BinaryInvertedListScanner> scanner
(get_InvertedListScanner(store_pairs));
FAISS_THROW_IF_NOT (scanner.get ());
all_pres[omp_get_thread_num()] = &pres;
auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult &qres)
{
idx_t key = idx[i * nprobe + ik]; /* select the list */
if (key < 0) return;
FAISS_THROW_IF_NOT_FMT (
key < (idx_t) nlist,
"Invalid key=%ld at ik=%ld nlist=%ld\n",
key, ik, nlist);
const size_t list_size = invlists->list_size(key);
if (list_size == 0) return;
InvertedLists::ScopedCodes scodes (invlists, key);
InvertedLists::ScopedIds ids (invlists, key);
scanner->set_list (key, coarse_dis[i * nprobe + ik]);
nlistv++;
ndis += list_size;
scanner->scan_codes_range (list_size, scodes.get(),
ids.get(), radius, qres);
};
#pragma omp for
for (size_t i = 0; i < n; i++) {
scanner->set_query (x + i * code_size);
RangeQueryResult & qres = pres.new_result (i);
for (size_t ik = 0; ik < nprobe; ik++) {
scan_list_func (i, ik, qres);
}
}
pres.finalize();
}
indexIVF_stats.nq += n;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
indexIVF_stats.search_time += getmillisecs() - t0;
}
IndexBinaryIVF::~IndexBinaryIVF() {
if (own_invlists) {
delete invlists;


@@ -109,8 +109,11 @@ struct IndexBinaryIVF : IndexBinary {
bool store_pairs=false) const;
/** assign the vectors, then call search_preassign */
virtual void search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const override;
void search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const override;
void range_search(idx_t n, const uint8_t *x, int radius,
RangeSearchResult *result) const override;
void reconstruct(idx_t key, uint8_t *recons) const override;
@@ -202,6 +205,12 @@ struct BinaryInvertedListScanner {
int32_t *distances, idx_t *labels,
size_t k) const = 0;
virtual void scan_codes_range (size_t n,
const uint8_t *codes,
const idx_t *ids,
int radius,
RangeQueryResult &result) const = 0;
virtual ~BinaryInvertedListScanner () {}
};


@@ -19,6 +19,7 @@ namespace faiss {
/** Index that stores the full vectors and performs exhaustive search */
struct IndexFlat: Index {
/// database vectors, size ntotal * d
std::vector<float> xb;
@@ -144,7 +145,7 @@ struct IndexRefineFlat: Index {
};
/// optimized version for 1D "vectors"
/// optimized version for 1D "vectors".
struct IndexFlat1D:IndexFlatL2 {
bool continuous_update; ///< is the permutation updated continuously?


@@ -612,7 +612,6 @@ InvertedListScanner *IndexIVF::get_InvertedListScanner (
void IndexIVF::reconstruct (idx_t key, float* recons) const
{
idx_t lo = direct_map.get (key);
reconstruct_from_offset (lo_listno(lo), lo_offset(lo), recons);
}


@@ -42,14 +42,14 @@ struct IndexIVFPQ: IndexIVF {
int polysemous_ht; ///< Hamming thresh for polysemous filtering
/** Precompute table that speed up query preprocessing at some
* memory cost
* memory cost (used only for by_residual with L2 metric)
* =-1: force disable
* =0: decide heuristically (default: use tables only if they are
* < precomputed_tables_max_bytes)
* =1: tables that work for all quantizers (size 256 * nlist * M)
* =2: specific version for MultiIndexQuantizer (much more compact)
*/
int use_precomputed_table; ///< if by_residual, build precompute tables
int use_precomputed_table;
static size_t precomputed_table_max_bytes;
/// if use_precompute_table
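A hedged sketch of driving this setting (assuming IndexIVFPQ's public precompute_table() method; quantizer, d, nlist and M are placeholders):

faiss::IndexIVFPQ index (&quantizer, d, nlist, M, 8);
index.use_precomputed_table = 1;  // force the generic 256 * nlist * M tables
index.precompute_table ();        // rebuild the tables after changing the setting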
@@ -93,9 +93,9 @@ struct IndexIVFPQ: IndexIVF {
* the duplicates are returned in pre-allocated arrays (see the
* max sizes).
*
* @params lims limits between groups of duplicates
* @param lims limits between groups of duplicates
* (max size ntotal / 2 + 1)
* @params ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
* @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
* duplicates (max size ntotal)
* @return n number of groups found
*/
@@ -135,15 +135,14 @@ struct IndexIVFPQ: IndexIVF {
/// statistics are robust to internal threading, but not if
/// IndexIVFPQ::search_preassigned is called by multiple threads
struct IndexIVFPQStats {
size_t nrefine; // nb of refines (IVFPQR)
size_t nrefine; ///< nb of refines (IVFPQR)
size_t n_hamming_pass;
// nb of passed Hamming distance tests (for polysemous)
///< nb of passed Hamming distance tests (for polysemous)
// timings measured with the CPU RTC
// on all threads
// timings measured with the CPU RTC on all threads
size_t search_cycles;
size_t refine_cycles; // only for IVFPQR
size_t refine_cycles; ///< only for IVFPQR
IndexIVFPQStats () {reset (); }
void reset ();


@@ -0,0 +1,185 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cstdio>
#include <cstdlib>
#include <faiss/Clustering.h>
#include <faiss/utils/random.h>
#include <faiss/utils/distances.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexHNSW.h>
namespace {
enum WeightedKMeansType {
WKMT_FlatL2,
WKMT_FlatIP,
WKMT_FlatIP_spherical,
WKMT_HNSW,
};
float weighted_kmeans_clustering (size_t d, size_t n, size_t k,
const float *input,
const float *weights,
float *centroids,
WeightedKMeansType index_num)
{
using namespace faiss;
Clustering clus (d, k);
clus.verbose = true;
std::unique_ptr<Index> index;
switch (index_num) {
case WKMT_FlatL2:
index.reset(new IndexFlatL2 (d));
break;
case WKMT_FlatIP:
index.reset(new IndexFlatIP (d));
break;
case WKMT_FlatIP_spherical:
index.reset(new IndexFlatIP (d));
clus.spherical = true;
break;
case WKMT_HNSW:
IndexHNSWFlat *ihnsw = new IndexHNSWFlat (d, 32);
ihnsw->hnsw.efSearch = 128;
index.reset(ihnsw);
break;
}
clus.train(n, input, *index.get(), weights);
// on output the index contains the centroids.
memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
return clus.iteration_stats.back().obj;
}
int d = 32;
float sigma = 0.1;
#define BIGTEST
#ifdef BIGTEST
// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
int nc = 200000;
int n_big = 4;
int n_small = 2;
#else
int nc = 5;
int n_big = 100;
int n_small = 10;
#endif
int n; // number of training points
void generate_trainset (std::vector<float> & ccent,
std::vector<float> & x,
std::vector<float> & weights)
{
// same sampling as test_build_blocks.py test_weighted
ccent.resize (d * 2 * nc);
faiss::float_randn (ccent.data(), d * 2 * nc, 123);
faiss::fvec_renorm_L2 (d, 2 * nc, ccent.data());
n = nc * n_big + nc * n_small;
x.resize(d * n);
weights.resize(n);
faiss::float_randn (x.data(), x.size(), 1234);
float *xi = x.data();
float *w = weights.data();
for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
int np = ci < nc ? n_big : n_small; // nb of points around this centroid
for (int i = 0; i < np; i++) {
for (int j = 0; j < d; j++) {
xi[j] = xi[j] * sigma + ccent[ci * d + j];
}
*w++ = ci < nc ? 0.1 : 10;
xi += d;
}
}
}
}
int main(int argc, char **argv) {
std::vector<float> ccent;
std::vector<float> x;
std::vector<float> weights;
printf("generate training set\n");
generate_trainset(ccent, x, weights);
std::vector<float> centroids;
centroids.resize(nc * d);
int the_index_num = -1;
int the_with_weights = -1;
if (argc == 3) {
the_index_num = atoi(argv[1]);
the_with_weights = atoi(argv[2]);
}
for (int index_num = WKMT_FlatL2;
index_num <= WKMT_HNSW;
index_num++) {
if (the_index_num >= 0 && index_num != the_index_num) {
continue;
}
for (int with_weights = 0; with_weights <= 1; with_weights++) {
if (the_with_weights >= 0 && with_weights != the_with_weights) {
continue;
}
printf("=================== index_num=%d Run %s weights\n",
index_num, with_weights ? "with" : "without");
weighted_kmeans_clustering (
d, n, nc, x.data(),
with_weights ? weights.data() : nullptr,
centroids.data(), (WeightedKMeansType)index_num
);
{ // compute distance of points to centroids
faiss::IndexFlatL2 cent_index(d);
cent_index.add(nc, centroids.data());
std::vector<float> dis (n);
std::vector<faiss::Index::idx_t> idx (n);
cent_index.search (nc * 2, ccent.data(), 1,
dis.data(), idx.data());
float dis1 = 0, dis2 = 0;
for (int i = 0; i < nc ; i++) {
dis1 += dis[i];
}
printf("average distance of points from big clusters: %g\n",
dis1 / nc);
for (int i = 0; i < nc ; i++) {
dis2 += dis[i + nc];
}
printf("average distance of points from small clusters: %g\n",
dis2 / nc);
}
}
}
return 0;
}


@@ -29,8 +29,6 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
config),
config_(std::move(config)),
data_(nullptr) {
verifySettings_();
// Flat index doesn't need training
this->is_trained = true;
@@ -44,8 +42,6 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
GpuIndex(resources, dims, metric, 0, config),
config_(std::move(config)),
data_(nullptr) {
verifySettings_();
// Flat index doesn't need training
this->is_trained = true;
@@ -298,21 +294,6 @@ GpuIndexFlat::compute_residual_n(faiss::Index::idx_t n,
fromDevice<float, 2>(residualDevice, residuals, stream);
}
void
GpuIndexFlat::verifySettings_() const {
// If we want Hgemm, ensure that it is supported on this device
if (config_.useFloat16Accumulator) {
FAISS_THROW_IF_NOT_MSG(config_.useFloat16,
"useFloat16Accumulator can only be enabled "
"with useFloat16");
FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device),
"Device %d does not support Hgemm "
"(useFloat16Accumulator)",
config_.device);
}
}
//
// GpuIndexFlatL2
//


@@ -25,17 +25,12 @@ struct FlatIndex;
struct GpuIndexFlatConfig : public GpuIndexConfig {
inline GpuIndexFlatConfig()
: useFloat16(false),
useFloat16Accumulator(false),
storeTransposed(false) {
}
/// Whether or not data is stored as float16
bool useFloat16;
/// This option is now deprecated and doesn't do anything. All accumulation of
/// float16 or float32 data is now done in float32.
bool useFloat16Accumulator;
/// Whether or not data is stored (transparently) in a transposed
/// layout, enabling use of the NN GEMM call, which is ~10% faster.
/// This will improve the speed of the flat index, but will
@@ -123,10 +118,6 @@ class GpuIndexFlat : public GpuIndex {
float* distances,
faiss::Index::idx_t* labels) const override;
private:
/// Checks user settings for consistency
void verifySettings_() const;
protected:
/// Our config object
const GpuIndexFlatConfig config_;
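A hedged configuration sketch (res and d are placeholders); only useFloat16 changes storage now that the accumulator flag is a no-op:

faiss::gpu::StandardGpuResources res;
faiss::gpu::GpuIndexFlatConfig config;
config.useFloat16 = true;  // store the vectors as float16
// config.useFloat16Accumulator is deprecated: accumulation is float32 anyway
faiss::gpu::GpuIndexFlatL2 index (&res, d, config);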


@@ -62,6 +62,22 @@ FlatIndex::reserve(size_t numVecs, cudaStream_t stream) {
}
}
template <>
Tensor<float, 2, true>&
FlatIndex::getVectorsRef<float>() {
// Should not call this unless we are in float32 mode
FAISS_ASSERT(!useFloat16_);
return getVectorsFloat32Ref();
}
template <>
Tensor<half, 2, true>&
FlatIndex::getVectorsRef<half>() {
// Should not call this unless we are in float16 mode
FAISS_ASSERT(useFloat16_);
return getVectorsFloat16Ref();
}
Tensor<float, 2, true>&
FlatIndex::getVectorsFloat32Ref() {
// Should not call this unless we are in float32 mode


@@ -26,16 +26,23 @@ class FlatIndex {
bool storeTransposed,
MemorySpace space);
/// Whether or not this flat index primarily stores data in float16
bool getUseFloat16() const;
/// Returns the number of vectors we contain
int getSize() const;
/// Returns the dimensionality of the vectors
int getDim() const;
/// Reserve storage that can contain at least this many vectors
void reserve(size_t numVecs, cudaStream_t stream);
/// Returns the vectors based on the type desired; the FlatIndex must be of
/// the same type (float16 or float32) to not assert
template <typename T>
Tensor<T, 2, true>& getVectorsRef();
/// Returns a reference to our vectors currently in use
Tensor<float, 2, true>& getVectorsFloat32Ref();
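The templated accessor lets code that is itself templated on the centroid type (as IVFPQ now is) fetch the correctly-typed storage. A hedged sketch:

template <typename CentroidT>
int numVectors (faiss::gpu::FlatIndex& fi) {
    // FAISS_ASSERTs unless CentroidT matches the index's float16 mode
    faiss::gpu::Tensor<CentroidT, 2, true>& vecs =
        fi.getVectorsRef<CentroidT>();
    return vecs.getSize (0);
}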


@@ -123,8 +123,6 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
FAISS_ASSERT(vecs.getSize(1) == dim_);
FAISS_ASSERT(!quantizer_->getUseFloat16());
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
auto& mem = resources_->getMemoryManagerCurrentDevice();
auto stream = resources_->getDefaultStreamCurrentDevice();
@@ -155,7 +153,13 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
DeviceTensor<float, 2, true> residuals(
mem, {vecs.getSize(0), vecs.getSize(1)}, stream);
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
if (quantizer_->getUseFloat16()) {
auto& coarseCentroids = quantizer_->getVectorsFloat16Ref();
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
} else {
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
}
// Residuals are in the form
// (vec x numSubQuantizer x dimPerSubQuantizer)
@@ -437,8 +441,9 @@ IVFPQ::setPQCentroids_(float* data) {
pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
}
template <typename CentroidT>
void
IVFPQ::precomputeCodes_() {
IVFPQ::precomputeCodesT_() {
FAISS_ASSERT(metric_ == MetricType::METRIC_L2);
//
@@ -449,8 +454,6 @@ IVFPQ::precomputeCodes_() {
// Terms 1 and 3 are available only at query time. We compute term 2
// here.
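// For reference (hedged expansion, consistent with the comment above):
// with query x, coarse centroid y_C and PQ residual y_R,
//   || x - y_C - y_R ||^2 = || x - y_C ||^2          (term 1, query time)
//                         + || y_R ||^2 + 2 y_C.y_R  (term 2, precomputed here)
//                         - 2 x.y_R                  (term 3, query time)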
FAISS_ASSERT(!quantizer_->getUseFloat16());
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
// Compute ||y_R||^2 by treating
// (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
@@ -473,9 +476,10 @@ IVFPQ::precomputeCodes_() {
// (centroid id)(sub q)(dim)
// Transpose (centroid id)(sub q)(sub dim) to
// (sub q)(centroid id)(sub dim)
auto centroidView = coarseCentroids.view<3>(
auto& coarseCentroids = quantizer_->template getVectorsRef<CentroidT>();
auto centroidView = coarseCentroids.template view<3>(
{coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
DeviceTensor<float, 3, true> centroidsTransposed(
DeviceTensor<CentroidT, 3, true> centroidsTransposed(
{numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});
runTransposeAny(centroidView, 0, 1, centroidsTransposed,
@@ -521,6 +525,15 @@ IVFPQ::precomputeCodes_() {
}
}
void
IVFPQ::precomputeCodes_() {
if (quantizer_->getUseFloat16()) {
precomputeCodesT_<half>();
} else {
precomputeCodesT_<float>();
}
}
void
IVFPQ::query(Tensor<float, 2, true>& queries,
int nprobe,
@@ -688,16 +701,16 @@ IVFPQ::runPQPrecomputedCodes_(
resources_);
}
template <typename CentroidT>
void
IVFPQ::runPQNoPrecomputedCodes_(
IVFPQ::runPQNoPrecomputedCodesT_(
Tensor<float, 2, true>& queries,
DeviceTensor<float, 2, true>& coarseDistances,
DeviceTensor<int, 2, true>& coarseIndices,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices) {
FAISS_ASSERT(!quantizer_->getUseFloat16());
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
auto& coarseCentroids = quantizer_->template getVectorsRef<CentroidT>();
runPQScanMultiPassNoPrecomputed(queries,
coarseCentroids,
@@ -719,4 +732,29 @@ IVFPQ::runPQNoPrecomputedCodes_(
resources_);
}
void
IVFPQ::runPQNoPrecomputedCodes_(
Tensor<float, 2, true>& queries,
DeviceTensor<float, 2, true>& coarseDistances,
DeviceTensor<int, 2, true>& coarseIndices,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices) {
if (quantizer_->getUseFloat16()) {
runPQNoPrecomputedCodesT_<half>(queries,
coarseDistances,
coarseIndices,
k,
outDistances,
outIndices);
} else {
runPQNoPrecomputedCodesT_<float>(queries,
coarseDistances,
coarseIndices,
k,
outDistances,
outIndices);
}
}
} } // namespace


@@ -83,6 +83,11 @@ class IVFPQ : public IVFBase {
/// Calculate precomputed residual distance information
void precomputeCodes_();
/// Calculate precomputed residual distance information (for different coarse
/// centroid type)
template <typename CentroidT>
void precomputeCodesT_();
/// Runs kernels for scanning inverted lists with precomputed codes
void runPQPrecomputedCodes_(Tensor<float, 2, true>& queries,
DeviceTensor<float, 2, true>& coarseDistances,
@@ -99,6 +104,16 @@ class IVFPQ : public IVFBase {
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices);
/// Runs kernels for scanning inverted lists without precomputed codes (for
/// different coarse centroid type)
template <typename CentroidT>
void runPQNoPrecomputedCodesT_(Tensor<float, 2, true>& queries,
DeviceTensor<float, 2, true>& coarseDistances,
DeviceTensor<int, 2, true>& coarseIndices,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices);
private:
/// Number of sub-quantizers per vector
const int numSubQuantizers_;


@@ -0,0 +1,561 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/impl/BroadcastSum.cuh>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/impl/L2Norm.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/DeviceDefs.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Float16.cuh>
#include <faiss/gpu/utils/MatrixMult.cuh>
#include <faiss/gpu/utils/PtxUtils.cuh>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/gpu/utils/Transpose.cuh>
namespace faiss { namespace gpu {
// Kernel responsible for calculating distance from residual vector to
// each product quantizer code centroid
template <typename OutCodeT,
typename CentroidT,
int DimsPerSubQuantizer,
bool L2Distance>
__global__ void
__launch_bounds__(288, 4)
pqCodeDistances(Tensor<float, 2, true> queries,
int queriesPerBlock,
Tensor<CentroidT, 2, true> coarseCentroids,
Tensor<float, 3, true> pqCentroids,
Tensor<int, 2, true> topQueryToCentroid,
// (query id)(coarse)(subquantizer)(code) -> dist
Tensor<OutCodeT, 4, true> outCodeDistances) {
const auto numSubQuantizers = pqCentroids.getSize(0);
const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
assert(DimsPerSubQuantizer == dimsPerSubQuantizer);
const auto codesPerSubQuantizer = pqCentroids.getSize(2);
bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer;
int loadingThreadId = threadIdx.x - codesPerSubQuantizer;
extern __shared__ float smem[];
// Each thread calculates a single code
float subQuantizerData[DimsPerSubQuantizer];
auto code = threadIdx.x;
auto subQuantizer = blockIdx.y;
// Each thread will load the pq centroid data for the code that it
// is processing
#pragma unroll
for (int i = 0; i < DimsPerSubQuantizer; ++i) {
subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg();
}
// Where we store our query vector
float* smemQuery = smem;
// Where we store our residual vector; this is double buffered so we
// can be loading the next one while processing the current one
float* smemResidual1 = &smemQuery[DimsPerSubQuantizer];
float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer];
// Where we pre-load the coarse centroid IDs
int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer];
// Each thread is calculating the distance for a single code,
// performing the reductions locally
// Handle multiple queries per block
auto startQueryId = blockIdx.x * queriesPerBlock;
auto numQueries = queries.getSize(0) - startQueryId;
if (numQueries > queriesPerBlock) {
numQueries = queriesPerBlock;
}
for (int query = 0; query < numQueries; ++query) {
auto queryId = startQueryId + query;
auto querySubQuantizer =
queries[queryId][subQuantizer * DimsPerSubQuantizer].data();
// Load current query vector
for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) {
smemQuery[i] = querySubQuantizer[i];
}
// Load list of coarse centroids found
for (int i = threadIdx.x;
i < topQueryToCentroid.getSize(1); i += blockDim.x) {
coarseIds[i] = topQueryToCentroid[queryId][i];
}
// We need coarseIds below
// FIXME: investigate loading separately, so we don't need this
__syncthreads();
// Preload first buffer of residual data
if (isLoadingThread) {
for (int i = loadingThreadId;
i < DimsPerSubQuantizer;
i += blockDim.x - codesPerSubQuantizer) {
auto coarseId = coarseIds[0];
// In case NaNs were in the original query data
coarseId = coarseId == -1 ? 0 : coarseId;
auto coarseCentroidSubQuantizer =
coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data();
if (L2Distance) {
smemResidual1[i] = smemQuery[i] -
ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
} else {
smemResidual1[i] =
ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
}
}
}
// The block walks the list for a single query
for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) {
// Wait for smemResidual1 to be loaded
__syncthreads();
if (isLoadingThread) {
// Preload second buffer of residual data
for (int i = loadingThreadId;
i < DimsPerSubQuantizer;
i += blockDim.x - codesPerSubQuantizer) {
// FIXME: try always making this centroid id 0 so we can
// terminate
if (coarse != (topQueryToCentroid.getSize(1) - 1)) {
auto coarseId = coarseIds[coarse + 1];
// In case NaNs were in the original query data
coarseId = coarseId == -1 ? 0 : coarseId;
auto coarseCentroidSubQuantizer =
coarseCentroids[coarseId]
[subQuantizer * dimsPerSubQuantizer].data();
if (L2Distance) {
smemResidual2[i] = smemQuery[i] -
ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
} else {
smemResidual2[i] =
ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
}
}
}
} else {
// These are the processing threads
float dist = 0.0f;
constexpr int kUnroll = 4;
constexpr int kRemainder = DimsPerSubQuantizer % kUnroll;
constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder;
float vals[kUnroll];
// Calculate residual - pqCentroid for each dim that we're
// processing
// Unrolled loop
if (L2Distance) {
#pragma unroll
for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] = smemResidual1[i * kUnroll + j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] -= subQuantizerData[i * kUnroll + j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] *= vals[j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
dist += vals[j];
}
}
} else {
// Inner product: query slice against the reconstructed sub-quantizer
// for this coarse cell (query o (centroid + subQCentroid))
#pragma unroll
for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] = smemResidual1[i * kUnroll + j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] += subQuantizerData[i * kUnroll + j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] *= smemQuery[i * kUnroll + j];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
dist += vals[j];
}
}
}
// Remainder loop
if (L2Distance) {
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] = smemResidual1[kRemainderBase + j];
}
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] -= subQuantizerData[kRemainderBase + j];
}
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] *= vals[j];
}
} else {
// Inner product
// Inner product: query slice against the reconstructed sub-quantizer
// for this coarse cell (query o (centroid + subQCentroid))
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] = smemResidual1[kRemainderBase + j];
}
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] += subQuantizerData[kRemainderBase + j];
}
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
vals[j] *= smemQuery[kRemainderBase + j];
}
}
#pragma unroll
for (int j = 0; j < kRemainder; ++j) {
dist += vals[j];
}
// We have the distance for our code; write it out
outCodeDistances[queryId][coarse][subQuantizer][code] =
ConvertTo<OutCodeT>::to(dist);
} // !isLoadingThread
// Swap residual buffers
float* tmp = smemResidual1;
smemResidual1 = smemResidual2;
smemResidual2 = tmp;
}
}
}
template <typename CentroidT>
__global__ void
residualVector(Tensor<float, 2, true> queries,
Tensor<CentroidT, 2, true> coarseCentroids,
Tensor<int, 2, true> topQueryToCentroid,
int numSubDim,
// output is transposed:
// (sub q)(query id)(centroid id)(sub dim)
Tensor<float, 4, true> residual) {
// block x is query id
// block y is centroid id
// thread x is dim
auto queryId = blockIdx.x;
auto centroidId = blockIdx.y;
int realCentroidId = topQueryToCentroid[queryId][centroidId];
for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) {
float q = queries[queryId][dim];
float c = ConvertTo<float>::to(coarseCentroids[realCentroidId][dim]);
residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] = q - c;
}
}
template <typename CentroidT>
void
runResidualVector(Tensor<float, 3, true>& pqCentroids,
Tensor<float, 2, true>& queries,
Tensor<CentroidT, 2, true>& coarseCentroids,
Tensor<int, 2, true>& topQueryToCentroid,
Tensor<float, 4, true>& residual,
cudaStream_t stream) {
auto grid =
dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1));
auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice()));
residualVector<<<grid, block, 0, stream>>>(
queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1),
residual);
CUDA_TEST_ERROR();
}
template <typename CentroidT>
void
runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
Tensor<float, 2, true>& queries,
Tensor<CentroidT, 2, true>& coarseCentroids,
Tensor<int, 2, true>& topQueryToCentroid,
NoTypeTensor<4, true>& outCodeDistances,
bool useFloat16Lookup,
DeviceMemory& mem,
cublasHandle_t handle,
cudaStream_t stream) {
// Calculate (q - c) residual vector
// (sub q)(query id)(centroid id)(sub dim)
DeviceTensor<float, 4, true> residual(
mem,
{pqCentroids.getSize(0),
topQueryToCentroid.getSize(0),
topQueryToCentroid.getSize(1),
pqCentroids.getSize(1)},
stream);
runResidualVector(pqCentroids, queries,
coarseCentroids, topQueryToCentroid,
residual, stream);
// Calculate ||q - c||^2
DeviceTensor<float, 1, true> residualNorms(
mem,
{pqCentroids.getSize(0) *
topQueryToCentroid.getSize(0) *
topQueryToCentroid.getSize(1)},
stream);
auto residualView2 = residual.view<2>(
{pqCentroids.getSize(0) *
topQueryToCentroid.getSize(0) *
topQueryToCentroid.getSize(1),
pqCentroids.getSize(1)});
runL2Norm(residualView2, true, residualNorms, true, stream);
// Perform a batch MM:
// (sub q) x {(q * c)(sub dim) x (sub dim)(code)} =>
// (sub q) x {(q * c)(code)}
auto residualView3 = residual.view<3>(
{pqCentroids.getSize(0),
topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
pqCentroids.getSize(1)});
DeviceTensor<float, 3, true> residualDistance(
mem,
{pqCentroids.getSize(0),
topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
pqCentroids.getSize(2)},
stream);
runIteratedMatrixMult(residualDistance, false,
residualView3, false,
pqCentroids, false,
-2.0f, 0.0f,
handle,
stream);
// Sum ||q - c||^2 along rows
auto residualDistanceView2 = residualDistance.view<2>(
{pqCentroids.getSize(0) *
topQueryToCentroid.getSize(0) *
topQueryToCentroid.getSize(1),
pqCentroids.getSize(2)});
runSumAlongRows(residualNorms, residualDistanceView2, false, stream);
Tensor<float, 4, true> outCodeDistancesF;
DeviceTensor<float, 4, true> outCodeDistancesFloatMem;
if (useFloat16Lookup) {
outCodeDistancesFloatMem = DeviceTensor<float, 4, true>(
mem, {outCodeDistances.getSize(0),
outCodeDistances.getSize(1),
outCodeDistances.getSize(2),
outCodeDistances.getSize(3)},
stream);
outCodeDistancesF = outCodeDistancesFloatMem;
} else {
outCodeDistancesF = outCodeDistances.toTensor<float>();
}
// Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which
// is where we build our output distances)
auto outCodeDistancesView = outCodeDistancesF.view<3>(
{topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
outCodeDistances.getSize(2),
outCodeDistances.getSize(3)});
runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream);
// Calculate code norms per each sub-dim
// (sub q)(sub dim)(code) is pqCentroids
// transpose to (sub q)(code)(sub dim)
DeviceTensor<float, 3, true> pqCentroidsTranspose(
mem,
{pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)},
stream);
runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream);
auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>(
{pqCentroids.getSize(0) * pqCentroids.getSize(2),
pqCentroids.getSize(1)});
DeviceTensor<float, 1, true> pqCentroidsNorm(
mem,
{pqCentroids.getSize(0) * pqCentroids.getSize(2)},
stream);
runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream);
// View output as (q * c)(sub q * code), and add centroid norm to
// each row
auto outDistancesCodeViewCols = outCodeDistancesView.view<2>(
{topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
outCodeDistances.getSize(2) * outCodeDistances.getSize(3)});
runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
if (useFloat16Lookup) {
// Need to convert back
auto outCodeDistancesH = outCodeDistances.toTensor<half>();
convertTensor<float, half, 4>(stream,
outCodeDistancesF,
outCodeDistancesH);
}
}
template <typename CentroidT>
void
runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
Tensor<float, 2, true>& queries,
Tensor<CentroidT, 2, true>& coarseCentroids,
Tensor<int, 2, true>& topQueryToCentroid,
NoTypeTensor<4, true>& outCodeDistances,
bool l2Distance,
bool useFloat16Lookup,
cudaStream_t stream) {
const auto numSubQuantizers = pqCentroids.getSize(0);
const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
const auto codesPerSubQuantizer = pqCentroids.getSize(2);
// FIXME: tune
// Reuse of pq centroid data is based on both # of queries * nprobe,
// and we should really be tiling in both dimensions
constexpr int kQueriesPerBlock = 8;
auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock),
numSubQuantizers);
// Reserve one block of threads for double buffering
// FIXME: probably impractical for large # of dims?
auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize);
auto block = dim3(codesPerSubQuantizer + loadingThreads);
auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
+ topQueryToCentroid.getSize(1) * sizeof(int);
#define RUN_CODE(DIMS, L2) \
do { \
if (useFloat16Lookup) { \
auto outCodeDistancesT = outCodeDistances.toTensor<half>(); \
\
pqCodeDistances<half, CentroidT, DIMS, L2><<<grid, block, smem, stream>>>( \
queries, kQueriesPerBlock, \
coarseCentroids, pqCentroids, \
topQueryToCentroid, outCodeDistancesT); \
} else { \
auto outCodeDistancesT = outCodeDistances.toTensor<float>(); \
\
pqCodeDistances<float, CentroidT, DIMS, L2><<<grid, block, smem, stream>>>( \
queries, kQueriesPerBlock, \
coarseCentroids, pqCentroids, \
topQueryToCentroid, outCodeDistancesT); \
} \
} while (0)
#define CODE_L2(DIMS) \
do { \
if (l2Distance) { \
RUN_CODE(DIMS, true); \
} else { \
RUN_CODE(DIMS, false); \
} \
} while (0)
switch (dimsPerSubQuantizer) {
case 1:
CODE_L2(1);
break;
case 2:
CODE_L2(2);
break;
case 3:
CODE_L2(3);
break;
case 4:
CODE_L2(4);
break;
case 6:
CODE_L2(6);
break;
case 8:
CODE_L2(8);
break;
case 10:
CODE_L2(10);
break;
case 12:
CODE_L2(12);
break;
case 16:
CODE_L2(16);
break;
case 20:
CODE_L2(20);
break;
case 24:
CODE_L2(24);
break;
case 28:
CODE_L2(28);
break;
case 32:
CODE_L2(32);
break;
// FIXME: larger sizes require too many registers - we need the
// MM implementation working
default:
FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer "
"not currently supported");
}
#undef RUN_CODE
#undef CODE_L2
CUDA_TEST_ERROR();
}
} } // namespace


@@ -20,18 +20,20 @@ class DeviceMemory;
/// Calculates the distance from the (query - centroid) residual to
/// each sub-code vector, for the given list of query results in
/// topQueryToCentroid
template <typename CentroidT>
void runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
Tensor<float, 2, true>& queries,
Tensor<float, 2, true>& coarseCentroids,
Tensor<CentroidT, 2, true>& coarseCentroids,
Tensor<int, 2, true>& topQueryToCentroid,
NoTypeTensor<4, true>& outCodeDistances,
bool l2Distance,
bool useFloat16Lookup,
cudaStream_t stream);
template <typename CentroidT>
void runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
Tensor<float, 2, true>& queries,
Tensor<float, 2, true>& coarseCentroids,
Tensor<CentroidT, 2, true>& coarseCentroids,
Tensor<int, 2, true>& topQueryToCentroid,
NoTypeTensor<4, true>& outCodeDistances,
bool useFloat16Lookup,
@@ -40,3 +42,5 @@ void runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
cudaStream_t stream);
} } // namespace
#include <faiss/gpu/impl/PQCodeDistances-inl.cuh>


@@ -0,0 +1,599 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/impl/PQCodeDistances.cuh>
#include <faiss/gpu/impl/PQCodeLoad.cuh>
#include <faiss/gpu/impl/IVFUtils.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Float16.cuh>
#include <faiss/gpu/utils/LoadStoreOperators.cuh>
#include <faiss/gpu/utils/NoTypeTensor.cuh>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/gpu/utils/HostTensor.cuh>
namespace faiss { namespace gpu {
// This must be kept in sync with PQCodeDistances.cu
inline bool isSupportedNoPrecomputedSubDimSize(int dims) {
switch (dims) {
case 1:
case 2:
case 3:
case 4:
case 6:
case 8:
case 10:
case 12:
case 16:
case 20:
case 24:
case 28:
case 32:
return true;
default:
// FIXME: larger sizes require too many registers - we need the
// MM implementation working
return false;
}
}
template <typename LookupT, typename LookupVecT>
struct LoadCodeDistances {
static inline __device__ void load(LookupT* smem,
LookupT* codes,
int numCodes) {
constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT);
// We can only use the vector type if the data is guaranteed to be
// aligned. The codes are innermost, so if it is evenly divisible,
// then any slice will be aligned.
if (numCodes % kWordSize == 0) {
// Load the data by float4 for efficiency, and then handle any remainder
// limitVec is the number of whole vec words we can load, in terms
// of whole blocks performing the load
constexpr int kUnroll = 2;
int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x);
limitVec *= kUnroll * blockDim.x;
LookupVecT* smemV = (LookupVecT*) smem;
LookupVecT* codesV = (LookupVecT*) codes;
for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) {
LookupVecT vals[kUnroll];
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] =
LoadStore<LookupVecT>::load(&codesV[i + j * blockDim.x]);
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
LoadStore<LookupVecT>::store(&smemV[i + j * blockDim.x], vals[j]);
}
}
// This is where we start loading the remainder that does not evenly
// fit into kUnroll x blockDim.x
int remainder = limitVec * kWordSize;
for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) {
smem[i] = codes[i];
}
} else {
// Potential unaligned load
constexpr int kUnroll = 4;
int limit = utils::roundDown(numCodes, kUnroll * blockDim.x);
int i = threadIdx.x;
for (; i < limit; i += kUnroll * blockDim.x) {
LookupT vals[kUnroll];
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
vals[j] = codes[i + j * blockDim.x];
}
#pragma unroll
for (int j = 0; j < kUnroll; ++j) {
smem[i + j * blockDim.x] = vals[j];
}
}
for (; i < numCodes; i += blockDim.x) {
smem[i] = codes[i];
}
}
}
};
template <int NumSubQuantizers, typename LookupT, typename LookupVecT>
__global__ void
pqScanNoPrecomputedMultiPass(Tensor<float, 2, true> queries,
Tensor<float, 3, true> pqCentroids,
Tensor<int, 2, true> topQueryToCentroid,
Tensor<LookupT, 4, true> codeDistances,
void** listCodes,
int* listLengths,
Tensor<int, 2, true> prefixSumOffsets,
Tensor<float, 1, true> distance) {
const auto codesPerSubQuantizer = pqCentroids.getSize(2);
// Where the pq code -> residual distance is stored
extern __shared__ char smemCodeDistances[];
LookupT* codeDist = (LookupT*) smemCodeDistances;
// Each block handles a single query
auto queryId = blockIdx.y;
auto probeId = blockIdx.x;
// This is where we start writing out data
// We ensure that before the array (at offset -1), there is a 0 value
int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1);
float* distanceOut = distance[outBase].data();
auto listId = topQueryToCentroid[queryId][probeId];
// Safety guard in case NaNs in input cause no list ID to be generated
if (listId == -1) {
return;
}
unsigned char* codeList = (unsigned char*) listCodes[listId];
int limit = listLengths[listId];
constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 :
(NumSubQuantizers / 4);
unsigned int code32[kNumCode32];
unsigned int nextCode32[kNumCode32];
// We double-buffer the code loading, which improves memory utilization
if (threadIdx.x < limit) {
LoadCode32<NumSubQuantizers>::load(code32, codeList, threadIdx.x);
}
LoadCodeDistances<LookupT, LookupVecT>::load(
codeDist,
codeDistances[queryId][probeId].data(),
codeDistances.getSize(2) * codeDistances.getSize(3));
// Prevent WAR dependencies
__syncthreads();
// Each thread handles one code element in the list, with a
// block-wide stride
for (int codeIndex = threadIdx.x;
codeIndex < limit;
codeIndex += blockDim.x) {
// Prefetch next codes
if (codeIndex + blockDim.x < limit) {
LoadCode32<NumSubQuantizers>::load(
nextCode32, codeList, codeIndex + blockDim.x);
}
float dist = 0.0f;
#pragma unroll
for (int word = 0; word < kNumCode32; ++word) {
constexpr int kBytesPerCode32 =
NumSubQuantizers < 4 ? NumSubQuantizers : 4;
if (kBytesPerCode32 == 1) {
auto code = code32[0];
dist = ConvertTo<float>::to(codeDist[code]);
} else {
#pragma unroll
for (int byte = 0; byte < kBytesPerCode32; ++byte) {
auto code = getByte(code32[word], byte * 8, 8);
auto offset =
codesPerSubQuantizer * (word * kBytesPerCode32 + byte);
dist += ConvertTo<float>::to(codeDist[offset + code]);
}
}
}
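// This is the standard ADC lookup: each byte of the packed PQ code
// indexes the corresponding subquantizer's table of
// codesPerSubQuantizer precomputed distances (256 entries for 8-bit
// codes), and the query-to-code distance is the sum of the selected
// table entries over all subquantizers.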
// Write out intermediate distance result
// We do not maintain indices here, in order to reduce global
// memory traffic. Those are recovered in the final selection step.
distanceOut[codeIndex] = dist;
// Rotate buffers
#pragma unroll
for (int word = 0; word < kNumCode32; ++word) {
code32[word] = nextCode32[word];
}
}
}
template <typename CentroidT>
void
runMultiPassTile(Tensor<float, 2, true>& queries,
Tensor<CentroidT, 2, true>& centroids,
Tensor<float, 3, true>& pqCentroidsInnermostCode,
NoTypeTensor<4, true>& codeDistances,
Tensor<int, 2, true>& topQueryToCentroid,
bool useFloat16Lookup,
int bytesPerCode,
int numSubQuantizers,
int numSubQuantizerCodes,
thrust::device_vector<void*>& listCodes,
thrust::device_vector<void*>& listIndices,
IndicesOptions indicesOptions,
thrust::device_vector<int>& listLengths,
Tensor<char, 1, true>& thrustMem,
Tensor<int, 2, true>& prefixSumOffsets,
Tensor<float, 1, true>& allDistances,
Tensor<float, 3, true>& heapDistances,
Tensor<int, 3, true>& heapIndices,
int k,
faiss::MetricType metric,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices,
cudaStream_t stream) {
// We only support two metrics at the moment
FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT ||
metric == MetricType::METRIC_L2);
bool l2Distance = metric == MetricType::METRIC_L2;
// Calculate offset lengths, so we know where to write out
// intermediate results
runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
thrustMem, stream);
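// e.g. if the first three selected lists have lengths {3, 5, 2}, the
// stored inclusive sums are {3, 8, 10}, with a 0 guaranteed just before
// the array; list i then writes its distances at offsets
// [sum[i - 1], sum[i]) into the output.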
// Calculate residual code distances, since this is without
// precomputed codes
runPQCodeDistances(pqCentroidsInnermostCode,
queries,
centroids,
topQueryToCentroid,
codeDistances,
l2Distance,
useFloat16Lookup,
stream);
// Convert all codes to a distance, and write out (distance,
// index) values for all intermediate results
{
auto kThreadsPerBlock = 256;
auto grid = dim3(topQueryToCentroid.getSize(1),
topQueryToCentroid.getSize(0));
auto block = dim3(kThreadsPerBlock);
// pq centroid distances
auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
smem *= numSubQuantizers * numSubQuantizerCodes;
FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \
do { \
auto codeDistancesT = codeDistances.toTensor<LOOKUP_T>(); \
\
pqScanNoPrecomputedMultiPass<NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T> \
<<<grid, block, smem, stream>>>( \
queries, \
pqCentroidsInnermostCode, \
topQueryToCentroid, \
codeDistancesT, \
listCodes.data().get(), \
listLengths.data().get(), \
prefixSumOffsets, \
allDistances); \
} while (0)
#define RUN_PQ(NUM_SUB_Q) \
do { \
if (useFloat16Lookup) { \
RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \
} else { \
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
} \
} while (0)
switch (bytesPerCode) {
case 1:
RUN_PQ(1);
break;
case 2:
RUN_PQ(2);
break;
case 3:
RUN_PQ(3);
break;
case 4:
RUN_PQ(4);
break;
case 8:
RUN_PQ(8);
break;
case 12:
RUN_PQ(12);
break;
case 16:
RUN_PQ(16);
break;
case 20:
RUN_PQ(20);
break;
case 24:
RUN_PQ(24);
break;
case 28:
RUN_PQ(28);
break;
case 32:
RUN_PQ(32);
break;
case 40:
RUN_PQ(40);
break;
case 48:
RUN_PQ(48);
break;
case 56:
RUN_PQ(56);
break;
case 64:
RUN_PQ(64);
break;
case 96:
RUN_PQ(96);
break;
default:
FAISS_ASSERT(false);
break;
}
#undef RUN_PQ
#undef RUN_PQ_OPT
}
CUDA_TEST_ERROR();
// k-select the output in chunks, to increase parallelism
runPass1SelectLists(prefixSumOffsets,
allDistances,
topQueryToCentroid.getSize(1),
k,
!l2Distance, // L2 distance chooses smallest
heapDistances,
heapIndices,
stream);
// k-select final output
auto flatHeapDistances = heapDistances.downcastInner<2>();
auto flatHeapIndices = heapIndices.downcastInner<2>();
runPass2SelectLists(flatHeapDistances,
flatHeapIndices,
listIndices,
indicesOptions,
prefixSumOffsets,
topQueryToCentroid,
k,
!l2Distance, // L2 distance chooses smallest
outDistances,
outIndices,
stream);
}
template <typename CentroidT>
void
runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
Tensor<CentroidT, 2, true>& centroids,
Tensor<float, 3, true>& pqCentroidsInnermostCode,
Tensor<int, 2, true>& topQueryToCentroid,
bool useFloat16Lookup,
int bytesPerCode,
int numSubQuantizers,
int numSubQuantizerCodes,
thrust::device_vector<void*>& listCodes,
thrust::device_vector<void*>& listIndices,
IndicesOptions indicesOptions,
thrust::device_vector<int>& listLengths,
int maxListLength,
int k,
faiss::MetricType metric,
// output
Tensor<float, 2, true>& outDistances,
// output
Tensor<long, 2, true>& outIndices,
GpuResources* res) {
constexpr int kMinQueryTileSize = 8;
constexpr int kMaxQueryTileSize = 128;
constexpr int kThrustMemSize = 16384;
int nprobe = topQueryToCentroid.getSize(1);
auto& mem = res->getMemoryManagerCurrentDevice();
auto stream = res->getDefaultStreamCurrentDevice();
// Make a reservation for Thrust to do its dirty work (global memory
// cross-block reduction space); hopefully this is large enough.
DeviceTensor<char, 1, true> thrustMem1(
mem, {kThrustMemSize}, stream);
DeviceTensor<char, 1, true> thrustMem2(
mem, {kThrustMemSize}, stream);
DeviceTensor<char, 1, true>* thrustMem[2] =
{&thrustMem1, &thrustMem2};
// How much temporary storage is available?
// If possible, we'd like to fit within the space available.
size_t sizeAvailable = mem.getSizeAvailable();
// We run two passes of heap selection
// This is the size of the first-level heap passes
constexpr int kNProbeSplit = 8;
int pass2Chunks = std::min(nprobe, kNProbeSplit);
size_t sizeForFirstSelectPass =
pass2Chunks * k * (sizeof(float) + sizeof(int));
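// e.g. with nprobe = 32 and kNProbeSplit = 8, pass 1 k-selects within 8
// chunks of 4 lists each, leaving 8 * k candidates per query, and pass 2
// merges those partial results into the final k; this exposes more
// parallelism than a single selection over all 32 lists.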
// How much temporary storage we need per query
size_t sizePerQuery =
2 * // streams
((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets
nprobe * maxListLength * sizeof(float) + // allDistances
// residual distances
nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
sizeForFirstSelectPass);
int queryTileSize = (int) (sizeAvailable / sizePerQuery);
if (queryTileSize < kMinQueryTileSize) {
queryTileSize = kMinQueryTileSize;
} else if (queryTileSize > kMaxQueryTileSize) {
queryTileSize = kMaxQueryTileSize;
}
// FIXME: we should adjust queryTileSize to deal with this, since
// indexing is in int32
FAISS_ASSERT(queryTileSize * nprobe * maxListLength <
std::numeric_limits<int>::max());
// Temporary memory buffers
// Make sure there is space prior to the start which will be 0, and
// will handle the boundary condition without branches
DeviceTensor<int, 1, true> prefixSumOffsetSpace1(
mem, {queryTileSize * nprobe + 1}, stream);
DeviceTensor<int, 1, true> prefixSumOffsetSpace2(
mem, {queryTileSize * nprobe + 1}, stream);
DeviceTensor<int, 2, true> prefixSumOffsets1(
prefixSumOffsetSpace1[1].data(),
{queryTileSize, nprobe});
DeviceTensor<int, 2, true> prefixSumOffsets2(
prefixSumOffsetSpace2[1].data(),
{queryTileSize, nprobe});
DeviceTensor<int, 2, true>* prefixSumOffsets[2] =
{&prefixSumOffsets1, &prefixSumOffsets2};
// Make sure the element before prefixSumOffsets is 0, since we
// depend upon simple, boundary-less indexing to get proper results
CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(),
0,
sizeof(int),
stream));
CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(),
0,
sizeof(int),
stream));
int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float);
int totalCodeDistancesSize =
queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
codeDistanceTypeSize;
DeviceTensor<char, 1, true> codeDistances1Mem(
mem, {totalCodeDistancesSize}, stream);
NoTypeTensor<4, true> codeDistances1(
codeDistances1Mem.data(),
codeDistanceTypeSize,
{queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
DeviceTensor<char, 1, true> codeDistances2Mem(
mem, {totalCodeDistancesSize}, stream);
NoTypeTensor<4, true> codeDistances2(
codeDistances2Mem.data(),
codeDistanceTypeSize,
{queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
NoTypeTensor<4, true>* codeDistances[2] =
{&codeDistances1, &codeDistances2};
DeviceTensor<float, 1, true> allDistances1(
mem, {queryTileSize * nprobe * maxListLength}, stream);
DeviceTensor<float, 1, true> allDistances2(
mem, {queryTileSize * nprobe * maxListLength}, stream);
DeviceTensor<float, 1, true>* allDistances[2] =
{&allDistances1, &allDistances2};
DeviceTensor<float, 3, true> heapDistances1(
mem, {queryTileSize, pass2Chunks, k}, stream);
DeviceTensor<float, 3, true> heapDistances2(
mem, {queryTileSize, pass2Chunks, k}, stream);
DeviceTensor<float, 3, true>* heapDistances[2] =
{&heapDistances1, &heapDistances2};
DeviceTensor<int, 3, true> heapIndices1(
mem, {queryTileSize, pass2Chunks, k}, stream);
DeviceTensor<int, 3, true> heapIndices2(
mem, {queryTileSize, pass2Chunks, k}, stream);
DeviceTensor<int, 3, true>* heapIndices[2] =
{&heapIndices1, &heapIndices2};
auto streams = res->getAlternateStreamsCurrentDevice();
streamWait(streams, {stream});
int curStream = 0;
for (int query = 0; query < queries.getSize(0); query += queryTileSize) {
int numQueriesInTile =
std::min(queryTileSize, queries.getSize(0) - query);
auto prefixSumOffsetsView =
prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile);
auto codeDistancesView =
codeDistances[curStream]->narrowOutermost(0, numQueriesInTile);
auto coarseIndicesView =
topQueryToCentroid.narrowOutermost(query, numQueriesInTile);
auto queryView =
queries.narrowOutermost(query, numQueriesInTile);
auto heapDistancesView =
heapDistances[curStream]->narrowOutermost(0, numQueriesInTile);
auto heapIndicesView =
heapIndices[curStream]->narrowOutermost(0, numQueriesInTile);
auto outDistanceView =
outDistances.narrowOutermost(query, numQueriesInTile);
auto outIndicesView =
outIndices.narrowOutermost(query, numQueriesInTile);
runMultiPassTile(queryView,
centroids,
pqCentroidsInnermostCode,
codeDistancesView,
coarseIndicesView,
useFloat16Lookup,
bytesPerCode,
numSubQuantizers,
numSubQuantizerCodes,
listCodes,
listIndices,
indicesOptions,
listLengths,
*thrustMem[curStream],
prefixSumOffsetsView,
*allDistances[curStream],
heapDistancesView,
heapIndicesView,
k,
metric,
outDistanceView,
outIndicesView,
streams[curStream]);
curStream = (curStream + 1) % 2;
}
streamWait({stream}, streams);
}
} } // namespace


@@ -21,8 +21,9 @@ class GpuResources;
/// per subquantizer?
bool isSupportedNoPrecomputedSubDimSize(int dims);
template <typename CentroidT>
void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
Tensor<float, 2, true>& centroids,
Tensor<CentroidT, 2, true>& centroids,
Tensor<float, 3, true>& pqCentroidsInnermostCode,
Tensor<int, 2, true>& topQueryToCentroid,
bool useFloat16Lookup,
@@ -43,3 +44,5 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
GpuResources* res);
} } // namespace
#include <faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh>


@@ -76,7 +76,6 @@ int main(int argc, char** argv) {
GpuIndexFlatConfig config;
config.device = dev;
config.useFloat16 = FLAGS_use_float16;
config.useFloat16Accumulator = FLAGS_use_float16_math;
config.storeTransposed = FLAGS_transposed;
config.memorySpace = FLAGS_use_unified_mem ?
MemorySpace::Unified : MemorySpace::Device;


@@ -187,6 +187,41 @@ TEST(TestGpuIndexIVFPQ, Query_IP) {
}
}
TEST(TestGpuIndexIVFPQ, Float16Coarse) {
Options opt;
std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
faiss::IndexFlatL2 coarseQuantizer(opt.dim);
faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids,
opt.codes, opt.bitsPerCode);
cpuIndex.nprobe = opt.nprobe;
cpuIndex.train(opt.numTrain, trainVecs.data());
faiss::gpu::StandardGpuResources res;
res.noTempMemory();
faiss::gpu::GpuIndexIVFPQConfig config;
config.device = opt.device;
config.flatConfig.useFloat16 = true;
config.usePrecomputedTables = opt.usePrecomputed;
config.indicesOptions = opt.indicesOpt;
config.useFloat16LookupTables = opt.useFloat16;
faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
gpuIndex.setNumProbes(opt.nprobe);
gpuIndex.add(opt.numAdd, addVecs.data());
cpuIndex.add(opt.numAdd, addVecs.data());
faiss::gpu::compareIndices(cpuIndex, gpuIndex,
opt.numQuery, opt.dim, opt.k, opt.toString(),
opt.getCompareEpsilon(),
opt.getPctMaxDiff1(),
opt.getPctMaxDiffN());
}
TEST(TestGpuIndexIVFPQ, Add_L2) {
for (int tries = 0; tries < 2; ++tries) {
Options opt;


@@ -98,7 +98,9 @@ class EvalIVFPQAccuracy(unittest.TestCase):
D, Inew = gpu_index.search(xq, 10)
self.assertGreaterEqual((Iref == Inew).sum(), Iref.size)
# 0.99: allow some tolerance in results otherwise test
# fails occasionally (not reproducible)
self.assertGreaterEqual((Iref == Inew).sum(), Iref.size * 0.99)
def test_cpu_to_gpu_IVFPQ(self):
self.do_cpu_to_gpu('IVF128,PQ4')
@@ -267,6 +269,45 @@ class TestGPUKmeans(unittest.TestCase):
assert np.allclose(obj1, obj2)
class TestAlternativeDistances(unittest.TestCase):
def do_test(self, metric, metric_arg=0):
res = faiss.StandardGpuResources()
d = 32
nb = 1000
nq = 100
rs = np.random.RandomState(123)
xb = rs.rand(nb, d).astype('float32')
xq = rs.rand(nq, d).astype('float32')
index_ref = faiss.IndexFlat(d, metric)
index_ref.metric_arg = metric_arg
index_ref.add(xb)
Dref, Iref = index_ref.search(xq, 10)
# build from other index
index = faiss.GpuIndexFlat(res, index_ref)
Dnew, Inew = index.search(xq, 10)
np.testing.assert_array_equal(Inew, Iref)
np.testing.assert_allclose(Dnew, Dref, rtol=1e-6)
# build from scratch
index = faiss.GpuIndexFlat(res, d, metric)
index.metric_arg = metric_arg
index.add(xb)
Dnew, Inew = index.search(xq, 10)
np.testing.assert_array_equal(Inew, Iref)
def test_L1(self):
self.do_test(faiss.METRIC_L1)
def test_Linf(self):
self.do_test(faiss.METRIC_Linf)
def test_Lp(self):
self.do_test(faiss.METRIC_Lp, 0.7)
if __name__ == '__main__':


@@ -0,0 +1,160 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <cublas_v2.h>
#include <faiss/gpu/utils/Tensor.cuh>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/HostTensor.cuh>
#include <faiss/gpu/utils/Float16.cuh>
namespace faiss { namespace gpu {
class DeviceMemory;
template <typename T>
struct GetCudaType;
template <>
struct GetCudaType<float> {
static constexpr cudaDataType_t Type = CUDA_R_32F;
};
template <>
struct GetCudaType<half> {
static constexpr cudaDataType_t Type = CUDA_R_16F;
};
template <typename AT, typename BT>
cublasStatus_t
rawGemm(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float fAlpha,
const AT *A,
int lda,
const BT *B,
int ldb,
const float fBeta,
float *C,
int ldc) {
auto cAT = GetCudaType<AT>::Type;
auto cBT = GetCudaType<BT>::Type;
// Always accumulate in f32
return cublasSgemmEx(handle, transa, transb, m, n, k,
&fAlpha, A, cAT, lda,
B, cBT, ldb,
&fBeta,
C, CUDA_R_32F, ldc);
}
template <typename AT, typename BT>
void
runMatrixMult(Tensor<float, 2, true>& c, bool transC,
Tensor<AT, 2, true>& a, bool transA,
Tensor<BT, 2, true>& b, bool transB,
float alpha,
float beta,
cublasHandle_t handle,
cudaStream_t stream) {
cublasSetStream(handle, stream);
// Check that we have (m x k) * (k x n) = (m x n)
// using the input row-major layout
int aM = transA ? a.getSize(1) : a.getSize(0);
int aK = transA ? a.getSize(0) : a.getSize(1);
int bK = transB ? b.getSize(1) : b.getSize(0);
int bN = transB ? b.getSize(0) : b.getSize(1);
int cM = transC ? c.getSize(1) : c.getSize(0);
int cN = transC ? c.getSize(0) : c.getSize(1);
FAISS_ASSERT(aM == cM);
FAISS_ASSERT(aK == bK);
FAISS_ASSERT(bN == cN);
FAISS_ASSERT(a.getStride(1) == 1);
FAISS_ASSERT(b.getStride(1) == 1);
FAISS_ASSERT(c.getStride(1) == 1);
// Now, we have to represent the matrix multiplication in
// column-major layout
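// cuBLAS expects column-major storage, and a row-major matrix
// reinterpreted as column-major is its transpose. So instead of
// computing C = A * B directly, we compute C^T = B^T * A^T with the
// operands swapped; the column-major C^T that cuBLAS produces is then
// exactly the row-major C we want.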
float* pC = c.data();
int m = c.getSize(1); // stride 1 size
int n = c.getSize(0); // other size
int k = transA ? a.getSize(0) : a.getSize(1);
int lda = transC ? a.getStride(0) : b.getStride(0);
int ldb = transC ? b.getStride(0) : a.getStride(0);
int ldc = c.getStride(0);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
if (transC) {
gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T;
gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T;
}
cublasStatus_t err;
if (transC) {
err = rawGemm(handle,
gemmTrA, gemmTrB,
m, n, k, alpha,
a.data(), lda, b.data(), ldb, beta,
pC, ldc);
} else {
err = rawGemm(handle,
gemmTrA, gemmTrB,
m, n, k, alpha,
b.data(), lda, a.data(), ldb, beta,
pC, ldc);
}
FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS,
"cublas failed (%d): "
"(%d, %d)%s x (%d, %d)%s = (%d, %d)%s",
(int) err,
a.getSize(0), a.getSize(1), transA ? "'" : "",
b.getSize(0), b.getSize(1), transB ? "'" : "",
c.getSize(0), c.getSize(1), transC ? "'" : "");
CUDA_TEST_ERROR();
}
template <typename AT, typename BT>
void runIteratedMatrixMult(Tensor<float, 3, true>& c, bool transC,
Tensor<AT, 3, true>& a, bool transA,
Tensor<BT, 3, true>& b, bool transB,
float alpha,
float beta,
cublasHandle_t handle,
cudaStream_t stream) {
FAISS_ASSERT(c.getSize(0) == a.getSize(0));
FAISS_ASSERT(a.getSize(0) == b.getSize(0));
for (int i = 0; i < a.getSize(0); ++i) {
auto cView = c[i].view();
auto aView = a[i].view();
auto bView = b[i].view();
runMatrixMult(cView, transC,
aView, transA,
bView, transB,
alpha, beta, handle, stream);
}
}
} } // namespace


@@ -8,176 +8,9 @@
#include <faiss/gpu/utils/MatrixMult.cuh>
#include <faiss/gpu/utils/DeviceMemory.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Float16.cuh>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/HostTensor.cuh>
namespace faiss { namespace gpu {
template <typename T>
struct CublasGemm {
};
template <>
struct CublasGemm<float> {
static cublasStatus_t gemm(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
float fAlpha,
const float *A,
int lda,
const float *B,
int ldb,
float fBeta,
float *C,
int ldc) {
return cublasSgemm(handle, transa, transb, m, n, k,
&fAlpha, A, lda, B, ldb, &fBeta, C, ldc);
}
};
template <>
struct CublasGemm<half> {
static cublasStatus_t gemm(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float fAlpha,
const half *A,
int lda,
const half *B,
int ldb,
const float fBeta,
float *C,
int ldc) {
// Always accumulate in f32
return cublasSgemmEx(handle, transa, transb, m, n, k,
&fAlpha, A, CUDA_R_16F, lda,
B, CUDA_R_16F, ldb,
&fBeta,
C, CUDA_R_32F, ldc);
}
};
template <typename T>
void
runMatrixMult(Tensor<float, 2, true>& c, bool transC,
Tensor<T, 2, true>& a, bool transA,
Tensor<T, 2, true>& b, bool transB,
float alpha,
float beta,
cublasHandle_t handle,
cudaStream_t stream) {
cublasSetStream(handle, stream);
// Check that we have (m x k) * (k x n) = (m x n)
// using the input row-major layout
int aM = transA ? a.getSize(1) : a.getSize(0);
int aK = transA ? a.getSize(0) : a.getSize(1);
int bK = transB ? b.getSize(1) : b.getSize(0);
int bN = transB ? b.getSize(0) : b.getSize(1);
int cM = transC ? c.getSize(1) : c.getSize(0);
int cN = transC ? c.getSize(0) : c.getSize(1);
FAISS_ASSERT(aM == cM);
FAISS_ASSERT(aK == bK);
FAISS_ASSERT(bN == cN);
FAISS_ASSERT(a.getStride(1) == 1);
FAISS_ASSERT(b.getStride(1) == 1);
FAISS_ASSERT(c.getStride(1) == 1);
// Now, we have to represent the matrix multiplication in
// column-major layout
T* pA = transC ? a.data() : b.data();
T* pB = transC ? b.data() : a.data();
float* pC = c.data();
int m = c.getSize(1); // stride 1 size
int n = c.getSize(0); // other size
int k = transA ? a.getSize(0) : a.getSize(1);
int lda = transC ? a.getStride(0) : b.getStride(0);
int ldb = transC ? b.getStride(0) : a.getStride(0);
int ldc = c.getStride(0);
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
if (transC) {
gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T;
gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T;
}
auto err = CublasGemm<T>::gemm(handle,
gemmTrA, gemmTrB,
m, n, k, alpha,
pA, lda, pB, ldb, beta,
pC, ldc);
FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS,
"cublas failed (%d): "
"(%d, %d)%s x (%d, %d)%s = (%d, %d)%s",
(int) err,
a.getSize(0), a.getSize(1), transA ? "'" : "",
b.getSize(0), b.getSize(1), transB ? "'" : "",
c.getSize(0), c.getSize(1), transC ? "'" : "");
CUDA_TEST_ERROR();
}
void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
Tensor<float, 2, true>& a, bool transA,
Tensor<float, 2, true>& b, bool transB,
float alpha,
float beta,
cublasHandle_t handle,
cudaStream_t stream) {
return runMatrixMult<float>(c, transC, a, transA, b, transB,
alpha, beta, handle, stream);
}
void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
Tensor<half, 2, true>& a, bool transA,
Tensor<half, 2, true>& b, bool transB,
float alpha,
float beta,
cublasHandle_t handle,
cudaStream_t stream) {
return runMatrixMult<half>(c, transC, a, transA, b, transB,
alpha, beta, handle, stream);
}
void
runIteratedMatrixMult(Tensor<float, 3, true>& c, bool transC,
Tensor<float, 3, true>& a, bool transA,
Tensor<float, 3, true>& b, bool transB,
float alpha,
float beta,
cublasHandle_t handle,
cudaStream_t stream) {
FAISS_ASSERT(c.getSize(0) == a.getSize(0));
FAISS_ASSERT(a.getSize(0) == b.getSize(0));
for (int i = 0; i < a.getSize(0); ++i) {
auto cView = c[i].view();
auto aView = a[i].view();
auto bView = b[i].view();
runMatrixMult(cView, transC,
aView, transA,
bView, transB,
alpha, beta, handle, stream);
}
}
void
runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
Tensor<float, 3, true>& a, bool transA,


@@ -10,6 +10,9 @@
#include <cublas_v2.h>
#include <faiss/gpu/utils/Tensor.cuh>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/HostTensor.cuh>
#include <faiss/gpu/utils/Float16.cuh>
namespace faiss { namespace gpu {
@@ -17,30 +20,23 @@ class DeviceMemory;
/// C = alpha * A * B + beta * C
/// Expects row major layout, not fortran/blas column major!
void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
Tensor<float, 2, true>& a, bool transA,
Tensor<float, 2, true>& b, bool transB,
float alpha,
float beta,
cublasHandle_t handle,
cudaStream_t stream);
/// C = alpha * A * B + beta * C
/// Expects row major layout, not fortran/blas column major!
void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
Tensor<half, 2, true>& a, bool transA,
Tensor<half, 2, true>& b, bool transB,
float alpha,
float beta,
cublasHandle_t handle,
cudaStream_t stream);
template <typename AT, typename BT>
void
runMatrixMult(Tensor<float, 2, true>& c, bool transC,
Tensor<AT, 2, true>& a, bool transA,
Tensor<BT, 2, true>& b, bool transB,
float alpha,
float beta,
cublasHandle_t handle,
cudaStream_t stream);
/// C_i = alpha * A_i * B_i + beta * C_i
/// where `i` is the outermost dimension, via iterated gemm
/// Expects row major layout, not fortran/blas column major!
template <typename AT, typename BT>
void runIteratedMatrixMult(Tensor<float, 3, true>& c, bool transC,
Tensor<float, 3, true>& a, bool transA,
Tensor<float, 3, true>& b, bool transB,
Tensor<AT, 3, true>& a, bool transA,
Tensor<BT, 3, true>& b, bool transB,
float alpha,
float beta,
cublasHandle_t handle,
@@ -59,3 +55,5 @@ void runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
cudaStream_t stream);
} } // namespace
#include <faiss/gpu/utils/MatrixMult-inl.cuh>


@@ -51,9 +51,7 @@ struct RangeSearchResult {
};
/**
Encapsulates a set of ids to remove. */
/** Encapsulates a set of ids to remove. */
struct IDSelector {
typedef Index::idx_t idx_t;
virtual bool is_member (idx_t id) const = 0;


@@ -123,15 +123,15 @@ struct PolysemousTraining: SimulatedAnnealingParameters {
enum Optimization_type_t {
OT_None,
OT_ReproduceDistances_affine, ///< default
OT_Ranking_weighted_diff /// same as _2, but use rank of y+ - rank of y-
OT_Ranking_weighted_diff ///< same as _2, but use rank of y+ - rank of y-
};
Optimization_type_t optimization_type;
// use 1/4 of the training points for the optimization, with
// max. ntrain_permutation. If ntrain_permutation == 0: train on
// centroids
/** use 1/4 of the training points for the optimization, with
* max. ntrain_permutation. If ntrain_permutation == 0: train on
* centroids */
int ntrain_permutation;
double dis_weight_factor; // decay of exp that weights distance loss
double dis_weight_factor; ///< decay of exp that weights distance loss
// filename pattern for the logging of iterations
std::string log_pattern;


@@ -19,6 +19,7 @@
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/io.h>
#include <faiss/utils/hamming.h>
#include <faiss/IndexFlat.h>
#include <faiss/VectorTransform.h>
@@ -41,6 +42,7 @@
#include <faiss/IndexBinaryFromFloat.h>
#include <faiss/IndexBinaryHNSW.h>
#include <faiss/IndexBinaryIVF.h>
#include <faiss/IndexBinaryHash.h>
@@ -752,6 +754,56 @@ static void read_binary_ivf_header (
read_direct_map (&ivf->direct_map, f);
}
static void read_binary_hash_invlists (
IndexBinaryHash::InvertedListMap &invlists,
int b, IOReader *f)
{
size_t sz;
READ1 (sz);
int il_nbit = 0;
READ1 (il_nbit);
// buffer for bitstrings
std::vector<uint8_t> buf((b + il_nbit) * sz);
READVECTOR (buf);
BitstringReader rd (buf.data(), buf.size());
invlists.reserve (sz);
for (size_t i = 0; i < sz; i++) {
uint64_t hash = rd.read(b);
uint64_t ilsz = rd.read(il_nbit);
auto & il = invlists[hash];
READVECTOR (il.ids);
FAISS_THROW_IF_NOT (il.ids.size() == ilsz);
READVECTOR (il.vecs);
}
}
static void read_binary_multi_hash_map(
IndexBinaryMultiHash::Map &map,
int b, size_t ntotal,
IOReader *f)
{
int id_bits;
size_t sz;
READ1 (id_bits);
READ1 (sz);
std::vector<uint8_t> buf;
READVECTOR (buf);
size_t nbit = (b + id_bits) * sz + ntotal * id_bits;
FAISS_THROW_IF_NOT (buf.size() == (nbit + 7) / 8);
BitstringReader rd (buf.data(), buf.size());
map.reserve (sz);
for (size_t i = 0; i < sz; i++) {
uint64_t hash = rd.read(b);
uint64_t ilsz = rd.read(id_bits);
auto & il = map[hash];
for (size_t j = 0; j < ilsz; j++) {
il.push_back (rd.read (id_bits));
}
}
}
IndexBinary *read_index_binary (IOReader *f, int io_flags) {
IndexBinary * idx = nullptr;
uint32_t h;
@@ -793,6 +845,28 @@ IndexBinary *read_index_binary (IOReader *f, int io_flags) {
static_cast<IndexBinaryIDMap2*>(idxmap)->construct_rev_map ();
}
idx = idxmap;
} else if(h == fourcc("IBHh")) {
IndexBinaryHash *idxh = new IndexBinaryHash ();
read_index_binary_header (idxh, f);
READ1 (idxh->b);
READ1 (idxh->nflip);
read_binary_hash_invlists(idxh->invlists, idxh->b, f);
idx = idxh;
} else if(h == fourcc("IBHm")) {
IndexBinaryMultiHash* idxmh = new IndexBinaryMultiHash ();
read_index_binary_header (idxmh, f);
idxmh->storage = dynamic_cast<IndexBinaryFlat*> (read_index_binary (f));
FAISS_THROW_IF_NOT(idxmh->storage && idxmh->storage->ntotal == idxmh->ntotal);
idxmh->own_fields = true;
READ1 (idxmh->b);
READ1 (idxmh->nhash);
READ1 (idxmh->nflip);
idxmh->maps.resize (idxmh->nhash);
for (int i = 0; i < idxmh->nhash; i++) {
read_binary_multi_hash_map(
idxmh->maps[i], idxmh->b, idxmh->ntotal, f);
}
idx = idxmh;
} else {
FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
idx = nullptr;


@@ -19,6 +19,7 @@
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/io.h>
#include <faiss/utils/hamming.h>
#include <faiss/IndexFlat.h>
#include <faiss/VectorTransform.h>
@@ -41,6 +42,7 @@
#include <faiss/IndexBinaryFromFloat.h>
#include <faiss/IndexBinaryHNSW.h>
#include <faiss/IndexBinaryIVF.h>
#include <faiss/IndexBinaryHash.h>
@@ -515,6 +517,67 @@ static void write_binary_ivf_header (const IndexBinaryIVF *ivf, IOWriter *f) {
write_direct_map (&ivf->direct_map, f);
}
static void write_binary_hash_invlists (
const IndexBinaryHash::InvertedListMap &invlists,
int b, IOWriter *f)
{
size_t sz = invlists.size();
WRITE1 (sz);
size_t maxil = 0;
for (auto it = invlists.begin(); it != invlists.end(); ++it) {
if(it->second.ids.size() > maxil) {
maxil = it->second.ids.size();
}
}
int il_nbit = 0;
while(maxil >= ((uint64_t)1 << il_nbit)) {
il_nbit++;
}
WRITE1(il_nbit);
// first write sizes then data, may be useful if we want to
// memmap it at some point
// buffer for bitstrings
std::vector<uint8_t> buf (((b + il_nbit) * sz + 7) / 8);
BitstringWriter wr (buf.data(), buf.size());
for (auto it = invlists.begin(); it != invlists.end(); ++it) {
wr.write (it->first, b);
wr.write (it->second.ids.size(), il_nbit);
}
WRITEVECTOR (buf);
for (auto it = invlists.begin(); it != invlists.end(); ++it) {
WRITEVECTOR (it->second.ids);
WRITEVECTOR (it->second.vecs);
}
}
static void write_binary_multi_hash_map(
const IndexBinaryMultiHash::Map &map,
int b, size_t ntotal,
IOWriter *f)
{
int id_bits = 0;
while ((ntotal > ((Index::idx_t)1 << id_bits))) {
id_bits++;
}
WRITE1(id_bits);
size_t sz = map.size();
WRITE1(sz);
size_t nbit = (b + id_bits) * sz + ntotal * id_bits;
std::vector<uint8_t> buf((nbit + 7) / 8);
BitstringWriter wr (buf.data(), buf.size());
for (auto it = map.begin(); it != map.end(); ++it) {
wr.write(it->first, b);
wr.write(it->second.size(), id_bits);
for (auto id : it->second) {
wr.write(id, id_bits);
}
}
WRITEVECTOR (buf);
}
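// Worked example of the bit budget above: with ntotal = 1000, id_bits
// becomes 10 (the smallest width such that 2^id_bits >= ntotal), so a map
// of sz buckets over b-bit hashes costs (b + 10) * sz bits of bucket
// headers plus 1000 * 10 bits of packed ids, rounded up to whole bytes.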
void write_index_binary (const IndexBinary *idx, IOWriter *f) {
if (const IndexBinaryFlat *idxf =
dynamic_cast<const IndexBinaryFlat *> (idx)) {
@@ -551,6 +614,27 @@ void write_index_binary (const IndexBinary *idx, IOWriter *f) {
write_index_binary_header (idxmap, f);
write_index_binary (idxmap->index, f);
WRITEVECTOR (idxmap->id_map);
} else if (const IndexBinaryHash *idxh =
dynamic_cast<const IndexBinaryHash *> (idx)) {
uint32_t h = fourcc ("IBHh");
WRITE1 (h);
write_index_binary_header (idxh, f);
WRITE1 (idxh->b);
WRITE1 (idxh->nflip);
write_binary_hash_invlists(idxh->invlists, idxh->b, f);
} else if (const IndexBinaryMultiHash *idxmh =
dynamic_cast<const IndexBinaryMultiHash *> (idx)) {
uint32_t h = fourcc ("IBHm");
WRITE1 (h);
write_index_binary_header (idxmh, f);
write_index_binary (idxmh->storage, f);
WRITE1 (idxmh->b);
WRITE1 (idxmh->nhash);
WRITE1 (idxmh->nflip);
for (int i = 0; i < idxmh->nhash; i++) {
write_binary_multi_hash_map(
idxmh->maps[i], idxmh->b, idxmh->ntotal, f);
}
} else {
FAISS_THROW_MSG ("don't know how to serialize this type of index");
}


@@ -37,7 +37,6 @@ int IOWriter::fileno ()
***********************************************************************/
size_t VectorIOWriter::operator()(
const void *ptr, size_t size, size_t nitems)
{
@@ -132,6 +131,117 @@ int FileIOWriter::fileno() {
return ::fileno (f);
}
/***********************************************************************
* IO buffer
***********************************************************************/
BufferedIOReader::BufferedIOReader(IOReader *reader, size_t bsz, size_t totsz):
reader(reader), bsz(bsz), totsz(totsz), ofs(0), b0(0), b1(0), buffer(bsz)
{
}
size_t BufferedIOReader::operator()(void *ptr, size_t unitsize, size_t nitems)
{
size_t size = unitsize * nitems;
if (size == 0) return 0;
char * dst = (char*)ptr;
size_t nb;
{ // first copy available bytes
nb = std::min(b1 - b0, size);
memcpy (dst, buffer.data() + b0, nb);
b0 += nb;
dst += nb;
size -= nb;
}
if (size > totsz - ofs) {
size = totsz - ofs;
}
// while we would like to have more data
while (size > 0) {
assert (b0 == b1); // buffer empty on input
// try to read from main reader
b0 = 0;
b1 = (*reader)(buffer.data(), 1, std::min(bsz, size));
if (b1 == 0) {
// no more bytes available
break;
}
ofs += b1;
// copy as much of the newly read block as we still need
size_t nb2 = std::min(b1, size);
memcpy (dst, buffer.data(), nb2);
b0 = nb2;
nb += nb2;
dst += nb2;
size -= nb2;
}
return nb / unitsize;
}
BufferedIOWriter::BufferedIOWriter(IOWriter *writer, size_t bsz):
writer(writer), bsz(bsz), b0(0), buffer(bsz)
{
}
size_t BufferedIOWriter::operator()(const void *ptr, size_t unitsize, size_t nitems)
{
size_t size = unitsize * nitems;
if (size == 0) return 0;
const char * src = (const char*)ptr;
size_t nb;
{ // copy as many bytes as possible to buffer
nb = std::min(bsz - b0, size);
memcpy (buffer.data() + b0, src, nb);
b0 += nb;
src += nb;
size -= nb;
}
while (size > 0) {
assert(b0 == bsz);
// now we need to flush to add more bytes
size_t ofs = 0;
do {
assert (ofs < 10000000);
size_t written = (*writer)(buffer.data() + ofs, 1, bsz - ofs);
FAISS_THROW_IF_NOT(written > 0);
ofs += written;
} while(ofs != bsz);
// copy src to buffer
size_t nb1 = std::min(bsz, size);
memcpy (buffer.data(), src, nb1);
b0 = nb1;
nb += nb1;
src += nb1;
size -= nb1;
}
return nb / unitsize;
}
BufferedIOWriter::~BufferedIOWriter()
{
size_t ofs = 0;
while(ofs != b0) {
printf("Destructor write %ld \n", b0 - ofs);
size_t written = (*writer)(buffer.data() + ofs, 1, b0 - ofs);
FAISS_THROW_IF_NOT(written > 0);
ofs += written;
}
}
uint32_t fourcc (const char sx[4]) {
assert(4 == strlen(sx));
const unsigned char *x = (unsigned char*)sx;


@@ -9,6 +9,9 @@
/***********************************************************
* Abstract I/O objects
*
* I/O is always sequential, seek does not need to be supported
* (indexes could be read or written to a pipe).
***********************************************************/
#pragma once
@@ -92,6 +95,41 @@ struct FileIOWriter: IOWriter {
int fileno() override;
};
/*******************************************************
* Buffered reader + writer
*******************************************************/
/** wraps an ioreader to make buffered reads to avoid too small reads */
struct BufferedIOReader: IOReader {
IOReader *reader;
size_t bsz, totsz, ofs;
size_t b0, b1; ///< range of available bytes in the buffer
std::vector<char> buffer;
BufferedIOReader(IOReader *reader, size_t bsz,
size_t totsz=(size_t)(-1));
size_t operator()(void *ptr, size_t size, size_t nitems) override;
};
struct BufferedIOWriter: IOWriter {
IOWriter *writer;
size_t bsz, ofs;
size_t b0; ///< amount of data in buffer
std::vector<char> buffer;
BufferedIOWriter(IOWriter *writer, size_t bsz);
size_t operator()(const void *ptr, size_t size, size_t nitems) override;
/// flushes any remaining buffered data on destruction
~BufferedIOWriter();
};
/// cast a 4-character string to a uint32_t that can be written and read easily
uint32_t fourcc (const char sx[4]);
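A minimal Python-side sketch of the buffered reader (using the wrappers exposed in this commit; fname is a placeholder for an existing index file):

import faiss

# wrap the plain file reader in a 64 KiB buffer to avoid many tiny reads
reader = faiss.BufferedIOReader(faiss.FileIOReader(fname), 65536)
index = faiss.read_index(reader)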


@@ -283,6 +283,18 @@ def handle_IndexBinary(the_class):
swig_ptr(labels))
return distances, labels
def replacement_range_search(self, x, thresh):
n, d = x.shape
assert d * 8 == self.d
res = RangeSearchResult(n)
self.range_search_c(n, swig_ptr(x), thresh, res)
# get pointers and copy them
lims = rev_swig_ptr(res.lims, n + 1).copy()
nd = int(lims[-1])
D = rev_swig_ptr(res.distances, nd).copy()
I = rev_swig_ptr(res.labels, nd).copy()
return lims, D, I
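# results for query i are I[lims[i]:lims[i + 1]], with matching Hamming
# distances in D; lims has n + 1 entries and lims[-1] is the total
# number of results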
def replacement_remove_ids(self, x):
if isinstance(x, IDSelector):
sel = x
@@ -295,6 +307,7 @@ def handle_IndexBinary(the_class):
replace_method(the_class, 'add_with_ids', replacement_add_with_ids)
replace_method(the_class, 'train', replacement_train)
replace_method(the_class, 'search', replacement_search)
replace_method(the_class, 'range_search', replacement_range_search)
replace_method(the_class, 'reconstruct', replacement_reconstruct)
replace_method(the_class, 'remove_ids', replacement_remove_ids)
@@ -461,6 +474,9 @@ add_ref_in_constructor(IndexBinaryIDMap2, 0)
add_ref_in_method(IndexReplicas, 'addIndex', 0)
add_ref_in_method(IndexBinaryReplicas, 'addIndex', 0)
add_ref_in_constructor(BufferedIOWriter, 0)
add_ref_in_constructor(BufferedIOReader, 0)
# seems really marginal...
# remove_ref_from_method(IndexReplicas, 'removeIndex', 0)
@@ -751,9 +767,24 @@ def deserialize_index(data):
copy_array_to_vector(data, reader.data)
return read_index(reader)
def serialize_index_binary(index):
""" convert an index to a numpy uint8 array """
writer = VectorIOWriter()
write_index_binary(index, writer)
return vector_to_array(writer.data)
def deserialize_index_binary(data):
reader = VectorIOReader()
copy_array_to_vector(data, reader.data)
return read_index_binary(reader)
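# round-trip sketch (assuming `index` is a populated binary index):
#   buf = serialize_index_binary(index)    # numpy uint8 array
#   index2 = deserialize_index_binary(buf)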
###########################################
# ResultHeap
###########################################
class ResultHeap:
"""Combine query results from a sliced dataset. The final result will
"""Accumulate query results from a sliced dataset. The final result will
be in self.D, self.I."""
def __init__(self, nq, k):


@@ -32,7 +32,7 @@ are implemented on the GPU. It is developed by Facebook AI Research.
"""
setup(
name='faiss',
version='1.6.2',
version='1.6.3',
description='A library for efficient similarity search and clustering of dense vectors',
long_description=long_description,
url='https://github.com/facebookresearch/faiss',


@@ -93,6 +93,7 @@ extern "C" {
#include <faiss/IndexBinaryIVF.h>
#include <faiss/IndexBinaryFromFloat.h>
#include <faiss/IndexBinaryHNSW.h>
#include <faiss/IndexBinaryHash.h>
#include <faiss/impl/io.h>
#include <faiss/index_io.h>
@@ -359,6 +360,7 @@ void gpu_sync_all_devices()
%include <faiss/IndexBinaryIVF.h>
%include <faiss/IndexBinaryFromFloat.h>
%include <faiss/IndexBinaryHNSW.h>
%include <faiss/IndexBinaryHash.h>
@@ -979,6 +981,124 @@ struct MapLong2Long {
%}
/*******************************************************************
* Support I/O to arbitrary functions
*******************************************************************/
%inline %{
#ifdef SWIGPYTHON
struct PyCallbackIOWriter: faiss::IOWriter {
PyObject * callback;
size_t bs; // maximum write size
PyCallbackIOWriter(PyObject *callback,
size_t bs = 1024 * 1024):
callback(callback), bs(bs) {
Py_INCREF(callback);
name = "PyCallbackIOWriter";
}
size_t operator()(const void *ptrv, size_t size, size_t nitems) override {
size_t ws = size * nitems;
const char *ptr = (const char*)ptrv;
PyGILState_STATE gstate;
gstate = PyGILState_Ensure();
while(ws > 0) {
size_t wi = ws > bs ? bs : ws;
PyObject* bo = PyBytes_FromStringAndSize(ptr, wi);
PyObject *arglist = Py_BuildValue("(N)", bo);
if(!arglist) {
PyGILState_Release(gstate);
return 0;
}
ptr += wi;
ws -= wi;
PyObject * result = PyObject_CallObject(callback, arglist);
Py_DECREF(arglist);
if (result == NULL) {
PyGILState_Release(gstate);
return 0;
}
Py_DECREF(result);
}
PyGILState_Release(gstate);
return nitems;
}
~PyCallbackIOWriter() {
Py_DECREF(callback);
}
};
struct PyCallbackIOReader: faiss::IOReader {
PyObject * callback;
size_t bs; // maximum buffer size
PyCallbackIOReader(PyObject *callback,
size_t bs = 1024 * 1024):
callback(callback), bs(bs) {
Py_INCREF(callback);
name = "PyCallbackIOReader";
}
size_t operator()(void *ptrv, size_t size, size_t nitems) override {
size_t rs = size * nitems;
char *ptr = (char*)ptrv;
PyGILState_STATE gstate;
gstate = PyGILState_Ensure();
while(rs > 0) {
size_t ri = rs > bs ? bs : rs;
PyObject *arglist = Py_BuildValue("(n)", ri);
PyObject * result = PyObject_CallObject(callback, arglist);
Py_DECREF(arglist);
if (result == NULL) {
PyGILState_Release(gstate);
return 0;
}
if(!PyBytes_Check(result)) {
Py_DECREF(result);
PyErr_SetString(PyExc_RuntimeError,
"read callback did not return a bytes object");
PyGILState_Release(gstate);
throw faiss::FaissException("reader error");
}
size_t sz = PyBytes_Size(result);
if (sz == 0 || sz > rs) {
Py_DECREF(result);
PyErr_Format(PyExc_RuntimeError,
"read callback returned %ld bytes (asked %ld)",
sz, rs);
PyGILState_Release(gstate);
throw faiss::FaissException("reader error");
}
memcpy(ptr, PyBytes_AsString(result), sz);
Py_DECREF(result);
ptr += sz;
rs -= sz;
}
PyGILState_Release(gstate);
return nitems;
}
~PyCallbackIOReader() {
Py_DECREF(callback);
}
};
#endif
%}
%inline %{
void wait() {
// in gdb, use return to get out of this function

View File

@@ -97,3 +97,32 @@ def get_dataset_2(d, nt, nb, nq):
x = np.sin(x)
x = x.astype('float32')
return x[:nt], x[nt:nt + nb], x[nt + nb:]
def make_binary_dataset(d, nt, nb, nq):
assert d % 8 == 0
rs = np.random.RandomState(123)
x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8')
return x[:nt], x[nt:-nq], x[-nq:]
def compare_binary_result_lists(D1, I1, D2, I2):
"""comparing result lists is difficult because there are many
ties. Here we sort by (distance, index) pairs and ignore the largest
distance of each result. Compatible result lists should pass this."""
assert D1.shape == I1.shape == D2.shape == I2.shape
n, k = D1.shape
ndiff = (D1 != D2).sum()
assert ndiff == 0, '%d differences in distance matrix %s' % (
ndiff, D1.shape)
def normalize_DI(D, I):
norm = I.max() + 1.0
Dr = D.astype('float64') + I / norm
# ignore -1s and elements on last column
Dr[I == -1] = 1e20
Dr[D == D[:, -1:]] = 1e20
Dr.sort(axis=1)
return Dr
ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum()
assert ndiff == 0, '%d differences in normalized D matrix' % ndiff
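A toy illustration of this normalization (a sketch, not part of the test suite): tied hits may appear in any order and the last column is masked, so the following result lists compare equal:

import numpy as np

D1 = np.array([[1, 1, 5]]); I1 = np.array([[4, 7, 2]])
D2 = np.array([[1, 1, 5]]); I2 = np.array([[7, 4, 2]])
# both rows normalize to [1.5, 1.875, 1e20] before sorting, so
# compare_binary_result_lists(D1, I1, D2, I2) passes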


@@ -0,0 +1,183 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#!/usr/bin/env python3
import unittest
import numpy as np
import faiss
from common import make_binary_dataset
def bitvec_shuffle(a, order):
n, d = a.shape
db, = order.shape
b = np.empty((n, db // 8), dtype='uint8')
faiss.bitvec_shuffle(
n, d * 8, db,
faiss.swig_ptr(order),
faiss.swig_ptr(a), faiss.swig_ptr(b))
return b
class TestSmallFuncs(unittest.TestCase):
def test_shuffle(self):
d = 256
n = 1000
rs = np.random.RandomState(123)
o = rs.permutation(d).astype('int32')
x = rs.randint(256, size=(n, d // 8)).astype('uint8')
y1 = bitvec_shuffle(x, o[:128])
y2 = bitvec_shuffle(x, o[128:])
y = np.hstack((y1, y2))
oinv = np.empty(d, dtype='int32')
oinv[o] = np.arange(d)
z = bitvec_shuffle(y, oinv)
np.testing.assert_array_equal(x, z)
class TestRange(unittest.TestCase):
def test_hash(self):
d = 128
nq = 100
nb = 2000
(_, xb, xq) = make_binary_dataset(d, 0, nb, nq)
index_ref = faiss.IndexBinaryFlat(d)
index_ref.add(xb)
radius = 55
Lref, Dref, Iref = index_ref.range_search(xq, radius)
print("nb res: ", Lref[-1])
index = faiss.IndexBinaryHash(d, 10)
index.add(xb)
# index.display()
nfound = []
ndis = []
stats = faiss.cvar.indexBinaryHash_stats
for n_bitflips in range(index.b + 1):
index.nflip = n_bitflips
stats.reset()
Lnew, Dnew, Inew = index.range_search(xq, radius)
for i in range(nq):
ref = Iref[Lref[i]:Lref[i + 1]]
new = Inew[Lnew[i]:Lnew[i + 1]]
snew = set(new)
# no duplicates
self.assertTrue(len(new) == len(snew))
# subset of real results
self.assertTrue(snew <= set(ref))
nfound.append(Lnew[-1])
ndis.append(stats.ndis)
print('nfound=', nfound)
print('ndis=', ndis)
nfound = np.array(nfound)
self.assertTrue(nfound[-1] == Lref[-1])
self.assertTrue(np.all(nfound[1:] >= nfound[:-1]))
def test_multihash(self):
d = 128
nq = 100
nb = 2000
(_, xb, xq) = make_binary_dataset(d, 0, nb, nq)
index_ref = faiss.IndexBinaryFlat(d)
index_ref.add(xb)
radius = 55
Lref, Dref, Iref = index_ref.range_search(xq, radius)
print("nb res: ", Lref[-1])
nfound = []
ndis = []
for nh in 1, 3, 5:
index = faiss.IndexBinaryMultiHash(d, nh, 10)
index.add(xb)
# index.display()
stats = faiss.cvar.indexBinaryHash_stats
index.nflip = 2
stats.reset()
Lnew, Dnew, Inew = index.range_search(xq, radius)
for i in range(nq):
ref = Iref[Lref[i]:Lref[i + 1]]
new = Inew[Lnew[i]:Lnew[i + 1]]
snew = set(new)
# no duplicates
self.assertTrue(len(new) == len(snew))
# subset of real results
self.assertTrue(snew <= set(ref))
nfound.append(Lnew[-1])
ndis.append(stats.ndis)
print('nfound=', nfound)
print('ndis=', ndis)
nfound = np.array(nfound)
# self.assertTrue(nfound[-1] == Lref[-1])
self.assertTrue(np.all(nfound[1:] >= nfound[:-1]))
class TestKnn(unittest.TestCase):
def test_hash_and_multihash(self):
d = 128
nq = 100
nb = 2000
(_, xb, xq) = make_binary_dataset(d, 0, nb, nq)
index_ref = faiss.IndexBinaryFlat(d)
index_ref.add(xb)
k = 10
Dref, Iref = index_ref.search(xq, k)
nfound = {}
for nh in 0, 1, 3, 5:
for nbit in 4, 7:
if nh == 0:
index = faiss.IndexBinaryHash(d, nbit)
else:
index = faiss.IndexBinaryMultiHash(d, nh, nbit)
index.add(xb)
index.nflip = 2
Dnew, Inew = index.search(xq, k)
nf = 0
for i in range(nq):
ref = Iref[i]
new = Inew[i]
snew = set(new)
# no duplicates
self.assertTrue(len(new) == len(snew))
nf += len(set(ref) & snew)
print('nfound', nh, nbit, nf)
nfound[(nh, nbit)] = nf
self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)])
# test serialization
index2 = faiss.deserialize_index_binary(
faiss.serialize_index_binary(index))
D2, I2 = index2.search(xq, k)
np.testing.assert_array_equal(Inew, I2)
np.testing.assert_array_equal(Dnew, D2)
print('nfound=', nfound)
self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)]))
self.assertGreater(nfound[(3, 7)], nfound[(1, 7)])
self.assertGreater(nfound[(5, 7)], nfound[(3, 7)])


@@ -13,7 +13,7 @@ import faiss
import tempfile
import os
import re
import warnings
from common import get_dataset, get_dataset_2
@@ -24,7 +24,6 @@ class TestModuleInterface(unittest.TestCase):
assert re.match('^\\d+\\.\\d+\\.\\d+$', faiss.__version__)
class EvalIVFPQAccuracy(unittest.TestCase):
def test_IndexIVFPQ(self):
@@ -506,37 +505,6 @@ class TestHNSW(unittest.TestCase):
assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0])
class TestIOError(unittest.TestCase):
def test_io_error(self):
d, n = 32, 1000
x = np.random.uniform(size=(n, d)).astype('float32')
index = faiss.IndexFlatL2(d)
index.add(x)
_, fname = tempfile.mkstemp()
try:
faiss.write_index(index, fname)
# should be fine
faiss.read_index(fname)
# now damage file
data = open(fname, 'rb').read()
data = data[:int(len(data) / 2)]
open(fname, 'wb').write(data)
# should make a nice readable exception that mentions the filename
try:
faiss.read_index(fname)
except RuntimeError as e:
if fname not in str(e):
raise
else:
raise
finally:
if os.path.exists(fname):
os.unlink(fname)
class TestDistancesPositive(unittest.TestCase):


@@ -10,12 +10,8 @@ import numpy as np
import unittest
import faiss
from common import compare_binary_result_lists, make_binary_dataset
def make_binary_dataset(d, nt, nb, nq):
assert d % 8 == 0
rs = np.random.RandomState(123)
x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8')
return x[:nt], x[nt:-nq], x[-nq:]
def binary_to_float(x):
@@ -124,6 +120,29 @@ class TestBinaryFlat(unittest.TestCase):
assert(np.all(Iflat == -1))
assert(np.all(Dflat == 2147483647)) # NOTE(hoss): int32_t max
def test_range_search(self):
d = self.xq.shape[1] * 8
index = faiss.IndexBinaryFlat(d)
index.add(self.xb)
D, I = index.search(self.xq, 10)
thresh = int(np.median(D[:, -1]))
lims, D2, I2 = index.range_search(self.xq, thresh)
nt1 = nt2 = 0
for i in range(len(self.xq)):
range_res = I2[lims[i]:lims[i + 1]]
if thresh > D[i, -1]:
self.assertTrue(set(I[i]) <= set(range_res))
nt1 += 1
elif thresh < D[i, -1]:
self.assertTrue(set(range_res) <= set(I[i]))
nt2 += 1
# in case of equality we have a problem with ties
print('nb tests', nt1, nt2)
# nb tests is actually low...
self.assertTrue(nt1 > 19 and nt2 > 19)
class TestBinaryIVF(unittest.TestCase):
@@ -166,6 +185,29 @@ class TestBinaryIVF(unittest.TestCase):
self.assertEqual((self.Dref == Divfflat).sum(), 4122)
def test_ivf_range(self):
d = self.xq.shape[1] * 8
quantizer = faiss.IndexBinaryFlat(d)
index = faiss.IndexBinaryIVF(quantizer, d, 8)
index.cp.min_points_per_centroid = 5 # quiet warning
index.nprobe = 4
index.train(self.xt)
index.add(self.xb)
D, I = index.search(self.xq, 10)
radius = int(np.median(D[:, -1]) + 1)
Lr, Dr, Ir = index.range_search(self.xq, radius)
for i in range(len(self.xq)):
res = Ir[Lr[i]:Lr[i + 1]]
if D[i, -1] < radius:
self.assertTrue(set(I[i]) <= set(res))
else:
subset = I[i, D[i, :] < radius]
self.assertTrue(set(subset) == set(res))
def test_ivf_flat_empty(self):
d = self.xq.shape[1] * 8
@@ -257,27 +299,6 @@ class TestHNSW(unittest.TestCase):
self.assertTrue((Dref == Dbin).all())
def compare_binary_result_lists(D1, I1, D2, I2):
"""comparing result lists is difficult because there are many
ties. Here we sort by (distance, index) pairs and ignore the largest
distance of each result. Compatible result lists should pass this."""
assert D1.shape == I1.shape == D2.shape == I2.shape
n, k = D1.shape
ndiff = (D1 != D2).sum()
assert ndiff == 0, '%d differences in distance matrix %s' % (
ndiff, D1.shape)
def normalize_DI(D, I):
norm = I.max() + 1.0
Dr = D.astype('float64') + I / norm
# ignore -1s and elements on last column
Dr[I1 == -1] = 1e20
Dr[D == D[:, -1:]] = 1e20
Dr.sort(axis=1)
return Dr
ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum()
assert ndiff == 0, '%d differences in normalized D matrix' % ndiff
class TestReplicasAndShards(unittest.TestCase):

tests/test_io.py 100644

@@ -0,0 +1,220 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#!/usr/bin/env python3
import numpy as np
import unittest
import faiss
import tempfile
import os
import io
import sys
import warnings
from multiprocessing.dummy import Pool as ThreadPool
from common import get_dataset, get_dataset_2
class TestIOVariants(unittest.TestCase):
def test_io_error(self):
d, n = 32, 1000
x = np.random.uniform(size=(n, d)).astype('float32')
index = faiss.IndexFlatL2(d)
index.add(x)
_, fname = tempfile.mkstemp()
try:
faiss.write_index(index, fname)
# should be fine
faiss.read_index(fname)
# now damage file
data = open(fname, 'rb').read()
data = data[:int(len(data) / 2)]
open(fname, 'wb').write(data)
# should make a nice readable exception that mentions the filename
try:
faiss.read_index(fname)
except RuntimeError as e:
if fname not in str(e):
raise
else:
raise
finally:
if os.path.exists(fname):
os.unlink(fname)
class TestCallbacks(unittest.TestCase):
def do_write_callback(self, bsz):
d, n = 32, 1000
x = np.random.uniform(size=(n, d)).astype('float32')
index = faiss.IndexFlatL2(d)
index.add(x)
f = io.BytesIO()
# test with small block size
writer = faiss.PyCallbackIOWriter(f.write, 1234)
if bsz > 0:
writer = faiss.BufferedIOWriter(writer, bsz)
faiss.write_index(index, writer)
del writer # make sure all writes committed
if sys.version_info[0] < 3:
buf = f.getvalue()
else:
buf = f.getbuffer()
index2 = faiss.deserialize_index(np.frombuffer(buf, dtype='uint8'))
self.assertEqual(index.d, index2.d)
self.assertTrue(np.all(
faiss.vector_to_array(index.xb) == faiss.vector_to_array(index2.xb)
))
# This is not a callable function: should raise an exception
writer = faiss.PyCallbackIOWriter("blabla")
self.assertRaises(
Exception,
faiss.write_index, index, writer
)
def test_buf_read(self):
x = np.random.uniform(size=20)
_, fname = tempfile.mkstemp()
try:
x.tofile(fname)
f = open(fname, 'rb')
reader = faiss.PyCallbackIOReader(f.read, 1234)
bsz = 123
reader = faiss.BufferedIOReader(reader, bsz)
y = np.zeros_like(x)
print('nbytes=', y.nbytes)
reader(faiss.swig_ptr(y), y.nbytes, 1)
np.testing.assert_array_equal(x, y)
finally:
if os.path.exists(fname):
os.unlink(fname)
def do_read_callback(self, bsz):
d, n = 32, 1000
x = np.random.uniform(size=(n, d)).astype('float32')
index = faiss.IndexFlatL2(d)
index.add(x)
_, fname = tempfile.mkstemp()
try:
faiss.write_index(index, fname)
f = open(fname, 'rb')
reader = faiss.PyCallbackIOReader(f.read, 1234)
if bsz > 0:
reader = faiss.BufferedIOReader(reader, bsz)
index2 = faiss.read_index(reader)
self.assertEqual(index.d, index2.d)
np.testing.assert_array_equal(
faiss.vector_to_array(index.xb),
faiss.vector_to_array(index2.xb)
)
# This is not a callable function: should raise an exception
reader = faiss.PyCallbackIOReader("blabla")
self.assertRaises(
Exception,
faiss.read_index, reader
)
finally:
if os.path.exists(fname):
os.unlink(fname)
def test_write_callback(self):
self.do_write_callback(0)
def test_write_buffer(self):
self.do_write_callback(123)
self.do_write_callback(2345)
def test_read_callback(self):
self.do_read_callback(0)
def test_read_callback_buffered(self):
self.do_read_callback(123)
self.do_read_callback(12345)
def test_read_buffer(self):
d, n = 32, 1000
x = np.random.uniform(size=(n, d)).astype('float32')
index = faiss.IndexFlatL2(d)
index.add(x)
_, fname = tempfile.mkstemp()
try:
faiss.write_index(index, fname)
reader = faiss.BufferedIOReader(
faiss.FileIOReader(fname), 1234)
index2 = faiss.read_index(reader)
self.assertEqual(index.d, index2.d)
np.testing.assert_array_equal(
faiss.vector_to_array(index.xb),
faiss.vector_to_array(index2.xb)
)
finally:
if os.path.exists(fname):
os.unlink(fname)
def test_transfer_pipe(self):
""" transfer an index through a Unix pipe """
d, n = 32, 1000
x = np.random.uniform(size=(n, d)).astype('float32')
index = faiss.IndexFlatL2(d)
index.add(x)
Dref, Iref = index.search(x, 10)
rf, wf = os.pipe()
# start thread that will decompress the index
def index_from_pipe():
reader = faiss.PyCallbackIOReader(lambda size: os.read(rf, size))
return faiss.read_index(reader)
fut = ThreadPool(1).apply_async(index_from_pipe, ())
# write to pipe
writer = faiss.PyCallbackIOWriter(lambda b: os.write(wf, b))
faiss.write_index(index, writer)
index2 = fut.get()
# closing is not really useful but it does not hurt
os.close(wf)
os.close(rf)
Dnew, Inew = index2.search(x, 10)
np.testing.assert_array_equal(Iref, Inew)
np.testing.assert_array_equal(Dref, Dnew)


@@ -34,6 +34,7 @@
#include <faiss/utils/Heap.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/AuxIndexStructures.h>
static const size_t BLOCKSIZE_QUERY = 8192;
@@ -484,6 +485,30 @@ void bitvec_print (const uint8_t * b, size_t d)
}
void bitvec_shuffle (size_t n, size_t da, size_t db,
const int *order,
const uint8_t *a,
uint8_t *b)
{
for(size_t i = 0; i < db; i++) {
FAISS_THROW_IF_NOT (order[i] >= 0 && order[i] < da);
}
size_t lda = (da + 7) / 8;
size_t ldb = (db + 7) / 8;
#pragma omp parallel for if(n > 10000)
for (size_t i = 0; i < n; i++) {
const uint8_t *ai = a + i * lda;
uint8_t *bi = b + i * ldb;
memset (bi, 0, ldb);
for(size_t i = 0; i < db; i++) {
int o = order[i];
uint8_t the_bit = (ai[o >> 3] >> (o & 7)) & 1;
bi[i >> 3] |= the_bit << (i & 7);
}
}
}
@@ -527,6 +552,7 @@ void hammings_knn(
{
hammings_knn_hc(ha, a, b, nb, ncodes, order);
}
void hammings_knn_hc (
int_maxheap_array_t * ha,
const uint8_t * a,
@@ -610,7 +636,66 @@ void hammings_knn_mc(
}
}
}
template <class HammingComputer>
static
void hamming_range_search_template (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
int radius,
size_t code_size,
RangeSearchResult *res)
{
#pragma omp parallel
{
RangeSearchPartialResult pres (res);
#pragma omp for
for (size_t i = 0; i < na; i++) {
HammingComputer hc (a + i * code_size, code_size);
const uint8_t * yi = b;
RangeQueryResult & qres = pres.new_result (i);
for (size_t j = 0; j < nb; j++) {
int dis = hc.hamming (yi);
if (dis < radius) {
qres.add(dis, j);
}
yi += code_size;
}
}
pres.finalize ();
}
}
void hamming_range_search (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
int radius,
size_t code_size,
RangeSearchResult *result)
{
#define HC(name) hamming_range_search_template<name> (a, b, na, nb, radius, code_size, result)
switch(code_size) {
case 4: HC(HammingComputer4); break;
case 8: HC(HammingComputer8); break;
case 16: HC(HammingComputer16); break;
case 32: HC(HammingComputer32); break;
default:
if (code_size % 8 == 0) {
HC(HammingComputerM8);
} else {
HC(HammingComputerDefault);
}
}
#undef HC
}


@@ -39,6 +39,7 @@ namespace faiss {
* General bit vector functions
**************************************************/
struct RangeSearchResult;
void bitvec_print (const uint8_t * b, size_t d);
@@ -65,6 +66,14 @@ void bitvecs2fvecs (
void fvec2bitvec (const float * x, uint8_t * b, size_t d);
/** Shuffle the bits from b(i, j) := a(i, order[j])
*/
void bitvec_shuffle (size_t n, size_t da, size_t db,
const int *order,
const uint8_t *a,
uint8_t *b);
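// e.g. da = db = 8 with order = {7, 6, 5, 4, 3, 2, 1, 0} reverses the
// bit order within each byte: output bit j is input bit order[j].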
/***********************************************
* Generic reader/writer for bit strings
***********************************************/
@@ -171,6 +180,17 @@ void hammings_knn_mc (
int32_t *distances,
int64_t *labels);
/** same as hammings_knn except we are doing a range search with radius */
void hamming_range_search (
const uint8_t * a,
const uint8_t * b,
size_t na,
size_t nb,
int radius,
size_t ncodes,
RangeSearchResult *result);
/* Counting the number of matches or of cross-matches (without returning them)
For use with function that assume pre-allocated memory */
void hamming_count_thres (