Sync 20200323. (#1157)

* Sync 20200323.
* Bump version.
* Remove warning filter.

Refs: pull/1160/head, v1.6.3
parent fc2a1c1775
commit a17a631dc3

Clustering.h (12 changed lines)
@@ -34,7 +34,7 @@ struct ClusteringParameters {
     int seed;  ///< seed for the random number generator
 
-    size_t decode_block_size;  /// < how many vectors at a time to decode
+    size_t decode_block_size;  ///< how many vectors at a time to decode
 
     /// sets reasonable defaults
     ClusteringParameters ();

@@ -42,11 +42,11 @@ struct ClusteringParameters {
 
 struct ClusteringIterationStats {
-    float obj;                /// objective values (sum of distances reported by index)
-    double time;              /// seconds for iteration
-    double time_search;       /// seconds for just search
-    double imbalance_factor;  /// imbalance factor of iteration
-    int nsplit;               /// number of cluster splits
+    float obj;                ///< objective values (sum of distances reported by index)
+    double time;              ///< seconds for iteration
+    double time_search;       ///< seconds for just search
+    double imbalance_factor;  ///< imbalance factor of iteration
+    int nsplit;               ///< number of cluster splits
 };
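These per-iteration statistics are filled in by Clustering::train; a minimal sketch of reading them after a k-means run (not part of the patch; the dimensions and random data are arbitrary, and iteration_stats is the same field the weighted k-means demo further down in this commit reads):

// Sketch (not from the patch): running k-means and printing the
// per-iteration statistics described above.
#include <cstdio>
#include <vector>

#include <faiss/Clustering.h>
#include <faiss/IndexFlat.h>
#include <faiss/utils/random.h>

void report_kmeans_progress() {
    int d = 32, k = 100;
    size_t n = 10000;
    std::vector<float> x(n * d);
    faiss::float_rand(x.data(), x.size(), 1234);   // synthetic training data

    faiss::Clustering clus(d, k);
    faiss::IndexFlatL2 index(d);                   // index used for assignment
    clus.train(n, x.data(), index);

    for (const auto &s : clus.iteration_stats) {
        printf("obj=%g time=%.3fs imbalance=%.2f nsplit=%d\n",
               s.obj, s.time, s.imbalance_factor, s.nsplit);
    }
}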
Index.h (10 changed lines)

@@ -18,7 +18,7 @@
 #define FAISS_VERSION_MAJOR 1
 #define FAISS_VERSION_MINOR 6
-#define FAISS_VERSION_PATCH 2
+#define FAISS_VERSION_PATCH 3
 
 /**
  * @namespace faiss

@@ -44,12 +44,10 @@ struct IDSelector;
 struct RangeSearchResult;
 struct DistanceComputer;
 
-/** Abstract structure for an index
- *
- * Supports adding vertices and searching them.
+/** Abstract structure for an index, supports adding vectors and searching them.
  *
  * Currently only asymmetric queries are supported:
  * database-to-database queries are not implemented.
  * All vectors provided at add or search time are 32-bit float arrays,
  * although the internal representation may vary.
  */
 struct Index {
     using idx_t = int64_t;  ///< all indices are this type
@@ -99,9 +99,13 @@ struct IndexBinary {
 
     /** Query n vectors of dimension d to the index.
      *
-     * return all vectors with distance < radius. Note that many
-     * indexes do not implement the range_search (only the k-NN search
-     * is mandatory).
+     * return all vectors with distance < radius. Note that many indexes
+     * do not implement the range_search (only the k-NN search is
+     * mandatory). The distances are converted to float to reuse the
+     * RangeSearchResult structure, but they are integer. By convention,
+     * only distances < radius (strict comparison) are returned,
+     * ie. radius = 0 does not return any result and 1 returns only
+     * exact same vectors.
      *
      * @param x      input vectors to search, size n * d / 8
      * @param radius search radius
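To make the radius convention above concrete, here is a minimal usage sketch against IndexBinaryFlat (illustrative only, not part of the patch; the dimension, data and radius value are arbitrary):

// Sketch (not from the patch): Hamming range search on IndexBinaryFlat.
// The radius uses a strict "< radius" comparison, so radius = 0 returns
// nothing and radius = 1 returns only exact duplicates.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <faiss/IndexBinaryFlat.h>
#include <faiss/impl/AuxIndexStructures.h>

int main() {
    int d = 64, nb = 1000, nq = 3;          // bits per vector, db size, nb of queries
    std::vector<uint8_t> xb(nb * d / 8), xq(nq * d / 8);
    for (auto &v : xb) v = rand() & 255;
    // queries are exact copies of the first database vectors
    std::copy(xb.begin(), xb.begin() + xq.size(), xq.begin());

    faiss::IndexBinaryFlat index(d);
    index.add(nb, xb.data());

    faiss::RangeSearchResult res(nq);
    index.range_search(nq, xq.data(), /*radius=*/5, &res);

    for (int i = 0; i < nq; i++) {
        printf("query %d: %zu results with Hamming distance < 5\n",
               i, res.lims[i + 1] - res.lims[i]);
    }
    return 0;
}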
@@ -79,5 +79,10 @@ void IndexBinaryFlat::reconstruct(idx_t key, uint8_t *recons) const {
   memcpy(recons, &(xb[code_size * key]), sizeof(*recons) * code_size);
 }
 
+void IndexBinaryFlat::range_search(idx_t n, const uint8_t *x, int radius,
+                                   RangeSearchResult *result) const
+{
+  hamming_range_search (x, xb.data(), n, ntotal, radius, code_size, result);
+}
+
 } // namespace faiss
@@ -38,6 +38,9 @@ struct IndexBinaryFlat : IndexBinary {
   void search(idx_t n, const uint8_t *x, idx_t k,
               int32_t *distances, idx_t *labels) const override;
 
+  void range_search(idx_t n, const uint8_t *x, int radius,
+                    RangeSearchResult *result) const override;
+
   void reconstruct(idx_t key, uint8_t *recons) const override;
 
   /** Remove some ids. Note that because of the indexing structure,
IndexBinaryHash.cpp (new file)

@@ -0,0 +1,492 @@
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved
// -*- c++ -*-

#include <faiss/IndexBinaryHash.h>

#include <cstdio>
#include <memory>

#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>

#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>


namespace faiss {

void IndexBinaryHash::InvertedList::add (
      idx_t id, size_t code_size, const uint8_t *code)
{
    ids.push_back(id);
    vecs.insert(vecs.end(), code, code + code_size);
}

IndexBinaryHash::IndexBinaryHash(int d, int b):
    IndexBinary(d), b(b), nflip(0)
{
    is_trained = true;
}

IndexBinaryHash::IndexBinaryHash(): b(0), nflip(0)
{
    is_trained = true;
}

void IndexBinaryHash::reset()
{
    invlists.clear();
    ntotal = 0;
}

void IndexBinaryHash::add(idx_t n, const uint8_t *x)
{
    add_with_ids(n, x, nullptr);
}

void IndexBinaryHash::add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids)
{
    uint64_t mask = ((uint64_t)1 << b) - 1;
    // simplistic add function. Cannot really be parallelized.

    for (idx_t i = 0; i < n; i++) {
        idx_t id = xids ? xids[i] : ntotal + i;
        const uint8_t * xi = x + i * code_size;
        idx_t hash = *((uint64_t*)xi) & mask;
        invlists[hash].add(id, code_size, xi);
    }
    ntotal += n;
}

namespace {

/** Enumerate all bit vectors of size nbit with up to maxflip 1s
 * test in P127257851 P127258235
 */
struct FlipEnumerator {
    int nbit, nflip, maxflip;
    uint64_t mask, x;

    FlipEnumerator (int nbit, int maxflip): nbit(nbit), maxflip(maxflip) {
        nflip = 0;
        mask = 0;
        x = 0;
    }

    bool next() {
        if (x == mask) {
            if (nflip == maxflip) {
                return false;
            }
            // increase Hamming radius
            nflip++;
            mask = (((uint64_t)1 << nflip) - 1);
            x = mask << (nbit - nflip);
            return true;
        }

        int i = __builtin_ctzll(x);

        if (i > 0) {
            x ^= (uint64_t)3 << (i - 1);
        } else {
            // nb of LSB 1s
            int n1 = __builtin_ctzll(~x);
            // clear them
            x &= ((uint64_t)(-1) << n1);
            int n2 = __builtin_ctzll(x);
            x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1);
        }
        return true;
    }

};
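For intuition: the enumerator visits the all-zeros mask first, then every mask of Hamming weight 1, then weight 2, and so on up to maxflip; XOR-ing each mask with the query hash produces all buckets within nflip bit flips. A standalone sketch (not part of the patch) that copies the same logic and prints the masks for nbit = 3, maxflip = 2 — expected output: 000 100 010 001 110 101 011.

// Standalone sketch (not from the patch): same enumeration logic as
// FlipEnumerator above, printing the masks for nbit = 3, maxflip = 2.
#include <cstdint>
#include <cstdio>

struct FlipEnumeratorSketch {
    int nbit, nflip, maxflip;
    uint64_t mask, x;
    FlipEnumeratorSketch(int nbit, int maxflip)
        : nbit(nbit), nflip(0), maxflip(maxflip), mask(0), x(0) {}
    bool next() {
        if (x == mask) {
            if (nflip == maxflip) return false;
            nflip++;                                  // increase Hamming radius
            mask = ((uint64_t)1 << nflip) - 1;
            x = mask << (nbit - nflip);
            return true;
        }
        int i = __builtin_ctzll(x);
        if (i > 0) {
            x ^= (uint64_t)3 << (i - 1);              // slide the lowest set bit down
        } else {
            int n1 = __builtin_ctzll(~x);             // nb of LSB 1s
            x &= ((uint64_t)(-1) << n1);              // clear them
            int n2 = __builtin_ctzll(x);
            x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1);
        }
        return true;
    }
};

int main() {
    FlipEnumeratorSketch fe(3, 2);
    do {
        for (int b = 2; b >= 0; b--) printf("%d", (int)((fe.x >> b) & 1));
        printf(" ");
    } while (fe.next());
    printf("\n");
    return 0;
}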
|
||||
using idx_t = Index::idx_t;
|
||||
|
||||
|
||||
struct RangeSearchResults {
|
||||
int radius;
|
||||
RangeQueryResult &qres;
|
||||
|
||||
inline void add (float dis, idx_t id) {
|
||||
if (dis < radius) {
|
||||
qres.add (dis, id);
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct KnnSearchResults {
|
||||
// heap params
|
||||
idx_t k;
|
||||
int32_t * heap_sim;
|
||||
idx_t * heap_ids;
|
||||
|
||||
using C = CMax<int, idx_t>;
|
||||
|
||||
inline void add (float dis, idx_t id) {
|
||||
if (dis < heap_sim[0]) {
|
||||
heap_pop<C> (k, heap_sim, heap_ids);
|
||||
heap_push<C> (k, heap_sim, heap_ids, dis, id);
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template<class HammingComputer, class SearchResults>
|
||||
void
|
||||
search_single_query_template(const IndexBinaryHash & index, const uint8_t *q,
|
||||
SearchResults &res,
|
||||
size_t &n0, size_t &nlist, size_t &ndis)
|
||||
{
|
||||
size_t code_size = index.code_size;
|
||||
uint64_t mask = ((uint64_t)1 << index.b) - 1;
|
||||
uint64_t qhash = *((uint64_t*)q) & mask;
|
||||
HammingComputer hc (q, code_size);
|
||||
FlipEnumerator fe(index.b, index.nflip);
|
||||
|
||||
// loop over neighbors that are at most at nflip bits
|
||||
do {
|
||||
uint64_t hash = qhash ^ fe.x;
|
||||
auto it = index.invlists.find (hash);
|
||||
|
||||
if (it == index.invlists.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const IndexBinaryHash::InvertedList &il = it->second;
|
||||
|
||||
size_t nv = il.ids.size();
|
||||
|
||||
if (nv == 0) {
|
||||
n0++;
|
||||
} else {
|
||||
const uint8_t *codes = il.vecs.data();
|
||||
for (size_t i = 0; i < nv; i++) {
|
||||
int dis = hc.hamming (codes);
|
||||
res.add(dis, il.ids[i]);
|
||||
codes += code_size;
|
||||
}
|
||||
ndis += nv;
|
||||
nlist++;
|
||||
}
|
||||
} while(fe.next());
|
||||
}
|
||||
|
||||
template<class SearchResults>
|
||||
void
|
||||
search_single_query(const IndexBinaryHash & index, const uint8_t *q,
|
||||
SearchResults &res,
|
||||
size_t &n0, size_t &nlist, size_t &ndis)
|
||||
{
|
||||
#define HC(name) search_single_query_template<name>(index, q, res, n0, nlist, ndis);
|
||||
switch(index.code_size) {
|
||||
case 4: HC(HammingComputer4); break;
|
||||
case 8: HC(HammingComputer8); break;
|
||||
case 16: HC(HammingComputer16); break;
|
||||
case 20: HC(HammingComputer20); break;
|
||||
case 32: HC(HammingComputer32); break;
|
||||
default:
|
||||
if (index.code_size % 8 == 0) {
|
||||
HC(HammingComputerM8);
|
||||
} else {
|
||||
HC(HammingComputerDefault);
|
||||
}
|
||||
}
|
||||
#undef HC
|
||||
}
|
||||
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
|
||||
|
||||
void IndexBinaryHash::range_search(idx_t n, const uint8_t *x, int radius,
|
||||
RangeSearchResult *result) const
|
||||
{
|
||||
|
||||
size_t nlist = 0, ndis = 0, n0 = 0;
|
||||
|
||||
#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist)
|
||||
{
|
||||
RangeSearchPartialResult pres (result);
|
||||
|
||||
#pragma omp for
|
||||
for (size_t i = 0; i < n; i++) { // loop queries
|
||||
RangeQueryResult & qres = pres.new_result (i);
|
||||
RangeSearchResults res = {radius, qres};
|
||||
const uint8_t *q = x + i * code_size;
|
||||
|
||||
search_single_query (*this, q, res, n0, nlist, ndis);
|
||||
|
||||
}
|
||||
pres.finalize ();
|
||||
}
|
||||
indexBinaryHash_stats.nq += n;
|
||||
indexBinaryHash_stats.n0 += n0;
|
||||
indexBinaryHash_stats.nlist += nlist;
|
||||
indexBinaryHash_stats.ndis += ndis;
|
||||
}
|
||||
|
||||
void IndexBinaryHash::search(idx_t n, const uint8_t *x, idx_t k,
|
||||
int32_t *distances, idx_t *labels) const
|
||||
{
|
||||
|
||||
using HeapForL2 = CMax<int32_t, idx_t>;
|
||||
size_t nlist = 0, ndis = 0, n0 = 0;
|
||||
|
||||
#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0)
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
int32_t * simi = distances + k * i;
|
||||
idx_t * idxi = labels + k * i;
|
||||
|
||||
heap_heapify<HeapForL2> (k, simi, idxi);
|
||||
KnnSearchResults res = {k, simi, idxi};
|
||||
const uint8_t *q = x + i * code_size;
|
||||
|
||||
search_single_query (*this, q, res, n0, nlist, ndis);
|
||||
|
||||
}
|
||||
indexBinaryHash_stats.nq += n;
|
||||
indexBinaryHash_stats.n0 += n0;
|
||||
indexBinaryHash_stats.nlist += nlist;
|
||||
indexBinaryHash_stats.ndis += ndis;
|
||||
}
|
||||
|
||||
size_t IndexBinaryHash::hashtable_size() const
|
||||
{
|
||||
return invlists.size();
|
||||
}
|
||||
|
||||
|
||||
void IndexBinaryHash::display() const
|
||||
{
|
||||
for (auto it = invlists.begin(); it != invlists.end(); ++it) {
|
||||
printf("%ld: [", it->first);
|
||||
const std::vector<idx_t> & v = it->second.ids;
|
||||
for (auto x: v) {
|
||||
printf("%ld ", 0 + x);
|
||||
}
|
||||
printf("]\n");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void IndexBinaryHashStats::reset()
|
||||
{
|
||||
memset ((void*)this, 0, sizeof (*this));
|
||||
}
|
||||
|
||||
IndexBinaryHashStats indexBinaryHash_stats;
|
||||
|
||||
/*******************************************************
|
||||
* IndexBinaryMultiHash implementation
|
||||
******************************************************/
|
||||
|
||||
|
||||
IndexBinaryMultiHash::IndexBinaryMultiHash(int d, int nhash, int b):
|
||||
IndexBinary(d),
|
||||
storage(new IndexBinaryFlat(d)), own_fields(true),
|
||||
maps(nhash), nhash(nhash), b(b), nflip(0)
|
||||
{
|
||||
FAISS_THROW_IF_NOT(nhash * b <= d);
|
||||
}
|
||||
|
||||
IndexBinaryMultiHash::IndexBinaryMultiHash():
|
||||
storage(nullptr), own_fields(true),
|
||||
nhash(0), b(0), nflip(0)
|
||||
{}
|
||||
|
||||
IndexBinaryMultiHash::~IndexBinaryMultiHash()
|
||||
{
|
||||
if (own_fields) {
|
||||
delete storage;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void IndexBinaryMultiHash::reset()
|
||||
{
|
||||
storage->reset();
|
||||
ntotal = 0;
|
||||
for(auto map: maps) {
|
||||
map.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void IndexBinaryMultiHash::add(idx_t n, const uint8_t *x)
|
||||
{
|
||||
storage->add(n, x);
|
||||
// populate maps
|
||||
uint64_t mask = ((uint64_t)1 << b) - 1;
|
||||
|
||||
for(idx_t i = 0; i < n; i++) {
|
||||
const uint8_t *xi = x + i * code_size;
|
||||
int ho = 0;
|
||||
for(int h = 0; h < nhash; h++) {
|
||||
uint64_t hash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7);
|
||||
hash &= mask;
|
||||
maps[h][hash].push_back(i + ntotal);
|
||||
ho += b;
|
||||
}
|
||||
}
|
||||
ntotal += n;
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
|
||||
template <class HammingComputer, class SearchResults>
|
||||
static
|
||||
void verify_shortlist(
|
||||
const IndexBinaryFlat & index,
|
||||
const uint8_t * q,
|
||||
const std::unordered_set<Index::idx_t> & shortlist,
|
||||
SearchResults &res)
|
||||
{
|
||||
size_t code_size = index.code_size;
|
||||
size_t nlist = 0, ndis = 0, n0 = 0;
|
||||
|
||||
HammingComputer hc (q, code_size);
|
||||
const uint8_t *codes = index.xb.data();
|
||||
|
||||
for (auto i: shortlist) {
|
||||
int dis = hc.hamming (codes + i * code_size);
|
||||
res.add(dis, i);
|
||||
}
|
||||
}
|
||||
|
||||
template<class SearchResults>
|
||||
void
|
||||
search_1_query_multihash(const IndexBinaryMultiHash & index, const uint8_t *xi,
|
||||
SearchResults &res,
|
||||
size_t &n0, size_t &nlist, size_t &ndis)
|
||||
{
|
||||
|
||||
std::unordered_set<idx_t> shortlist;
|
||||
int b = index.b;
|
||||
uint64_t mask = ((uint64_t)1 << b) - 1;
|
||||
|
||||
int ho = 0;
|
||||
for(int h = 0; h < index.nhash; h++) {
|
||||
uint64_t qhash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7);
|
||||
qhash &= mask;
|
||||
const IndexBinaryMultiHash::Map & map = index.maps[h];
|
||||
|
||||
FlipEnumerator fe(index.b, index.nflip);
|
||||
// loop over neighbors that are at most at nflip bits
|
||||
do {
|
||||
uint64_t hash = qhash ^ fe.x;
|
||||
auto it = map.find (hash);
|
||||
|
||||
if (it != map.end()) {
|
||||
const std::vector<idx_t> & v = it->second;
|
||||
for (auto i: v) {
|
||||
shortlist.insert(i);
|
||||
}
|
||||
nlist++;
|
||||
} else {
|
||||
n0++;
|
||||
}
|
||||
} while(fe.next());
|
||||
|
||||
ho += b;
|
||||
}
|
||||
ndis += shortlist.size();
|
||||
|
||||
// verify shortlist
|
||||
|
||||
#define HC(name) verify_shortlist<name> (*index.storage, xi, shortlist, res)
|
||||
switch(index.code_size) {
|
||||
case 4: HC(HammingComputer4); break;
|
||||
case 8: HC(HammingComputer8); break;
|
||||
case 16: HC(HammingComputer16); break;
|
||||
case 20: HC(HammingComputer20); break;
|
||||
case 32: HC(HammingComputer32); break;
|
||||
default:
|
||||
if (index.code_size % 8 == 0) {
|
||||
HC(HammingComputerM8);
|
||||
} else {
|
||||
HC(HammingComputerDefault);
|
||||
}
|
||||
}
|
||||
#undef HC
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
void IndexBinaryMultiHash::range_search(idx_t n, const uint8_t *x, int radius,
|
||||
RangeSearchResult *result) const
|
||||
{
|
||||
|
||||
size_t nlist = 0, ndis = 0, n0 = 0;
|
||||
|
||||
#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist)
|
||||
{
|
||||
RangeSearchPartialResult pres (result);
|
||||
|
||||
#pragma omp for
|
||||
for (size_t i = 0; i < n; i++) { // loop queries
|
||||
RangeQueryResult & qres = pres.new_result (i);
|
||||
RangeSearchResults res = {radius, qres};
|
||||
const uint8_t *q = x + i * code_size;
|
||||
|
||||
search_1_query_multihash (*this, q, res, n0, nlist, ndis);
|
||||
|
||||
}
|
||||
pres.finalize ();
|
||||
}
|
||||
indexBinaryHash_stats.nq += n;
|
||||
indexBinaryHash_stats.n0 += n0;
|
||||
indexBinaryHash_stats.nlist += nlist;
|
||||
indexBinaryHash_stats.ndis += ndis;
|
||||
}
|
||||
|
||||
void IndexBinaryMultiHash::search(idx_t n, const uint8_t *x, idx_t k,
|
||||
int32_t *distances, idx_t *labels) const
|
||||
{
|
||||
|
||||
using HeapForL2 = CMax<int32_t, idx_t>;
|
||||
size_t nlist = 0, ndis = 0, n0 = 0;
|
||||
|
||||
#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0)
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
int32_t * simi = distances + k * i;
|
||||
idx_t * idxi = labels + k * i;
|
||||
|
||||
heap_heapify<HeapForL2> (k, simi, idxi);
|
||||
KnnSearchResults res = {k, simi, idxi};
|
||||
const uint8_t *q = x + i * code_size;
|
||||
|
||||
search_1_query_multihash (*this, q, res, n0, nlist, ndis);
|
||||
|
||||
}
|
||||
indexBinaryHash_stats.nq += n;
|
||||
indexBinaryHash_stats.n0 += n0;
|
||||
indexBinaryHash_stats.nlist += nlist;
|
||||
indexBinaryHash_stats.ndis += ndis;
|
||||
}
|
||||
|
||||
size_t IndexBinaryMultiHash::hashtable_size() const
|
||||
{
|
||||
size_t tot = 0;
|
||||
for (auto map: maps) {
|
||||
tot += map.size();
|
||||
}
|
||||
|
||||
return tot;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
IndexBinaryHash.h (new file)

@@ -0,0 +1,116 @@
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// -*- c++ -*-

#ifndef FAISS_BINARY_HASH_H
#define FAISS_BINARY_HASH_H


#include <vector>
#include <unordered_map>

#include <faiss/IndexBinary.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/utils/Heap.h>


namespace faiss {

struct RangeSearchResult;


/** just uses the b first bits as a hash value */
struct IndexBinaryHash : IndexBinary {

    struct InvertedList {
        std::vector<idx_t> ids;
        std::vector<uint8_t> vecs;

        void add (idx_t id, size_t code_size, const uint8_t *code);
    };

    using InvertedListMap = std::unordered_map<idx_t, InvertedList>;
    InvertedListMap invlists;

    int b, nflip;

    IndexBinaryHash(int d, int b);

    IndexBinaryHash();

    void reset() override;

    void add(idx_t n, const uint8_t *x) override;

    void add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) override;

    void range_search(idx_t n, const uint8_t *x, int radius,
                      RangeSearchResult *result) const override;

    void search(idx_t n, const uint8_t *x, idx_t k,
                int32_t *distances, idx_t *labels) const override;

    void display() const;
    size_t hashtable_size() const;

};

struct IndexBinaryHashStats {
    size_t nq;       // nb of queries run
    size_t n0;       // nb of empty lists
    size_t nlist;    // nb of non-empty inverted lists scanned
    size_t ndis;     // nb of distancs computed

    IndexBinaryHashStats () {reset (); }
    void reset ();
};

extern IndexBinaryHashStats indexBinaryHash_stats;


/** just uses the b first bits as a hash value */
struct IndexBinaryMultiHash: IndexBinary {

    // where the vectors are actually stored
    IndexBinaryFlat *storage;
    bool own_fields;

    // maps hash values to the ids that hash to them
    using Map = std::unordered_map<idx_t, std::vector<idx_t> >;

    // the different hashes, size nhash
    std::vector<Map> maps;

    int nhash;  ///< nb of hash maps
    int b;      ///< nb bits per hash map
    int nflip;  ///< nb bit flips to use at search time

    IndexBinaryMultiHash(int d, int nhash, int b);

    IndexBinaryMultiHash();

    ~IndexBinaryMultiHash();

    void reset() override;

    void add(idx_t n, const uint8_t *x) override;

    void range_search(idx_t n, const uint8_t *x, int radius,
                      RangeSearchResult *result) const override;

    void search(idx_t n, const uint8_t *x, idx_t k,
                int32_t *distances, idx_t *labels) const override;

    size_t hashtable_size() const;

};

}

#endif
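A minimal usage sketch for the new index (not part of the patch; the dimension, hash width, nflip and random data are arbitrary choices):

// Sketch (not from the patch): building and querying an IndexBinaryHash.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <faiss/IndexBinaryHash.h>

int main() {
    int d = 64;                     // bits per vector (code_size = d / 8 bytes)
    int nb = 1000, nq = 5, k = 4;

    std::vector<uint8_t> xb(nb * d / 8), xq(nq * d / 8);
    for (auto &v : xb) v = rand() & 255;
    for (auto &v : xq) v = rand() & 255;

    faiss::IndexBinaryHash index(d, 16);   // hash on the first 16 bits
    index.nflip = 2;                       // visit buckets up to 2 bit flips away
    index.add(nb, xb.data());

    std::vector<int32_t> dis(nq * k);
    std::vector<faiss::IndexBinary::idx_t> ids(nq * k);
    index.search(nq, xq.data(), k, dis.data(), ids.data());

    printf("nearest neighbor of query 0: id %ld, Hamming distance %d\n",
           (long)ids[0], dis[0]);
    return 0;
}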
@@ -11,11 +11,13 @@
 #include <faiss/IndexBinaryIVF.h>
 
 #include <cstdio>
 #include <omp.h>
 
 #include <memory>
 
 #include <faiss/utils/hamming.h>
 #include <faiss/utils/utils.h>
 
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/IndexFlat.h>
@@ -281,13 +283,15 @@ namespace {
 
 using idx_t = Index::idx_t;
 
-template<class HammingComputer, bool store_pairs>
+template<class HammingComputer>
 struct IVFBinaryScannerL2: BinaryInvertedListScanner {
 
     HammingComputer hc;
     size_t code_size;
+    bool store_pairs;
 
-    IVFBinaryScannerL2 (size_t code_size): code_size (code_size)
+    IVFBinaryScannerL2 (size_t code_size, bool store_pairs):
+        code_size (code_size), store_pairs(store_pairs)
     {}
 
     void set_query (const uint8_t *query_vector) override {
@@ -316,7 +320,7 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
             uint32_t dis = hc.hamming (codes);
             if (dis < simi[0]) {
                 heap_pop<C> (k, simi, idxi);
-                idx_t id = store_pairs ? (list_no << 32 | j) : ids[j];
+                idx_t id = store_pairs ? lo_build(list_no, j) : ids[j];
                 heap_push<C> (k, simi, idxi, dis, id);
                 nup++;
             }
@@ -325,6 +329,24 @@
         return nup;
     }
 
+    void scan_codes_range (size_t n,
+                           const uint8_t *codes,
+                           const idx_t *ids,
+                           int radius,
+                           RangeQueryResult &result) const
+    {
+        size_t nup = 0;
+        for (size_t j = 0; j < n; j++) {
+            uint32_t dis = hc.hamming (codes);
+            if (dis < radius) {
+                int64_t id = store_pairs ? lo_build (list_no, j) : ids[j];
+                result.add (dis, id);
+            }
+            codes += code_size;
+        }
+
+    }
+
 };
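On the id convention used when store_pairs is set: the patch replaces the inline expression (list_no << 32 | j) with the lo_build() helper, i.e. the list number goes in the high bits and the within-list offset in the low 32 bits (lo_listno/lo_offset recover them, as IndexIVF::reconstruct does further down). A plain-C++ sketch of that packing, not Faiss code:

// Sketch (plain C++, not Faiss code) of the (list_no, offset) -> id packing
// that lo_build() performs when store_pairs is set.
#include <cassert>
#include <cstdint>

int main() {
    int64_t list_no = 123, offset = 45;

    int64_t id = (list_no << 32) | offset;      // what lo_build(list_no, j) encodes

    int64_t decoded_list   = id >> 32;          // lo_listno(id)
    int64_t decoded_offset = id & 0xffffffff;   // lo_offset(id)

    assert(decoded_list == list_no && decoded_offset == offset);
    return 0;
}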
@@ -332,29 +354,6 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
 
-template <bool store_pairs>
-BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) {
-
-    switch (code_size) {
-#define HANDLE_CS(cs) \
-    case cs: \
-        return new IVFBinaryScannerL2<HammingComputer ## cs, store_pairs> (cs);
-        HANDLE_CS(4);
-        HANDLE_CS(8);
-        HANDLE_CS(16);
-        HANDLE_CS(20);
-        HANDLE_CS(32);
-        HANDLE_CS(64);
-#undef HANDLE_CS
-    default:
-        if (code_size % 8 == 0) {
-            return new IVFBinaryScannerL2<HammingComputerM8,
-                    store_pairs> (code_size);
-        } else if (code_size % 4 == 0) {
-            return new IVFBinaryScannerL2<HammingComputerM4,
-                    store_pairs> (code_size);
-        } else {
-            return new IVFBinaryScannerL2<HammingComputerDefault,
-                    store_pairs> (code_size);
-        }
-    }
-}
@@ -425,8 +424,10 @@ void search_knn_hamming_heap(const IndexBinaryIVF& ivf,
                 ids = sids->get();
             }
 
-            nheap += scanner->scan_codes (list_size, scodes.get(),
-                                          ids, simi, idxi, k);
+            nheap += scanner->scan_codes (
+                 list_size, scodes.get(),
+                 ids, simi, idxi, k
+            );
 
             nscan += list_size;
             if (max_codes && nscan >= max_codes)
@@ -586,11 +587,26 @@ void search_knn_hamming_count_1 (
 BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner
       (bool store_pairs) const
 {
-    if (store_pairs) {
-        return select_IVFBinaryScannerL2<true> (code_size);
-    } else {
-        return select_IVFBinaryScannerL2<false> (code_size);
-    }
+
+#define HC(name) return new IVFBinaryScannerL2<name> (code_size, store_pairs)
+    switch (code_size) {
+    case 4: HC(HammingComputer4);
+    case 8: HC(HammingComputer8);
+    case 16: HC(HammingComputer16);
+    case 20: HC(HammingComputer20);
+    case 32: HC(HammingComputer32);
+    case 64: HC(HammingComputer64);
+    default:
+        if (code_size % 8 == 0) {
+            HC(HammingComputerM8);
+        } else if (code_size % 4 == 0) {
+            HC(HammingComputerM4);
+        } else {
+            HC(HammingComputerDefault);
+        }
+    }
+#undef HC
+
 }
 
 void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
|
@ -616,6 +632,84 @@ void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
void IndexBinaryIVF::range_search(
|
||||
idx_t n, const uint8_t *x, int radius,
|
||||
RangeSearchResult *res) const
|
||||
{
|
||||
|
||||
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
|
||||
std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
|
||||
|
||||
double t0 = getmillisecs();
|
||||
quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
|
||||
indexIVF_stats.quantization_time += getmillisecs() - t0;
|
||||
|
||||
t0 = getmillisecs();
|
||||
invlists->prefetch_lists(idx.get(), n * nprobe);
|
||||
|
||||
bool store_pairs = false;
|
||||
size_t nlistv = 0, ndis = 0;
|
||||
|
||||
std::vector<RangeSearchPartialResult *> all_pres (omp_get_max_threads());
|
||||
|
||||
#pragma omp parallel reduction(+: nlistv, ndis)
|
||||
{
|
||||
RangeSearchPartialResult pres(res);
|
||||
std::unique_ptr<BinaryInvertedListScanner> scanner
|
||||
(get_InvertedListScanner(store_pairs));
|
||||
FAISS_THROW_IF_NOT (scanner.get ());
|
||||
|
||||
all_pres[omp_get_thread_num()] = &pres;
|
||||
|
||||
auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult &qres)
|
||||
{
|
||||
|
||||
idx_t key = idx[i * nprobe + ik]; /* select the list */
|
||||
if (key < 0) return;
|
||||
FAISS_THROW_IF_NOT_FMT (
|
||||
key < (idx_t) nlist,
|
||||
"Invalid key=%ld at ik=%ld nlist=%ld\n",
|
||||
key, ik, nlist);
|
||||
const size_t list_size = invlists->list_size(key);
|
||||
|
||||
if (list_size == 0) return;
|
||||
|
||||
InvertedLists::ScopedCodes scodes (invlists, key);
|
||||
InvertedLists::ScopedIds ids (invlists, key);
|
||||
|
||||
scanner->set_list (key, coarse_dis[i * nprobe + ik]);
|
||||
nlistv++;
|
||||
ndis += list_size;
|
||||
scanner->scan_codes_range (list_size, scodes.get(),
|
||||
ids.get(), radius, qres);
|
||||
};
|
||||
|
||||
#pragma omp for
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
scanner->set_query (x + i * code_size);
|
||||
|
||||
RangeQueryResult & qres = pres.new_result (i);
|
||||
|
||||
for (size_t ik = 0; ik < nprobe; ik++) {
|
||||
scan_list_func (i, ik, qres);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pres.finalize();
|
||||
|
||||
}
|
||||
indexIVF_stats.nq += n;
|
||||
indexIVF_stats.nlist += nlistv;
|
||||
indexIVF_stats.ndis += ndis;
|
||||
indexIVF_stats.search_time += getmillisecs() - t0;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
IndexBinaryIVF::~IndexBinaryIVF() {
|
||||
if (own_invlists) {
|
||||
delete invlists;
|
||||
|
|
|
@@ -109,8 +109,11 @@ struct IndexBinaryIVF : IndexBinary {
                             bool store_pairs=false) const;
 
     /** assign the vectors, then call search_preassign */
-    virtual void search(idx_t n, const uint8_t *x, idx_t k,
-                        int32_t *distances, idx_t *labels) const override;
+    void search(idx_t n, const uint8_t *x, idx_t k,
+                int32_t *distances, idx_t *labels) const override;
+
+    void range_search(idx_t n, const uint8_t *x, int radius,
+                      RangeSearchResult *result) const override;
 
     void reconstruct(idx_t key, uint8_t *recons) const override;

@@ -202,6 +205,12 @@ struct BinaryInvertedListScanner {
                              int32_t *distances, idx_t *labels,
                              size_t k) const = 0;
 
+    virtual void scan_codes_range (size_t n,
+                                   const uint8_t *codes,
+                                   const idx_t *ids,
+                                   int radius,
+                                   RangeQueryResult &result) const = 0;
+
     virtual ~BinaryInvertedListScanner () {}
 
 };
@@ -19,6 +19,7 @@ namespace faiss {
 
 /** Index that stores the full vectors and performs exhaustive search */
 struct IndexFlat: Index {
     /// database vectors, size ntotal * d
     std::vector<float> xb;

@@ -144,7 +145,7 @@ struct IndexRefineFlat: Index {
 };
 
-/// optimized version for 1D "vectors"
+/// optimized version for 1D "vectors".
 struct IndexFlat1D:IndexFlatL2 {
     bool continuous_update; ///< is the permutation updated continuously?
@@ -612,7 +612,6 @@ InvertedListScanner *IndexIVF::get_InvertedListScanner (
 void IndexIVF::reconstruct (idx_t key, float* recons) const
 {
     idx_t lo = direct_map.get (key);
     reconstruct_from_offset (lo_listno(lo), lo_offset(lo), recons);
 }
IndexIVFPQ.h (17 changed lines)

@@ -42,14 +42,14 @@ struct IndexIVFPQ: IndexIVF {
     int polysemous_ht;     ///< Hamming thresh for polysemous filtering
 
     /** Precompute table that speed up query preprocessing at some
-     * memory cost
+     * memory cost (used only for by_residual with L2 metric)
      * =-1: force disable
     * =0: decide heuristically (default: use tables only if they are
     *      < precomputed_tables_max_bytes)
     * =1: tables that work for all quantizers (size 256 * nlist * M)
     * =2: specific version for MultiIndexQuantizer (much more compact)
      */
-    int use_precomputed_table; ///< if by_residual, build precompute tables
+    int use_precomputed_table;
     static size_t precomputed_table_max_bytes;
 
     /// if use_precompute_table

@@ -93,9 +93,9 @@ struct IndexIVFPQ: IndexIVF {
      * the duplicates are returned in pre-allocated arrays (see the
      * max sizes).
      *
-     * @params lims   limits between groups of duplicates
+     * @param lims    limits between groups of duplicates
      *                (max size ntotal / 2 + 1)
-     * @params ids    ids[lims[i]] : ids[lims[i+1]-1] is a group of
+     * @param ids     ids[lims[i]] : ids[lims[i+1]-1] is a group of
      *                duplicates (max size ntotal)
      * @return n      number of groups found
      */

@@ -135,15 +135,14 @@ struct IndexIVFPQ: IndexIVF {
 /// statistics are robust to internal threading, but not if
 /// IndexIVFPQ::search_preassigned is called by multiple threads
 struct IndexIVFPQStats {
-    size_t nrefine;  // nb of refines (IVFPQR)
+    size_t nrefine;  ///< nb of refines (IVFPQR)
 
     size_t n_hamming_pass;
-    // nb of passed Hamming distance tests (for polysemous)
+    ///< nb of passed Hamming distance tests (for polysemous)
 
-    // timings measured with the CPU RTC
-    // on all threads
+    // timings measured with the CPU RTC on all threads
     size_t search_cycles;
-    size_t refine_cycles; // only for IVFPQR
+    size_t refine_cycles; ///< only for IVFPQR
 
     IndexIVFPQStats () {reset (); }
     void reset ();
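As a quick illustration of the use_precomputed_table flag documented above (a sketch, not from the patch; d, nlist, M and nbits are arbitrary values):

// Sketch (not from the patch): selecting the precomputed-table mode on an
// IndexIVFPQ. The numeric modes mirror the doc comment above.
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>

void configure_ivfpq_tables() {
    int d = 64, nlist = 1024, M = 8, nbits = 8;
    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFPQ index(&quantizer, d, nlist, M, nbits);

    // -1: force-disable, 0: decide heuristically (default),
    //  1: generic tables, 2: MultiIndexQuantizer-specific tables
    index.use_precomputed_table = 1;

    // train() / add() / search() as usual; the tables are built at train time.
}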
|
@ -0,0 +1,185 @@
|
|||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <faiss/Clustering.h>
|
||||
#include <faiss/utils/random.h>
|
||||
#include <faiss/utils/distances.h>
|
||||
#include <faiss/IndexFlat.h>
|
||||
#include <faiss/IndexHNSW.h>
|
||||
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
enum WeightedKMeansType {
|
||||
WKMT_FlatL2,
|
||||
WKMT_FlatIP,
|
||||
WKMT_FlatIP_spherical,
|
||||
WKMT_HNSW,
|
||||
};
|
||||
|
||||
|
||||
float weighted_kmeans_clustering (size_t d, size_t n, size_t k,
|
||||
const float *input,
|
||||
const float *weights,
|
||||
float *centroids,
|
||||
WeightedKMeansType index_num)
|
||||
{
|
||||
using namespace faiss;
|
||||
Clustering clus (d, k);
|
||||
clus.verbose = true;
|
||||
|
||||
std::unique_ptr<Index> index;
|
||||
|
||||
switch (index_num) {
|
||||
case WKMT_FlatL2:
|
||||
index.reset(new IndexFlatL2 (d));
|
||||
break;
|
||||
case WKMT_FlatIP:
|
||||
index.reset(new IndexFlatIP (d));
|
||||
break;
|
||||
case WKMT_FlatIP_spherical:
|
||||
index.reset(new IndexFlatIP (d));
|
||||
clus.spherical = true;
|
||||
break;
|
||||
case WKMT_HNSW:
|
||||
IndexHNSWFlat *ihnsw = new IndexHNSWFlat (d, 32);
|
||||
ihnsw->hnsw.efSearch = 128;
|
||||
index.reset(ihnsw);
|
||||
break;
|
||||
}
|
||||
|
||||
clus.train(n, input, *index.get(), weights);
|
||||
// on output the index contains the centroids.
|
||||
memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
|
||||
return clus.iteration_stats.back().obj;
|
||||
}
|
||||
|
||||
|
||||
int d = 32;
|
||||
float sigma = 0.1;
|
||||
|
||||
#define BIGTEST
|
||||
|
||||
#ifdef BIGTEST
|
||||
// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
|
||||
int nc = 200000;
|
||||
int n_big = 4;
|
||||
int n_small = 2;
|
||||
#else
|
||||
int nc = 5;
|
||||
int n_big = 100;
|
||||
int n_small = 10;
|
||||
#endif
|
||||
|
||||
int n; // number of training points
|
||||
|
||||
void generate_trainset (std::vector<float> & ccent,
|
||||
std::vector<float> & x,
|
||||
std::vector<float> & weights)
|
||||
{
|
||||
// same sampling as test_build_blocks.py test_weighted
|
||||
|
||||
ccent.resize (d * 2 * nc);
|
||||
faiss::float_randn (ccent.data(), d * 2 * nc, 123);
|
||||
faiss::fvec_renorm_L2 (d, 2 * nc, ccent.data());
|
||||
n = nc * n_big + nc * n_small;
|
||||
x.resize(d * n);
|
||||
weights.resize(n);
|
||||
faiss::float_randn (x.data(), x.size(), 1234);
|
||||
|
||||
float *xi = x.data();
|
||||
float *w = weights.data();
|
||||
for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
|
||||
int np = ci < nc ? n_big : n_small; // nb of points around this centroid
|
||||
for (int i = 0; i < np; i++) {
|
||||
for (int j = 0; j < d; j++) {
|
||||
xi[j] = xi[j] * sigma + ccent[ci * d + j];
|
||||
}
|
||||
*w++ = ci < nc ? 0.1 : 10;
|
||||
xi += d;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
std::vector<float> ccent;
|
||||
std::vector<float> x;
|
||||
std::vector<float> weights;
|
||||
|
||||
printf("generate training set\n");
|
||||
generate_trainset(ccent, x, weights);
|
||||
|
||||
std::vector<float> centroids;
|
||||
centroids.resize(nc * d);
|
||||
|
||||
int the_index_num = -1;
|
||||
int the_with_weights = -1;
|
||||
|
||||
if (argc == 3) {
|
||||
the_index_num = atoi(argv[1]);
|
||||
the_with_weights = atoi(argv[2]);
|
||||
}
|
||||
|
||||
|
||||
for (int index_num = WKMT_FlatL2;
|
||||
index_num <= WKMT_HNSW;
|
||||
index_num++) {
|
||||
|
||||
if (the_index_num >= 0 && index_num != the_index_num) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int with_weights = 0; with_weights <= 1; with_weights++) {
|
||||
if (the_with_weights >= 0 && with_weights != the_with_weights) {
|
||||
continue;
|
||||
}
|
||||
|
||||
printf("=================== index_num=%d Run %s weights\n",
|
||||
index_num, with_weights ? "with" : "without");
|
||||
|
||||
weighted_kmeans_clustering (
|
||||
d, n, nc, x.data(),
|
||||
with_weights ? weights.data() : nullptr,
|
||||
centroids.data(), (WeightedKMeansType)index_num
|
||||
);
|
||||
|
||||
{ // compute distance of points to centroids
|
||||
faiss::IndexFlatL2 cent_index(d);
|
||||
cent_index.add(nc, centroids.data());
|
||||
std::vector<float> dis (n);
|
||||
std::vector<faiss::Index::idx_t> idx (n);
|
||||
|
||||
cent_index.search (nc * 2, ccent.data(), 1,
|
||||
dis.data(), idx.data());
|
||||
|
||||
float dis1 = 0, dis2 = 0;
|
||||
for (int i = 0; i < nc ; i++) {
|
||||
dis1 += dis[i];
|
||||
}
|
||||
printf("average distance of points from big clusters: %g\n",
|
||||
dis1 / nc);
|
||||
|
||||
for (int i = 0; i < nc ; i++) {
|
||||
dis2 += dis[i + nc];
|
||||
}
|
||||
|
||||
printf("average distance of points from small clusters: %g\n",
|
||||
dis2 / nc);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@ -29,8 +29,6 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
|
|||
config),
|
||||
config_(std::move(config)),
|
||||
data_(nullptr) {
|
||||
verifySettings_();
|
||||
|
||||
// Flat index doesn't need training
|
||||
this->is_trained = true;
|
||||
|
||||
|
@ -44,8 +42,6 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
|
|||
GpuIndex(resources, dims, metric, 0, config),
|
||||
config_(std::move(config)),
|
||||
data_(nullptr) {
|
||||
verifySettings_();
|
||||
|
||||
// Flat index doesn't need training
|
||||
this->is_trained = true;
|
||||
|
||||
|
@ -298,21 +294,6 @@ GpuIndexFlat::compute_residual_n(faiss::Index::idx_t n,
|
|||
fromDevice<float, 2>(residualDevice, residuals, stream);
|
||||
}
|
||||
|
||||
void
|
||||
GpuIndexFlat::verifySettings_() const {
|
||||
// If we want Hgemm, ensure that it is supported on this device
|
||||
if (config_.useFloat16Accumulator) {
|
||||
FAISS_THROW_IF_NOT_MSG(config_.useFloat16,
|
||||
"useFloat16Accumulator can only be enabled "
|
||||
"with useFloat16");
|
||||
|
||||
FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device),
|
||||
"Device %d does not support Hgemm "
|
||||
"(useFloat16Accumulator)",
|
||||
config_.device);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// GpuIndexFlatL2
|
||||
//
|
||||
|
|
|
@@ -25,17 +25,12 @@ struct FlatIndex;
 struct GpuIndexFlatConfig : public GpuIndexConfig {
   inline GpuIndexFlatConfig()
       : useFloat16(false),
         useFloat16Accumulator(false),
         storeTransposed(false) {
   }
 
   /// Whether or not data is stored as float16
   bool useFloat16;
 
+  /// This option is now deprecated and doesn't do anything. All accumulation of
+  /// float16 or float32 data is now done in float32.
   bool useFloat16Accumulator;
 
   /// Whether or not data is stored (transparently) in a transposed
   /// layout, enabling use of the NN GEMM call, which is ~10% faster.
   /// This will improve the speed of the flat index, but will
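A configuration sketch related to the flags above (not part of the patch; it assumes the usual StandardGpuResources / GpuIndexFlatL2 constructors):

// Sketch (not from the patch): storing a flat GPU index in float16.
// The deprecated useFloat16Accumulator flag is simply left at its default.
#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h>

void make_float16_flat_index() {
    int d = 64;
    faiss::gpu::StandardGpuResources res;

    faiss::gpu::GpuIndexFlatConfig config;
    config.useFloat16 = true;            // store vectors as float16 on the GPU

    faiss::gpu::GpuIndexFlatL2 index(&res, d, config);
    // add() / search() as usual
}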
|
@ -123,10 +118,6 @@ class GpuIndexFlat : public GpuIndex {
|
|||
float* distances,
|
||||
faiss::Index::idx_t* labels) const override;
|
||||
|
||||
private:
|
||||
/// Checks user settings for consistency
|
||||
void verifySettings_() const;
|
||||
|
||||
protected:
|
||||
/// Our config object
|
||||
const GpuIndexFlatConfig config_;
|
||||
|
|
|
@ -62,6 +62,22 @@ FlatIndex::reserve(size_t numVecs, cudaStream_t stream) {
|
|||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
Tensor<float, 2, true>&
|
||||
FlatIndex::getVectorsRef<float>() {
|
||||
// Should not call this unless we are in float32 mode
|
||||
FAISS_ASSERT(!useFloat16_);
|
||||
return getVectorsFloat32Ref();
|
||||
}
|
||||
|
||||
template <>
|
||||
Tensor<half, 2, true>&
|
||||
FlatIndex::getVectorsRef<half>() {
|
||||
// Should not call this unless we are in float16 mode
|
||||
FAISS_ASSERT(useFloat16_);
|
||||
return getVectorsFloat16Ref();
|
||||
}
|
||||
|
||||
Tensor<float, 2, true>&
|
||||
FlatIndex::getVectorsFloat32Ref() {
|
||||
// Should not call this unless we are in float32 mode
|
||||
|
|
|
@ -26,16 +26,23 @@ class FlatIndex {
|
|||
bool storeTransposed,
|
||||
MemorySpace space);
|
||||
|
||||
/// Whether or not this flat index primarily stores data in float16
|
||||
bool getUseFloat16() const;
|
||||
|
||||
/// Returns the number of vectors we contain
|
||||
int getSize() const;
|
||||
|
||||
/// Returns the dimensionality of the vectors
|
||||
int getDim() const;
|
||||
|
||||
/// Reserve storage that can contain at least this many vectors
|
||||
void reserve(size_t numVecs, cudaStream_t stream);
|
||||
|
||||
/// Returns the vectors based on the type desired; the FlatIndex must be of
|
||||
/// the same type (float16 or float32) to not assert
|
||||
template <typename T>
|
||||
Tensor<T, 2, true>& getVectorsRef();
|
||||
|
||||
/// Returns a reference to our vectors currently in use
|
||||
Tensor<float, 2, true>& getVectorsFloat32Ref();
|
||||
|
||||
|
|
|
@ -123,8 +123,6 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
|
|||
FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
|
||||
FAISS_ASSERT(vecs.getSize(1) == dim_);
|
||||
|
||||
FAISS_ASSERT(!quantizer_->getUseFloat16());
|
||||
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
|
||||
auto& mem = resources_->getMemoryManagerCurrentDevice();
|
||||
auto stream = resources_->getDefaultStreamCurrentDevice();
|
||||
|
||||
|
@ -155,7 +153,13 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
|
|||
DeviceTensor<float, 2, true> residuals(
|
||||
mem, {vecs.getSize(0), vecs.getSize(1)}, stream);
|
||||
|
||||
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
|
||||
if (quantizer_->getUseFloat16()) {
|
||||
auto& coarseCentroids = quantizer_->getVectorsFloat16Ref();
|
||||
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
|
||||
} else {
|
||||
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
|
||||
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
|
||||
}
|
||||
|
||||
// Residuals are in the form
|
||||
// (vec x numSubQuantizer x dimPerSubQuantizer)
|
||||
|
@ -437,8 +441,9 @@ IVFPQ::setPQCentroids_(float* data) {
|
|||
pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
|
||||
}
|
||||
|
||||
template <typename CentroidT>
|
||||
void
|
||||
IVFPQ::precomputeCodes_() {
|
||||
IVFPQ::precomputeCodesT_() {
|
||||
FAISS_ASSERT(metric_ == MetricType::METRIC_L2);
|
||||
|
||||
//
|
||||
|
@ -449,8 +454,6 @@ IVFPQ::precomputeCodes_() {
|
|||
|
||||
// Terms 1 and 3 are available only at query time. We compute term 2
|
||||
// here.
|
||||
FAISS_ASSERT(!quantizer_->getUseFloat16());
|
||||
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
|
||||
|
||||
// Compute ||y_R||^2 by treating
|
||||
// (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
|
||||
|
@ -473,9 +476,10 @@ IVFPQ::precomputeCodes_() {
|
|||
// (centroid id)(sub q)(dim)
|
||||
// Transpose (centroid id)(sub q)(sub dim) to
|
||||
// (sub q)(centroid id)(sub dim)
|
||||
auto centroidView = coarseCentroids.view<3>(
|
||||
auto& coarseCentroids = quantizer_->template getVectorsRef<CentroidT>();
|
||||
auto centroidView = coarseCentroids.template view<3>(
|
||||
{coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
|
||||
DeviceTensor<float, 3, true> centroidsTransposed(
|
||||
DeviceTensor<CentroidT, 3, true> centroidsTransposed(
|
||||
{numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});
|
||||
|
||||
runTransposeAny(centroidView, 0, 1, centroidsTransposed,
|
||||
|
@@ -521,6 +525,15 @@
   }
 }
 
+void
+IVFPQ::precomputeCodes_() {
+  if (quantizer_->getUseFloat16()) {
+    precomputeCodesT_<half>();
+  } else {
+    precomputeCodesT_<float>();
+  }
+}
+
 void
 IVFPQ::query(Tensor<float, 2, true>& queries,
              int nprobe,
|
@ -688,16 +701,16 @@ IVFPQ::runPQPrecomputedCodes_(
|
|||
resources_);
|
||||
}
|
||||
|
||||
template <typename CentroidT>
|
||||
void
|
||||
IVFPQ::runPQNoPrecomputedCodes_(
|
||||
IVFPQ::runPQNoPrecomputedCodesT_(
|
||||
Tensor<float, 2, true>& queries,
|
||||
DeviceTensor<float, 2, true>& coarseDistances,
|
||||
DeviceTensor<int, 2, true>& coarseIndices,
|
||||
int k,
|
||||
Tensor<float, 2, true>& outDistances,
|
||||
Tensor<long, 2, true>& outIndices) {
|
||||
FAISS_ASSERT(!quantizer_->getUseFloat16());
|
||||
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
|
||||
auto& coarseCentroids = quantizer_->template getVectorsRef<CentroidT>();
|
||||
|
||||
runPQScanMultiPassNoPrecomputed(queries,
|
||||
coarseCentroids,
|
||||
|
@ -719,4 +732,29 @@ IVFPQ::runPQNoPrecomputedCodes_(
|
|||
resources_);
|
||||
}
|
||||
|
||||
void
|
||||
IVFPQ::runPQNoPrecomputedCodes_(
|
||||
Tensor<float, 2, true>& queries,
|
||||
DeviceTensor<float, 2, true>& coarseDistances,
|
||||
DeviceTensor<int, 2, true>& coarseIndices,
|
||||
int k,
|
||||
Tensor<float, 2, true>& outDistances,
|
||||
Tensor<long, 2, true>& outIndices) {
|
||||
if (quantizer_->getUseFloat16()) {
|
||||
runPQNoPrecomputedCodesT_<half>(queries,
|
||||
coarseDistances,
|
||||
coarseIndices,
|
||||
k,
|
||||
outDistances,
|
||||
outIndices);
|
||||
} else {
|
||||
runPQNoPrecomputedCodesT_<float>(queries,
|
||||
coarseDistances,
|
||||
coarseIndices,
|
||||
k,
|
||||
outDistances,
|
||||
outIndices);
|
||||
}
|
||||
}
|
||||
|
||||
} } // namespace
|
||||
|
|
|
@ -83,6 +83,11 @@ class IVFPQ : public IVFBase {
|
|||
/// Calculate precomputed residual distance information
|
||||
void precomputeCodes_();
|
||||
|
||||
/// Calculate precomputed residual distance information (for different coarse
|
||||
/// centroid type)
|
||||
template <typename CentroidT>
|
||||
void precomputeCodesT_();
|
||||
|
||||
/// Runs kernels for scanning inverted lists with precomputed codes
|
||||
void runPQPrecomputedCodes_(Tensor<float, 2, true>& queries,
|
||||
DeviceTensor<float, 2, true>& coarseDistances,
|
||||
|
@ -99,6 +104,16 @@ class IVFPQ : public IVFBase {
|
|||
Tensor<float, 2, true>& outDistances,
|
||||
Tensor<long, 2, true>& outIndices);
|
||||
|
||||
/// Runs kernels for scanning inverted lists without precomputed codes (for
|
||||
/// different coarse centroid type)
|
||||
template <typename CentroidT>
|
||||
void runPQNoPrecomputedCodesT_(Tensor<float, 2, true>& queries,
|
||||
DeviceTensor<float, 2, true>& coarseDistances,
|
||||
DeviceTensor<int, 2, true>& coarseIndices,
|
||||
int k,
|
||||
Tensor<float, 2, true>& outDistances,
|
||||
Tensor<long, 2, true>& outIndices);
|
||||
|
||||
private:
|
||||
/// Number of sub-quantizers per vector
|
||||
const int numSubQuantizers_;
|
||||
|
|
|
@ -0,0 +1,561 @@
|
|||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include <faiss/gpu/impl/BroadcastSum.cuh>
|
||||
#include <faiss/gpu/impl/Distance.cuh>
|
||||
#include <faiss/gpu/impl/L2Norm.cuh>
|
||||
#include <faiss/gpu/utils/ConversionOperators.cuh>
|
||||
#include <faiss/gpu/utils/DeviceDefs.cuh>
|
||||
#include <faiss/gpu/utils/DeviceUtils.h>
|
||||
#include <faiss/gpu/utils/Float16.cuh>
|
||||
#include <faiss/gpu/utils/MatrixMult.cuh>
|
||||
#include <faiss/gpu/utils/PtxUtils.cuh>
|
||||
#include <faiss/gpu/utils/StaticUtils.h>
|
||||
#include <faiss/gpu/utils/Transpose.cuh>
|
||||
|
||||
namespace faiss { namespace gpu {
|
||||
|
||||
// Kernel responsible for calculating distance from residual vector to
|
||||
// each product quantizer code centroid
|
||||
template <typename OutCodeT,
|
||||
typename CentroidT,
|
||||
int DimsPerSubQuantizer,
|
||||
bool L2Distance>
|
||||
__global__ void
|
||||
__launch_bounds__(288, 4)
|
||||
pqCodeDistances(Tensor<float, 2, true> queries,
|
||||
int queriesPerBlock,
|
||||
Tensor<CentroidT, 2, true> coarseCentroids,
|
||||
Tensor<float, 3, true> pqCentroids,
|
||||
Tensor<int, 2, true> topQueryToCentroid,
|
||||
// (query id)(coarse)(subquantizer)(code) -> dist
|
||||
Tensor<OutCodeT, 4, true> outCodeDistances) {
|
||||
const auto numSubQuantizers = pqCentroids.getSize(0);
|
||||
const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
|
||||
assert(DimsPerSubQuantizer == dimsPerSubQuantizer);
|
||||
const auto codesPerSubQuantizer = pqCentroids.getSize(2);
|
||||
|
||||
bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer;
|
||||
int loadingThreadId = threadIdx.x - codesPerSubQuantizer;
|
||||
|
||||
extern __shared__ float smem[];
|
||||
|
||||
// Each thread calculates a single code
|
||||
float subQuantizerData[DimsPerSubQuantizer];
|
||||
|
||||
auto code = threadIdx.x;
|
||||
auto subQuantizer = blockIdx.y;
|
||||
|
||||
// Each thread will load the pq centroid data for the code that it
|
||||
// is processing
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DimsPerSubQuantizer; ++i) {
|
||||
subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg();
|
||||
}
|
||||
|
||||
// Where we store our query vector
|
||||
float* smemQuery = smem;
|
||||
|
||||
// Where we store our residual vector; this is double buffered so we
|
||||
// can be loading the next one while processing the current one
|
||||
float* smemResidual1 = &smemQuery[DimsPerSubQuantizer];
|
||||
float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer];
|
||||
|
||||
// Where we pre-load the coarse centroid IDs
|
||||
int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer];
|
||||
|
||||
// Each thread is calculating the distance for a single code,
|
||||
// performing the reductions locally
|
||||
|
||||
// Handle multiple queries per block
|
||||
auto startQueryId = blockIdx.x * queriesPerBlock;
|
||||
auto numQueries = queries.getSize(0) - startQueryId;
|
||||
if (numQueries > queriesPerBlock) {
|
||||
numQueries = queriesPerBlock;
|
||||
}
|
||||
|
||||
for (int query = 0; query < numQueries; ++query) {
|
||||
auto queryId = startQueryId + query;
|
||||
|
||||
auto querySubQuantizer =
|
||||
queries[queryId][subQuantizer * DimsPerSubQuantizer].data();
|
||||
|
||||
// Load current query vector
|
||||
for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) {
|
||||
smemQuery[i] = querySubQuantizer[i];
|
||||
}
|
||||
|
||||
// Load list of coarse centroids found
|
||||
for (int i = threadIdx.x;
|
||||
i < topQueryToCentroid.getSize(1); i += blockDim.x) {
|
||||
coarseIds[i] = topQueryToCentroid[queryId][i];
|
||||
}
|
||||
|
||||
// We need coarseIds below
|
||||
// FIXME: investigate loading separately, so we don't need this
|
||||
__syncthreads();
|
||||
|
||||
// Preload first buffer of residual data
|
||||
if (isLoadingThread) {
|
||||
for (int i = loadingThreadId;
|
||||
i < DimsPerSubQuantizer;
|
||||
i += blockDim.x - codesPerSubQuantizer) {
|
||||
auto coarseId = coarseIds[0];
|
||||
// In case NaNs were in the original query data
|
||||
coarseId = coarseId == -1 ? 0 : coarseId;
|
||||
auto coarseCentroidSubQuantizer =
|
||||
coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data();
|
||||
|
||||
if (L2Distance) {
|
||||
smemResidual1[i] = smemQuery[i] -
|
||||
ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
|
||||
} else {
|
||||
smemResidual1[i] =
|
||||
ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The block walks the list for a single query
|
||||
for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) {
|
||||
// Wait for smemResidual1 to be loaded
|
||||
__syncthreads();
|
||||
|
||||
if (isLoadingThread) {
|
||||
// Preload second buffer of residual data
|
||||
for (int i = loadingThreadId;
|
||||
i < DimsPerSubQuantizer;
|
||||
i += blockDim.x - codesPerSubQuantizer) {
|
||||
// FIXME: try always making this centroid id 0 so we can
|
||||
// terminate
|
||||
if (coarse != (topQueryToCentroid.getSize(1) - 1)) {
|
||||
auto coarseId = coarseIds[coarse + 1];
|
||||
// In case NaNs were in the original query data
|
||||
coarseId = coarseId == -1 ? 0 : coarseId;
|
||||
|
||||
auto coarseCentroidSubQuantizer =
|
||||
coarseCentroids[coarseId]
|
||||
[subQuantizer * dimsPerSubQuantizer].data();
|
||||
|
||||
if (L2Distance) {
|
||||
smemResidual2[i] = smemQuery[i] -
|
||||
ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
|
||||
} else {
|
||||
smemResidual2[i] =
|
||||
ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// These are the processing threads
|
||||
float dist = 0.0f;
|
||||
|
||||
constexpr int kUnroll = 4;
|
||||
constexpr int kRemainder = DimsPerSubQuantizer % kUnroll;
|
||||
constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder;
|
||||
float vals[kUnroll];
|
||||
|
||||
// Calculate residual - pqCentroid for each dim that we're
|
||||
// processing
|
||||
|
||||
// Unrolled loop
|
||||
if (L2Distance) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
vals[j] = smemResidual1[i * kUnroll + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
vals[j] -= subQuantizerData[i * kUnroll + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
vals[j] *= vals[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
dist += vals[j];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Inner product: query slice against the reconstructed sub-quantizer
|
||||
// for this coarse cell (query o (centroid + subQCentroid))
|
||||
#pragma unroll
|
||||
for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
vals[j] = smemResidual1[i * kUnroll + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
vals[j] += subQuantizerData[i * kUnroll + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
vals[j] *= smemQuery[i * kUnroll + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
dist += vals[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remainder loop
|
||||
if (L2Distance) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kRemainder; ++j) {
|
||||
vals[j] = smemResidual1[kRemainderBase + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kRemainder; ++j) {
|
||||
vals[j] -= subQuantizerData[kRemainderBase + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kRemainder; ++j) {
|
||||
vals[j] *= vals[j];
|
||||
}
|
||||
} else {
|
||||
// Inner product
|
||||
// Inner product: query slice against the reconstructed sub-quantizer
|
||||
// for this coarse cell (query o (centroid + subQCentroid))
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kRemainder; ++j) {
|
||||
vals[j] = smemResidual1[kRemainderBase + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kRemainder; ++j) {
|
||||
vals[j] += subQuantizerData[kRemainderBase + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kRemainder; ++j) {
|
||||
vals[j] *= smemQuery[kRemainderBase + j];
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kRemainder; ++j) {
|
||||
dist += vals[j];
|
||||
}
|
||||
|
||||
// We have the distance for our code; write it out
|
||||
outCodeDistances[queryId][coarse][subQuantizer][code] =
|
||||
ConvertTo<OutCodeT>::to(dist);
|
||||
} // !isLoadingThread
|
||||
|
||||
// Swap residual buffers
|
||||
float* tmp = smemResidual1;
|
||||
smemResidual1 = smemResidual2;
|
||||
smemResidual2 = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CentroidT>
|
||||
__global__ void
|
||||
residualVector(Tensor<float, 2, true> queries,
|
||||
Tensor<CentroidT, 2, true> coarseCentroids,
|
||||
Tensor<int, 2, true> topQueryToCentroid,
|
||||
int numSubDim,
|
||||
// output is transposed:
|
||||
// (sub q)(query id)(centroid id)(sub dim)
|
||||
Tensor<float, 4, true> residual) {
|
||||
// block x is query id
|
||||
// block y is centroid id
|
||||
// thread x is dim
|
||||
auto queryId = blockIdx.x;
|
||||
auto centroidId = blockIdx.y;
|
||||
|
||||
int realCentroidId = topQueryToCentroid[queryId][centroidId];
|
||||
|
||||
for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) {
|
||||
float q = queries[queryId][dim];
|
||||
float c = ConvertTo<float>::to(coarseCentroids[realCentroidId][dim]);
|
||||
|
||||
residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] = q - c;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CentroidT>
|
||||
void
|
||||
runResidualVector(Tensor<float, 3, true>& pqCentroids,
|
||||
Tensor<float, 2, true>& queries,
|
||||
Tensor<CentroidT, 2, true>& coarseCentroids,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
Tensor<float, 4, true>& residual,
|
||||
cudaStream_t stream) {
|
||||
auto grid =
|
||||
dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1));
|
||||
auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice()));
|
||||
|
||||
residualVector<<<grid, block, 0, stream>>>(
|
||||
queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1),
|
||||
residual);
|
||||
|
||||
CUDA_TEST_ERROR();
|
||||
}
|
||||
|
||||
template <typename CentroidT>
|
||||
void
|
||||
runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
|
||||
Tensor<float, 2, true>& queries,
|
||||
Tensor<CentroidT, 2, true>& coarseCentroids,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
NoTypeTensor<4, true>& outCodeDistances,
|
||||
bool useFloat16Lookup,
|
||||
DeviceMemory& mem,
|
||||
cublasHandle_t handle,
|
||||
cudaStream_t stream) {
|
||||
// Calculate (q - c) residual vector
|
||||
// (sub q)(query id)(centroid id)(sub dim)
|
||||
DeviceTensor<float, 4, true> residual(
|
||||
mem,
|
||||
{pqCentroids.getSize(0),
|
||||
topQueryToCentroid.getSize(0),
|
||||
topQueryToCentroid.getSize(1),
|
||||
pqCentroids.getSize(1)},
|
||||
stream);
|
||||
|
||||
runResidualVector(pqCentroids, queries,
|
||||
coarseCentroids, topQueryToCentroid,
|
||||
residual, stream);
|
||||
|
||||
// Calculate ||q - c||^2
|
||||
DeviceTensor<float, 1, true> residualNorms(
|
||||
mem,
|
||||
{pqCentroids.getSize(0) *
|
||||
topQueryToCentroid.getSize(0) *
|
||||
topQueryToCentroid.getSize(1)},
|
||||
stream);
|
||||
|
||||
auto residualView2 = residual.view<2>(
|
||||
{pqCentroids.getSize(0) *
|
||||
topQueryToCentroid.getSize(0) *
|
||||
topQueryToCentroid.getSize(1),
|
||||
pqCentroids.getSize(1)});
|
||||
|
||||
runL2Norm(residualView2, true, residualNorms, true, stream);
|
||||
|
||||
// Perform a batch MM:
|
||||
// (sub q) x {(q * c)(sub dim) x (sub dim)(code)} =>
|
||||
// (sub q) x {(q * c)(code)}
|
||||
auto residualView3 = residual.view<3>(
|
||||
{pqCentroids.getSize(0),
|
||||
topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
|
||||
pqCentroids.getSize(1)});
|
||||
|
||||
DeviceTensor<float, 3, true> residualDistance(
|
||||
mem,
|
||||
{pqCentroids.getSize(0),
|
||||
topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
|
||||
pqCentroids.getSize(2)},
|
||||
stream);
|
||||
|
||||
runIteratedMatrixMult(residualDistance, false,
|
||||
residualView3, false,
|
||||
pqCentroids, false,
|
||||
-2.0f, 0.0f,
|
||||
handle,
|
||||
stream);
|
||||
|
||||
// Sum ||q - c||^2 along rows
|
||||
auto residualDistanceView2 = residualDistance.view<2>(
|
||||
{pqCentroids.getSize(0) *
|
||||
topQueryToCentroid.getSize(0) *
|
||||
topQueryToCentroid.getSize(1),
|
||||
pqCentroids.getSize(2)});
|
||||
|
||||
runSumAlongRows(residualNorms, residualDistanceView2, false, stream);
|
||||
|
||||
Tensor<float, 4, true> outCodeDistancesF;
|
||||
DeviceTensor<float, 4, true> outCodeDistancesFloatMem;
|
||||
|
||||
if (useFloat16Lookup) {
|
||||
outCodeDistancesFloatMem = DeviceTensor<float, 4, true>(
|
||||
mem, {outCodeDistances.getSize(0),
|
||||
outCodeDistances.getSize(1),
|
||||
outCodeDistances.getSize(2),
|
||||
outCodeDistances.getSize(3)},
|
||||
stream);
|
||||
|
||||
outCodeDistancesF = outCodeDistancesFloatMem;
|
||||
} else {
|
||||
outCodeDistancesF = outCodeDistances.toTensor<float>();
|
||||
}
|
||||
|
||||
// Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which
|
||||
// is where we build our output distances)
|
||||
auto outCodeDistancesView = outCodeDistancesF.view<3>(
|
||||
{topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
|
||||
outCodeDistances.getSize(2),
|
||||
outCodeDistances.getSize(3)});
|
||||
|
||||
runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream);
|
||||
|
||||
// Calculate code norms per each sub-dim
|
||||
// (sub q)(sub dim)(code) is pqCentroids
|
||||
// transpose to (sub q)(code)(sub dim)
|
||||
DeviceTensor<float, 3, true> pqCentroidsTranspose(
|
||||
mem,
|
||||
{pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)},
|
||||
stream);
|
||||
|
||||
runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream);
|
||||
|
||||
auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>(
|
||||
{pqCentroids.getSize(0) * pqCentroids.getSize(2),
|
||||
pqCentroids.getSize(1)});
|
||||
|
||||
DeviceTensor<float, 1, true> pqCentroidsNorm(
|
||||
mem,
|
||||
{pqCentroids.getSize(0) * pqCentroids.getSize(2)},
|
||||
stream);
|
||||
|
||||
runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream);
|
||||
|
||||
// View output as (q * c)(sub q * code), and add centroid norm to
|
||||
// each row
|
||||
auto outDistancesCodeViewCols = outCodeDistancesView.view<2>(
|
||||
{topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
|
||||
outCodeDistances.getSize(2) * outCodeDistances.getSize(3)});
|
||||
|
||||
runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
|
||||
|
||||
if (useFloat16Lookup) {
|
||||
// Need to convert back
|
||||
auto outCodeDistancesH = outCodeDistances.toTensor<half>();
|
||||
convertTensor<float, half, 4>(stream,
|
||||
outCodeDistancesF,
|
||||
outCodeDistancesH);
|
||||
}
|
||||
}
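Putting the pieces together, the MM path above evaluates the L2 code distance per (query, coarse centroid, sub-quantizer, code $j$) via the usual expansion

$$\|(q - c) - p_j\|^2 \;=\; \|q - c\|^2 \;-\; 2\,(q - c)\cdot p_j \;+\; \|p_j\|^2,$$

with the first term coming from runL2Norm on the residuals, the middle term from the batched GEMM with $\alpha = -2$, and the last term from the per-code centroid norms added by runSumAlongColumns.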
|
||||
|
||||
template <typename CentroidT>
|
||||
void
|
||||
runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
|
||||
Tensor<float, 2, true>& queries,
|
||||
Tensor<CentroidT, 2, true>& coarseCentroids,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
NoTypeTensor<4, true>& outCodeDistances,
|
||||
bool l2Distance,
|
||||
bool useFloat16Lookup,
|
||||
cudaStream_t stream) {
|
||||
const auto numSubQuantizers = pqCentroids.getSize(0);
|
||||
const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
|
||||
const auto codesPerSubQuantizer = pqCentroids.getSize(2);
|
||||
|
||||
// FIXME: tune
|
||||
// Reuse of pq centroid data is based on both # of queries * nprobe,
|
||||
// and we should really be tiling in both dimensions
|
||||
constexpr int kQueriesPerBlock = 8;
|
||||
|
||||
auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock),
|
||||
numSubQuantizers);
|
||||
|
||||
// Reserve one block of threads for double buffering
|
||||
// FIXME: probably impractical for large # of dims?
|
||||
auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize);
|
||||
auto block = dim3(codesPerSubQuantizer + loadingThreads);
|
||||
|
||||
auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
|
||||
+ topQueryToCentroid.getSize(1) * sizeof(int);
|
||||
|
||||
#define RUN_CODE(DIMS, L2) \
|
||||
do { \
|
||||
if (useFloat16Lookup) { \
|
||||
auto outCodeDistancesT = outCodeDistances.toTensor<half>(); \
|
||||
\
|
||||
pqCodeDistances<half, CentroidT, DIMS, L2><<<grid, block, smem, stream>>>( \
|
||||
queries, kQueriesPerBlock, \
|
||||
coarseCentroids, pqCentroids, \
|
||||
topQueryToCentroid, outCodeDistancesT); \
|
||||
} else { \
|
||||
auto outCodeDistancesT = outCodeDistances.toTensor<float>(); \
|
||||
\
|
||||
pqCodeDistances<float, CentroidT, DIMS, L2><<<grid, block, smem, stream>>>( \
|
||||
queries, kQueriesPerBlock, \
|
||||
coarseCentroids, pqCentroids, \
|
||||
topQueryToCentroid, outCodeDistancesT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define CODE_L2(DIMS) \
|
||||
do { \
|
||||
if (l2Distance) { \
|
||||
RUN_CODE(DIMS, true); \
|
||||
} else { \
|
||||
RUN_CODE(DIMS, false); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
switch (dimsPerSubQuantizer) {
|
||||
case 1:
|
||||
CODE_L2(1);
|
||||
break;
|
||||
case 2:
|
||||
CODE_L2(2);
|
||||
break;
|
||||
case 3:
|
||||
CODE_L2(3);
|
||||
break;
|
||||
case 4:
|
||||
CODE_L2(4);
|
||||
break;
|
||||
case 6:
|
||||
CODE_L2(6);
|
||||
break;
|
||||
case 8:
|
||||
CODE_L2(8);
|
||||
break;
|
||||
case 10:
|
||||
CODE_L2(10);
|
||||
break;
|
||||
case 12:
|
||||
CODE_L2(12);
|
||||
break;
|
||||
case 16:
|
||||
CODE_L2(16);
|
||||
break;
|
||||
case 20:
|
||||
CODE_L2(20);
|
||||
break;
|
||||
case 24:
|
||||
CODE_L2(24);
|
||||
break;
|
||||
case 28:
|
||||
CODE_L2(28);
|
||||
break;
|
||||
case 32:
|
||||
CODE_L2(32);
|
||||
break;
|
||||
// FIXME: larger sizes require too many registers - we need the
|
||||
// MM implementation working
|
||||
default:
|
||||
FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer "
|
||||
"not currently supported");
|
||||
}
|
||||
|
||||
#undef RUN_CODE
|
||||
#undef CODE_L2
|
||||
|
||||
CUDA_TEST_ERROR();
|
||||
}
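For a concrete feel for the launch shape chosen above (numbers illustrative, not from the patch): with dimsPerSubQuantizer = 8, 256 codes per sub-quantizer and nprobe = 32, each block runs 256 + roundUp(8, 32) = 288 threads and reserves

    3 * 8 * sizeof(float) + 32 * sizeof(int) = 96 + 128 = 224 bytes

of shared memory: the query slice plus the two double-buffered residual slices, plus nprobe ints for the block's coarse assignments.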
|
||||
|
||||
} } // namespace
|
|
@@ -20,18 +20,20 @@ class DeviceMemory;
/// Calculates the distance from the (query - centroid) residual to
/// each sub-code vector, for the given list of query results in
/// topQueryToCentroid
template <typename CentroidT>
void runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
                        Tensor<float, 2, true>& queries,
                        Tensor<float, 2, true>& coarseCentroids,
                        Tensor<CentroidT, 2, true>& coarseCentroids,
                        Tensor<int, 2, true>& topQueryToCentroid,
                        NoTypeTensor<4, true>& outCodeDistances,
                        bool l2Distance,
                        bool useFloat16Lookup,
                        cudaStream_t stream);

template <typename CentroidT>
void runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
                          Tensor<float, 2, true>& queries,
                          Tensor<float, 2, true>& coarseCentroids,
                          Tensor<CentroidT, 2, true>& coarseCentroids,
                          Tensor<int, 2, true>& topQueryToCentroid,
                          NoTypeTensor<4, true>& outCodeDistances,
                          bool useFloat16Lookup,
@@ -40,3 +42,5 @@ void runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
                          cudaStream_t stream);

} } // namespace

#include <faiss/gpu/impl/PQCodeDistances-inl.cuh>
@@ -0,0 +1,599 @@
|
|||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include <faiss/gpu/GpuResources.h>
|
||||
#include <faiss/gpu/impl/PQCodeDistances.cuh>
|
||||
#include <faiss/gpu/impl/PQCodeLoad.cuh>
|
||||
#include <faiss/gpu/impl/IVFUtils.cuh>
|
||||
#include <faiss/gpu/utils/ConversionOperators.cuh>
|
||||
#include <faiss/gpu/utils/DeviceTensor.cuh>
|
||||
#include <faiss/gpu/utils/DeviceUtils.h>
|
||||
#include <faiss/gpu/utils/Float16.cuh>
|
||||
#include <faiss/gpu/utils/LoadStoreOperators.cuh>
|
||||
#include <faiss/gpu/utils/NoTypeTensor.cuh>
|
||||
#include <faiss/gpu/utils/StaticUtils.h>
|
||||
|
||||
#include <faiss/gpu/utils/HostTensor.cuh>
|
||||
|
||||
namespace faiss { namespace gpu {
|
||||
|
||||
// This must be kept in sync with PQCodeDistances.cu
|
||||
inline bool isSupportedNoPrecomputedSubDimSize(int dims) {
|
||||
switch (dims) {
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 6:
|
||||
case 8:
|
||||
case 10:
|
||||
case 12:
|
||||
case 16:
|
||||
case 20:
|
||||
case 24:
|
||||
case 28:
|
||||
case 32:
|
||||
return true;
|
||||
default:
|
||||
// FIXME: larger sizes require too many registers - we need the
|
||||
// MM implementation working
|
||||
return false;
|
||||
}
|
||||
}
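A hypothetical caller-side check, sketched for illustration only (`d` is the vector dimension and `M` the number of sub-quantizers, neither defined here):

    // dimensions handled by each sub-quantizer
    int dimsPerSubQuantizer = d / M;
    FAISS_ASSERT(d % M == 0);
    if (!isSupportedNoPrecomputedSubDimSize(dimsPerSubQuantizer)) {
      FAISS_THROW_MSG("no-precomputed-table code path does not support "
                      "this sub-quantizer dimension");
    }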
|
||||
|
||||
template <typename LookupT, typename LookupVecT>
|
||||
struct LoadCodeDistances {
|
||||
static inline __device__ void load(LookupT* smem,
|
||||
LookupT* codes,
|
||||
int numCodes) {
|
||||
constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT);
|
||||
|
||||
// We can only use the vector type if the data is guaranteed to be
|
||||
// aligned. The codes are innermost, so if it is evenly divisible,
|
||||
// then any slice will be aligned.
|
||||
if (numCodes % kWordSize == 0) {
|
||||
// Load the data by float4 for efficiency, and then handle any remainder
|
||||
// limitVec is the number of whole vec words we can load, in terms
|
||||
// of whole blocks performing the load
|
||||
constexpr int kUnroll = 2;
|
||||
int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x);
|
||||
limitVec *= kUnroll * blockDim.x;
|
||||
|
||||
LookupVecT* smemV = (LookupVecT*) smem;
|
||||
LookupVecT* codesV = (LookupVecT*) codes;
|
||||
|
||||
for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) {
|
||||
LookupVecT vals[kUnroll];
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
vals[j] =
|
||||
LoadStore<LookupVecT>::load(&codesV[i + j * blockDim.x]);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
LoadStore<LookupVecT>::store(&smemV[i + j * blockDim.x], vals[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// This is where we start loading the remainder that does not evenly
|
||||
// fit into kUnroll x blockDim.x
|
||||
int remainder = limitVec * kWordSize;
|
||||
|
||||
for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) {
|
||||
smem[i] = codes[i];
|
||||
}
|
||||
} else {
|
||||
// Potential unaligned load
|
||||
constexpr int kUnroll = 4;
|
||||
|
||||
int limit = utils::roundDown(numCodes, kUnroll * blockDim.x);
|
||||
|
||||
int i = threadIdx.x;
|
||||
for (; i < limit; i += kUnroll * blockDim.x) {
|
||||
LookupT vals[kUnroll];
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
vals[j] = codes[i + j * blockDim.x];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < kUnroll; ++j) {
|
||||
smem[i + j * blockDim.x] = vals[j];
|
||||
}
|
||||
}
|
||||
|
||||
for (; i < numCodes; i += blockDim.x) {
|
||||
smem[i] = codes[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
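The vectorized branch above is taken only when the table size divides evenly into vector words, i.e. when numCodes % kWordSize == 0 with kWordSize = sizeof(LookupVecT) / sizeof(LookupT) (4 for float/float4 and, assuming the usual 16-byte vector type, 8 for half/Half8); a 256-entry-per-sub-quantizer table satisfies this in both cases, so the unaligned fallback mainly covers unusual code counts.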
|
||||
|
||||
template <int NumSubQuantizers, typename LookupT, typename LookupVecT>
|
||||
__global__ void
|
||||
pqScanNoPrecomputedMultiPass(Tensor<float, 2, true> queries,
|
||||
Tensor<float, 3, true> pqCentroids,
|
||||
Tensor<int, 2, true> topQueryToCentroid,
|
||||
Tensor<LookupT, 4, true> codeDistances,
|
||||
void** listCodes,
|
||||
int* listLengths,
|
||||
Tensor<int, 2, true> prefixSumOffsets,
|
||||
Tensor<float, 1, true> distance) {
|
||||
const auto codesPerSubQuantizer = pqCentroids.getSize(2);
|
||||
|
||||
// Where the pq code -> residual distance is stored
|
||||
extern __shared__ char smemCodeDistances[];
|
||||
LookupT* codeDist = (LookupT*) smemCodeDistances;
|
||||
|
||||
// Each block handles a single query
|
||||
auto queryId = blockIdx.y;
|
||||
auto probeId = blockIdx.x;
|
||||
|
||||
// This is where we start writing out data
|
||||
// We ensure that before the array (at offset -1), there is a 0 value
|
||||
int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1);
|
||||
float* distanceOut = distance[outBase].data();
|
||||
|
||||
auto listId = topQueryToCentroid[queryId][probeId];
|
||||
// Safety guard in case NaNs in input cause no list ID to be generated
|
||||
if (listId == -1) {
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned char* codeList = (unsigned char*) listCodes[listId];
|
||||
int limit = listLengths[listId];
|
||||
|
||||
constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 :
|
||||
(NumSubQuantizers / 4);
|
||||
unsigned int code32[kNumCode32];
|
||||
unsigned int nextCode32[kNumCode32];
|
||||
|
||||
// We double-buffer the code loading, which improves memory utilization
|
||||
if (threadIdx.x < limit) {
|
||||
LoadCode32<NumSubQuantizers>::load(code32, codeList, threadIdx.x);
|
||||
}
|
||||
|
||||
LoadCodeDistances<LookupT, LookupVecT>::load(
|
||||
codeDist,
|
||||
codeDistances[queryId][probeId].data(),
|
||||
codeDistances.getSize(2) * codeDistances.getSize(3));
|
||||
|
||||
// Prevent WAR dependencies
|
||||
__syncthreads();
|
||||
|
||||
// Each thread handles one code element in the list, with a
|
||||
// block-wide stride
|
||||
for (int codeIndex = threadIdx.x;
|
||||
codeIndex < limit;
|
||||
codeIndex += blockDim.x) {
|
||||
// Prefetch next codes
|
||||
if (codeIndex + blockDim.x < limit) {
|
||||
LoadCode32<NumSubQuantizers>::load(
|
||||
nextCode32, codeList, codeIndex + blockDim.x);
|
||||
}
|
||||
|
||||
float dist = 0.0f;
|
||||
|
||||
#pragma unroll
|
||||
for (int word = 0; word < kNumCode32; ++word) {
|
||||
constexpr int kBytesPerCode32 =
|
||||
NumSubQuantizers < 4 ? NumSubQuantizers : 4;
|
||||
|
||||
if (kBytesPerCode32 == 1) {
|
||||
auto code = code32[0];
|
||||
dist = ConvertTo<float>::to(codeDist[code]);
|
||||
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int byte = 0; byte < kBytesPerCode32; ++byte) {
|
||||
auto code = getByte(code32[word], byte * 8, 8);
|
||||
|
||||
auto offset =
|
||||
codesPerSubQuantizer * (word * kBytesPerCode32 + byte);
|
||||
|
||||
dist += ConvertTo<float>::to(codeDist[offset + code]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write out intermediate distance result
|
||||
// We do not maintain indices here, in order to reduce global
|
||||
// memory traffic. Those are recovered in the final selection step.
|
||||
distanceOut[codeIndex] = dist;
|
||||
|
||||
// Rotate buffers
|
||||
#pragma unroll
|
||||
for (int word = 0; word < kNumCode32; ++word) {
|
||||
code32[word] = nextCode32[word];
|
||||
}
|
||||
}
|
||||
}
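A minimal CPU-side sketch of the distance the kernel accumulates per stored code (illustrative only; the function name and the flat [numSubQuantizers][codesPerSubQuantizer] table layout are assumptions for exposition):

    float scanOneCode(const uint8_t* code,        // one byte per sub-quantizer
                      const float* codeDist,      // lookup table for this (query, probe)
                      int numSubQuantizers,
                      int codesPerSubQuantizer) {
      float dist = 0.0f;
      for (int m = 0; m < numSubQuantizers; ++m) {
        // each byte selects one entry in its sub-quantizer's table
        dist += codeDist[m * codesPerSubQuantizer + code[m]];
      }
      return dist;
    }

The kernel computes the same sum, but loads the codes 32 bits at a time, prefetches the next code into a second register buffer, and keeps the lookup table in shared memory.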
|
||||
|
||||
template <typename CentroidT>
|
||||
void
|
||||
runMultiPassTile(Tensor<float, 2, true>& queries,
|
||||
Tensor<CentroidT, 2, true>& centroids,
|
||||
Tensor<float, 3, true>& pqCentroidsInnermostCode,
|
||||
NoTypeTensor<4, true>& codeDistances,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
bool useFloat16Lookup,
|
||||
int bytesPerCode,
|
||||
int numSubQuantizers,
|
||||
int numSubQuantizerCodes,
|
||||
thrust::device_vector<void*>& listCodes,
|
||||
thrust::device_vector<void*>& listIndices,
|
||||
IndicesOptions indicesOptions,
|
||||
thrust::device_vector<int>& listLengths,
|
||||
Tensor<char, 1, true>& thrustMem,
|
||||
Tensor<int, 2, true>& prefixSumOffsets,
|
||||
Tensor<float, 1, true>& allDistances,
|
||||
Tensor<float, 3, true>& heapDistances,
|
||||
Tensor<int, 3, true>& heapIndices,
|
||||
int k,
|
||||
faiss::MetricType metric,
|
||||
Tensor<float, 2, true>& outDistances,
|
||||
Tensor<long, 2, true>& outIndices,
|
||||
cudaStream_t stream) {
|
||||
// We only support two metrics at the moment
|
||||
FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT ||
|
||||
metric == MetricType::METRIC_L2);
|
||||
|
||||
bool l2Distance = metric == MetricType::METRIC_L2;
|
||||
|
||||
// Calculate offset lengths, so we know where to write out
|
||||
// intermediate results
|
||||
runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
|
||||
thrustMem, stream);
|
||||
|
||||
// Calculate residual code distances, since this is without
|
||||
// precomputed codes
|
||||
runPQCodeDistances(pqCentroidsInnermostCode,
|
||||
queries,
|
||||
centroids,
|
||||
topQueryToCentroid,
|
||||
codeDistances,
|
||||
l2Distance,
|
||||
useFloat16Lookup,
|
||||
stream);
|
||||
|
||||
// Convert all codes to a distance, and write out (distance,
|
||||
// index) values for all intermediate results
|
||||
{
|
||||
auto kThreadsPerBlock = 256;
|
||||
|
||||
auto grid = dim3(topQueryToCentroid.getSize(1),
|
||||
topQueryToCentroid.getSize(0));
|
||||
auto block = dim3(kThreadsPerBlock);
|
||||
|
||||
// pq centroid distances
|
||||
auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
|
||||
|
||||
smem *= numSubQuantizers * numSubQuantizerCodes;
|
||||
FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
|
||||
|
||||
#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \
|
||||
do { \
|
||||
auto codeDistancesT = codeDistances.toTensor<LOOKUP_T>(); \
|
||||
\
|
||||
pqScanNoPrecomputedMultiPass<NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T> \
|
||||
<<<grid, block, smem, stream>>>( \
|
||||
queries, \
|
||||
pqCentroidsInnermostCode, \
|
||||
topQueryToCentroid, \
|
||||
codeDistancesT, \
|
||||
listCodes.data().get(), \
|
||||
listLengths.data().get(), \
|
||||
prefixSumOffsets, \
|
||||
allDistances); \
|
||||
} while (0)
|
||||
|
||||
#define RUN_PQ(NUM_SUB_Q) \
|
||||
do { \
|
||||
if (useFloat16Lookup) { \
|
||||
RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \
|
||||
} else { \
|
||||
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
switch (bytesPerCode) {
|
||||
case 1:
|
||||
RUN_PQ(1);
|
||||
break;
|
||||
case 2:
|
||||
RUN_PQ(2);
|
||||
break;
|
||||
case 3:
|
||||
RUN_PQ(3);
|
||||
break;
|
||||
case 4:
|
||||
RUN_PQ(4);
|
||||
break;
|
||||
case 8:
|
||||
RUN_PQ(8);
|
||||
break;
|
||||
case 12:
|
||||
RUN_PQ(12);
|
||||
break;
|
||||
case 16:
|
||||
RUN_PQ(16);
|
||||
break;
|
||||
case 20:
|
||||
RUN_PQ(20);
|
||||
break;
|
||||
case 24:
|
||||
RUN_PQ(24);
|
||||
break;
|
||||
case 28:
|
||||
RUN_PQ(28);
|
||||
break;
|
||||
case 32:
|
||||
RUN_PQ(32);
|
||||
break;
|
||||
case 40:
|
||||
RUN_PQ(40);
|
||||
break;
|
||||
case 48:
|
||||
RUN_PQ(48);
|
||||
break;
|
||||
case 56:
|
||||
RUN_PQ(56);
|
||||
break;
|
||||
case 64:
|
||||
RUN_PQ(64);
|
||||
break;
|
||||
case 96:
|
||||
RUN_PQ(96);
|
||||
break;
|
||||
default:
|
||||
FAISS_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
|
||||
#undef RUN_PQ
|
||||
#undef RUN_PQ_OPT
|
||||
}
|
||||
|
||||
CUDA_TEST_ERROR();
|
||||
|
||||
// k-select the output in chunks, to increase parallelism
|
||||
runPass1SelectLists(prefixSumOffsets,
|
||||
allDistances,
|
||||
topQueryToCentroid.getSize(1),
|
||||
k,
|
||||
!l2Distance, // L2 distance chooses smallest
|
||||
heapDistances,
|
||||
heapIndices,
|
||||
stream);
|
||||
|
||||
// k-select final output
|
||||
auto flatHeapDistances = heapDistances.downcastInner<2>();
|
||||
auto flatHeapIndices = heapIndices.downcastInner<2>();
|
||||
|
||||
runPass2SelectLists(flatHeapDistances,
|
||||
flatHeapIndices,
|
||||
listIndices,
|
||||
indicesOptions,
|
||||
prefixSumOffsets,
|
||||
topQueryToCentroid,
|
||||
k,
|
||||
!l2Distance, // L2 distance chooses smallest
|
||||
outDistances,
|
||||
outIndices,
|
||||
stream);
|
||||
}
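The shared-memory check inside runMultiPassTile is the practical limit of this path: the lookup table costs sizeof(LookupT) * numSubQuantizers * numSubQuantizerCodes bytes per block, e.g. 2 * 64 * 256 = 32 KiB with float16 lookups versus 64 KiB with float32, which is one reason useFloat16Lookup matters for large sub-quantizer counts (many devices cap default shared memory per block at 48 KiB).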
|
||||
|
||||
template <typename CentroidT>
|
||||
void
|
||||
runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
|
||||
Tensor<CentroidT, 2, true>& centroids,
|
||||
Tensor<float, 3, true>& pqCentroidsInnermostCode,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
bool useFloat16Lookup,
|
||||
int bytesPerCode,
|
||||
int numSubQuantizers,
|
||||
int numSubQuantizerCodes,
|
||||
thrust::device_vector<void*>& listCodes,
|
||||
thrust::device_vector<void*>& listIndices,
|
||||
IndicesOptions indicesOptions,
|
||||
thrust::device_vector<int>& listLengths,
|
||||
int maxListLength,
|
||||
int k,
|
||||
faiss::MetricType metric,
|
||||
// output
|
||||
Tensor<float, 2, true>& outDistances,
|
||||
// output
|
||||
Tensor<long, 2, true>& outIndices,
|
||||
GpuResources* res) {
|
||||
constexpr int kMinQueryTileSize = 8;
|
||||
constexpr int kMaxQueryTileSize = 128;
|
||||
constexpr int kThrustMemSize = 16384;
|
||||
|
||||
int nprobe = topQueryToCentroid.getSize(1);
|
||||
|
||||
auto& mem = res->getMemoryManagerCurrentDevice();
|
||||
auto stream = res->getDefaultStreamCurrentDevice();
|
||||
|
||||
// Make a reservation for Thrust to do its dirty work (global memory
|
||||
// cross-block reduction space); hopefully this is large enough.
|
||||
DeviceTensor<char, 1, true> thrustMem1(
|
||||
mem, {kThrustMemSize}, stream);
|
||||
DeviceTensor<char, 1, true> thrustMem2(
|
||||
mem, {kThrustMemSize}, stream);
|
||||
DeviceTensor<char, 1, true>* thrustMem[2] =
|
||||
{&thrustMem1, &thrustMem2};
|
||||
|
||||
// How much temporary storage is available?
|
||||
// If possible, we'd like to fit within the space available.
|
||||
size_t sizeAvailable = mem.getSizeAvailable();
|
||||
|
||||
// We run two passes of heap selection
|
||||
// This is the size of the first-level heap passes
|
||||
constexpr int kNProbeSplit = 8;
|
||||
int pass2Chunks = std::min(nprobe, kNProbeSplit);
|
||||
|
||||
size_t sizeForFirstSelectPass =
|
||||
pass2Chunks * k * (sizeof(float) + sizeof(int));
|
||||
|
||||
// How much temporary storage we need per each query
|
||||
size_t sizePerQuery =
|
||||
2 * // streams
|
||||
((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets
|
||||
nprobe * maxListLength * sizeof(float) + // allDistances
|
||||
// residual distances
|
||||
nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
|
||||
sizeForFirstSelectPass);
|
||||
|
||||
int queryTileSize = (int) (sizeAvailable / sizePerQuery);
|
||||
|
||||
if (queryTileSize < kMinQueryTileSize) {
|
||||
queryTileSize = kMinQueryTileSize;
|
||||
} else if (queryTileSize > kMaxQueryTileSize) {
|
||||
queryTileSize = kMaxQueryTileSize;
|
||||
}
|
||||
|
||||
// FIXME: we should adjust queryTileSize to deal with this, since
|
||||
// indexing is in int32
|
||||
FAISS_ASSERT(queryTileSize * nprobe * maxListLength <
|
||||
std::numeric_limits<int>::max());
|
||||
|
||||
// Temporary memory buffers
|
||||
// Make sure there is space prior to the start which will be 0, and
|
||||
// will handle the boundary condition without branches
|
||||
DeviceTensor<int, 1, true> prefixSumOffsetSpace1(
|
||||
mem, {queryTileSize * nprobe + 1}, stream);
|
||||
DeviceTensor<int, 1, true> prefixSumOffsetSpace2(
|
||||
mem, {queryTileSize * nprobe + 1}, stream);
|
||||
|
||||
DeviceTensor<int, 2, true> prefixSumOffsets1(
|
||||
prefixSumOffsetSpace1[1].data(),
|
||||
{queryTileSize, nprobe});
|
||||
DeviceTensor<int, 2, true> prefixSumOffsets2(
|
||||
prefixSumOffsetSpace2[1].data(),
|
||||
{queryTileSize, nprobe});
|
||||
DeviceTensor<int, 2, true>* prefixSumOffsets[2] =
|
||||
{&prefixSumOffsets1, &prefixSumOffsets2};
|
||||
|
||||
// Make sure the element before prefixSumOffsets is 0, since we
|
||||
// depend upon simple, boundary-less indexing to get proper results
|
||||
CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(),
|
||||
0,
|
||||
sizeof(int),
|
||||
stream));
|
||||
CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(),
|
||||
0,
|
||||
sizeof(int),
|
||||
stream));
|
||||
|
||||
int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float);
|
||||
|
||||
int totalCodeDistancesSize =
|
||||
queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
|
||||
codeDistanceTypeSize;
|
||||
|
||||
DeviceTensor<char, 1, true> codeDistances1Mem(
|
||||
mem, {totalCodeDistancesSize}, stream);
|
||||
NoTypeTensor<4, true> codeDistances1(
|
||||
codeDistances1Mem.data(),
|
||||
codeDistanceTypeSize,
|
||||
{queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
|
||||
|
||||
DeviceTensor<char, 1, true> codeDistances2Mem(
|
||||
mem, {totalCodeDistancesSize}, stream);
|
||||
NoTypeTensor<4, true> codeDistances2(
|
||||
codeDistances2Mem.data(),
|
||||
codeDistanceTypeSize,
|
||||
{queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
|
||||
|
||||
NoTypeTensor<4, true>* codeDistances[2] =
|
||||
{&codeDistances1, &codeDistances2};
|
||||
|
||||
DeviceTensor<float, 1, true> allDistances1(
|
||||
mem, {queryTileSize * nprobe * maxListLength}, stream);
|
||||
DeviceTensor<float, 1, true> allDistances2(
|
||||
mem, {queryTileSize * nprobe * maxListLength}, stream);
|
||||
DeviceTensor<float, 1, true>* allDistances[2] =
|
||||
{&allDistances1, &allDistances2};
|
||||
|
||||
DeviceTensor<float, 3, true> heapDistances1(
|
||||
mem, {queryTileSize, pass2Chunks, k}, stream);
|
||||
DeviceTensor<float, 3, true> heapDistances2(
|
||||
mem, {queryTileSize, pass2Chunks, k}, stream);
|
||||
DeviceTensor<float, 3, true>* heapDistances[2] =
|
||||
{&heapDistances1, &heapDistances2};
|
||||
|
||||
DeviceTensor<int, 3, true> heapIndices1(
|
||||
mem, {queryTileSize, pass2Chunks, k}, stream);
|
||||
DeviceTensor<int, 3, true> heapIndices2(
|
||||
mem, {queryTileSize, pass2Chunks, k}, stream);
|
||||
DeviceTensor<int, 3, true>* heapIndices[2] =
|
||||
{&heapIndices1, &heapIndices2};
|
||||
|
||||
auto streams = res->getAlternateStreamsCurrentDevice();
|
||||
streamWait(streams, {stream});
|
||||
|
||||
int curStream = 0;
|
||||
|
||||
for (int query = 0; query < queries.getSize(0); query += queryTileSize) {
|
||||
int numQueriesInTile =
|
||||
std::min(queryTileSize, queries.getSize(0) - query);
|
||||
|
||||
auto prefixSumOffsetsView =
|
||||
prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile);
|
||||
|
||||
auto codeDistancesView =
|
||||
codeDistances[curStream]->narrowOutermost(0, numQueriesInTile);
|
||||
auto coarseIndicesView =
|
||||
topQueryToCentroid.narrowOutermost(query, numQueriesInTile);
|
||||
auto queryView =
|
||||
queries.narrowOutermost(query, numQueriesInTile);
|
||||
|
||||
auto heapDistancesView =
|
||||
heapDistances[curStream]->narrowOutermost(0, numQueriesInTile);
|
||||
auto heapIndicesView =
|
||||
heapIndices[curStream]->narrowOutermost(0, numQueriesInTile);
|
||||
|
||||
auto outDistanceView =
|
||||
outDistances.narrowOutermost(query, numQueriesInTile);
|
||||
auto outIndicesView =
|
||||
outIndices.narrowOutermost(query, numQueriesInTile);
|
||||
|
||||
runMultiPassTile(queryView,
|
||||
centroids,
|
||||
pqCentroidsInnermostCode,
|
||||
codeDistancesView,
|
||||
coarseIndicesView,
|
||||
useFloat16Lookup,
|
||||
bytesPerCode,
|
||||
numSubQuantizers,
|
||||
numSubQuantizerCodes,
|
||||
listCodes,
|
||||
listIndices,
|
||||
indicesOptions,
|
||||
listLengths,
|
||||
*thrustMem[curStream],
|
||||
prefixSumOffsetsView,
|
||||
*allDistances[curStream],
|
||||
heapDistancesView,
|
||||
heapIndicesView,
|
||||
k,
|
||||
metric,
|
||||
outDistanceView,
|
||||
outIndicesView,
|
||||
streams[curStream]);
|
||||
|
||||
curStream = (curStream + 1) % 2;
|
||||
}
|
||||
|
||||
streamWait({stream}, streams);
|
||||
}
|
||||
|
||||
} } // namespace
|
|
@@ -21,8 +21,9 @@ class GpuResources;
/// per subquantizer?
bool isSupportedNoPrecomputedSubDimSize(int dims);

template <typename CentroidT>
void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                                     Tensor<float, 2, true>& centroids,
                                     Tensor<CentroidT, 2, true>& centroids,
                                     Tensor<float, 3, true>& pqCentroidsInnermostCode,
                                     Tensor<int, 2, true>& topQueryToCentroid,
                                     bool useFloat16Lookup,
@@ -43,3 +44,5 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                                     GpuResources* res);

} } // namespace

#include <faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh>
@@ -76,7 +76,6 @@ int main(int argc, char** argv) {
    GpuIndexFlatConfig config;
    config.device = dev;
    config.useFloat16 = FLAGS_use_float16;
    config.useFloat16Accumulator = FLAGS_use_float16_math;
    config.storeTransposed = FLAGS_transposed;
    config.memorySpace = FLAGS_use_unified_mem ?
        MemorySpace::Unified : MemorySpace::Device;
@@ -187,6 +187,41 @@ TEST(TestGpuIndexIVFPQ, Query_IP) {
|
|||
}
|
||||
}
|
||||
|
||||
TEST(TestGpuIndexIVFPQ, Float16Coarse) {
|
||||
Options opt;
|
||||
|
||||
std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
|
||||
std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
|
||||
|
||||
faiss::IndexFlatL2 coarseQuantizer(opt.dim);
|
||||
faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids,
|
||||
opt.codes, opt.bitsPerCode);
|
||||
cpuIndex.nprobe = opt.nprobe;
|
||||
cpuIndex.train(opt.numTrain, trainVecs.data());
|
||||
|
||||
faiss::gpu::StandardGpuResources res;
|
||||
res.noTempMemory();
|
||||
|
||||
faiss::gpu::GpuIndexIVFPQConfig config;
|
||||
config.device = opt.device;
|
||||
config.flatConfig.useFloat16 = true;
|
||||
config.usePrecomputedTables = opt.usePrecomputed;
|
||||
config.indicesOptions = opt.indicesOpt;
|
||||
config.useFloat16LookupTables = opt.useFloat16;
|
||||
|
||||
faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
|
||||
gpuIndex.setNumProbes(opt.nprobe);
|
||||
|
||||
gpuIndex.add(opt.numAdd, addVecs.data());
|
||||
cpuIndex.add(opt.numAdd, addVecs.data());
|
||||
|
||||
faiss::gpu::compareIndices(cpuIndex, gpuIndex,
|
||||
opt.numQuery, opt.dim, opt.k, opt.toString(),
|
||||
opt.getCompareEpsilon(),
|
||||
opt.getPctMaxDiff1(),
|
||||
opt.getPctMaxDiffN());
|
||||
}
|
||||
|
||||
TEST(TestGpuIndexIVFPQ, Add_L2) {
|
||||
for (int tries = 0; tries < 2; ++tries) {
|
||||
Options opt;
|
||||
|
|
|
@@ -98,7 +98,9 @@ class EvalIVFPQAccuracy(unittest.TestCase):

        D, Inew = gpu_index.search(xq, 10)

        self.assertGreaterEqual((Iref == Inew).sum(), Iref.size)
        # 0.99: allow some tolerance in results otherwise test
        # fails occasionally (not reproducible)
        self.assertGreaterEqual((Iref == Inew).sum(), Iref.size * 0.99)

    def test_cpu_to_gpu_IVFPQ(self):
        self.do_cpu_to_gpu('IVF128,PQ4')
@@ -267,6 +269,45 @@ class TestGPUKmeans(unittest.TestCase):
|
|||
assert np.allclose(obj1, obj2)
|
||||
|
||||
|
||||
class TestAlternativeDistances(unittest.TestCase):
|
||||
|
||||
def do_test(self, metric, metric_arg=0):
|
||||
res = faiss.StandardGpuResources()
|
||||
d = 32
|
||||
nb = 1000
|
||||
nq = 100
|
||||
|
||||
rs = np.random.RandomState(123)
|
||||
xb = rs.rand(nb, d).astype('float32')
|
||||
xq = rs.rand(nq, d).astype('float32')
|
||||
|
||||
index_ref = faiss.IndexFlat(d, metric)
|
||||
index_ref.metric_arg = metric_arg
|
||||
index_ref.add(xb)
|
||||
Dref, Iref = index_ref.search(xq, 10)
|
||||
|
||||
# build from other index
|
||||
index = faiss.GpuIndexFlat(res, index_ref)
|
||||
Dnew, Inew = index.search(xq, 10)
|
||||
np.testing.assert_array_equal(Inew, Iref)
|
||||
np.testing.assert_allclose(Dnew, Dref, rtol=1e-6)
|
||||
|
||||
# build from scratch
|
||||
index = faiss.GpuIndexFlat(res, d, metric)
|
||||
index.metric_arg = metric_arg
|
||||
index.add(xb)
|
||||
|
||||
Dnew, Inew = index.search(xq, 10)
|
||||
np.testing.assert_array_equal(Inew, Iref)
|
||||
|
||||
def test_L1(self):
|
||||
self.do_test(faiss.METRIC_L1)
|
||||
|
||||
def test_Linf(self):
|
||||
self.do_test(faiss.METRIC_Linf)
|
||||
|
||||
def test_Lp(self):
|
||||
self.do_test(faiss.METRIC_Lp, 0.7)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@@ -0,0 +1,160 @@
|
|||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cublas_v2.h>
|
||||
#include <faiss/gpu/utils/Tensor.cuh>
|
||||
#include <faiss/gpu/utils/DeviceTensor.cuh>
|
||||
#include <faiss/gpu/utils/HostTensor.cuh>
|
||||
#include <faiss/gpu/utils/Float16.cuh>
|
||||
|
||||
namespace faiss { namespace gpu {
|
||||
|
||||
class DeviceMemory;
|
||||
|
||||
template <typename T>
|
||||
struct GetCudaType;
|
||||
|
||||
template <>
|
||||
struct GetCudaType<float> {
|
||||
static constexpr cudaDataType_t Type = CUDA_R_32F;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GetCudaType<half> {
|
||||
static constexpr cudaDataType_t Type = CUDA_R_16F;
|
||||
};
|
||||
|
||||
template <typename AT, typename BT>
|
||||
cublasStatus_t
|
||||
rawGemm(cublasHandle_t handle,
|
||||
cublasOperation_t transa,
|
||||
cublasOperation_t transb,
|
||||
int m,
|
||||
int n,
|
||||
int k,
|
||||
const float fAlpha,
|
||||
const AT *A,
|
||||
int lda,
|
||||
const BT *B,
|
||||
int ldb,
|
||||
const float fBeta,
|
||||
float *C,
|
||||
int ldc) {
|
||||
auto cAT = GetCudaType<AT>::Type;
|
||||
auto cBT = GetCudaType<BT>::Type;
|
||||
|
||||
// Always accumulate in f32
|
||||
return cublasSgemmEx(handle, transa, transb, m, n, k,
|
||||
&fAlpha, A, cAT, lda,
|
||||
B, cBT, ldb,
|
||||
&fBeta,
|
||||
C, CUDA_R_32F, ldc);
|
||||
}
|
||||
|
||||
template <typename AT, typename BT>
|
||||
void
|
||||
runMatrixMult(Tensor<float, 2, true>& c, bool transC,
|
||||
Tensor<AT, 2, true>& a, bool transA,
|
||||
Tensor<BT, 2, true>& b, bool transB,
|
||||
float alpha,
|
||||
float beta,
|
||||
cublasHandle_t handle,
|
||||
cudaStream_t stream) {
|
||||
cublasSetStream(handle, stream);
|
||||
|
||||
// Check that we have (m x k) * (k x n) = (m x n)
|
||||
// using the input row-major layout
|
||||
int aM = transA ? a.getSize(1) : a.getSize(0);
|
||||
int aK = transA ? a.getSize(0) : a.getSize(1);
|
||||
|
||||
int bK = transB ? b.getSize(1) : b.getSize(0);
|
||||
int bN = transB ? b.getSize(0) : b.getSize(1);
|
||||
|
||||
int cM = transC ? c.getSize(1) : c.getSize(0);
|
||||
int cN = transC ? c.getSize(0) : c.getSize(1);
|
||||
|
||||
FAISS_ASSERT(aM == cM);
|
||||
FAISS_ASSERT(aK == bK);
|
||||
FAISS_ASSERT(bN == cN);
|
||||
|
||||
FAISS_ASSERT(a.getStride(1) == 1);
|
||||
FAISS_ASSERT(b.getStride(1) == 1);
|
||||
FAISS_ASSERT(c.getStride(1) == 1);
|
||||
|
||||
// Now, we have to represent the matrix multiplication in
|
||||
// column-major layout
|
||||
float* pC = c.data();
|
||||
|
||||
int m = c.getSize(1); // stride 1 size
|
||||
int n = c.getSize(0); // other size
|
||||
int k = transA ? a.getSize(0) : a.getSize(1);
|
||||
|
||||
int lda = transC ? a.getStride(0) : b.getStride(0);
|
||||
int ldb = transC ? b.getStride(0) : a.getStride(0);
|
||||
int ldc = c.getStride(0);
|
||||
|
||||
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
|
||||
if (transC) {
|
||||
gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T;
|
||||
gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T;
|
||||
}
|
||||
|
||||
cublasStatus_t err;
|
||||
|
||||
if (transC) {
|
||||
err = rawGemm(handle,
|
||||
gemmTrA, gemmTrB,
|
||||
m, n, k, alpha,
|
||||
a.data(), lda, b.data(), ldb, beta,
|
||||
pC, ldc);
|
||||
} else {
|
||||
err = rawGemm(handle,
|
||||
gemmTrA, gemmTrB,
|
||||
m, n, k, alpha,
|
||||
b.data(), lda, a.data(), ldb, beta,
|
||||
pC, ldc);
|
||||
}
|
||||
|
||||
FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS,
|
||||
"cublas failed (%d): "
|
||||
"(%d, %d)%s x (%d, %d)%s = (%d, %d)%s",
|
||||
(int) err,
|
||||
a.getSize(0), a.getSize(1), transA ? "'" : "",
|
||||
b.getSize(0), b.getSize(1), transB ? "'" : "",
|
||||
c.getSize(0), c.getSize(1), transC ? "'" : "");
|
||||
CUDA_TEST_ERROR();
|
||||
}
|
||||
|
||||
template <typename AT, typename BT>
|
||||
void runIteratedMatrixMult(Tensor<float, 3, true>& c, bool transC,
|
||||
Tensor<AT, 3, true>& a, bool transA,
|
||||
Tensor<BT, 3, true>& b, bool transB,
|
||||
float alpha,
|
||||
float beta,
|
||||
cublasHandle_t handle,
|
||||
cudaStream_t stream) {
|
||||
FAISS_ASSERT(c.getSize(0) == a.getSize(0));
|
||||
FAISS_ASSERT(a.getSize(0) == b.getSize(0));
|
||||
|
||||
for (int i = 0; i < a.getSize(0); ++i) {
|
||||
auto cView = c[i].view();
|
||||
auto aView = a[i].view();
|
||||
auto bView = b[i].view();
|
||||
|
||||
runMatrixMult(cView, transC,
|
||||
aView, transA,
|
||||
bView, transB,
|
||||
alpha, beta, handle, stream);
|
||||
}
|
||||
}
|
||||
|
||||
} } // namespace
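Since cuBLAS expects column-major operands while these tensors are row-major, a row-major matrix is handed to cuBLAS as the column-major view of its transpose; the routine therefore computes

$$C^\top = B^\top A^\top,$$

which is why, in the non-transC case, `b` supplies the first operand and `a` the second, with m and n taken from C's innermost (stride-1) and outermost dimensions respectively.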
|
|
@@ -8,176 +8,9 @@
|
|||
|
||||
#include <faiss/gpu/utils/MatrixMult.cuh>
|
||||
#include <faiss/gpu/utils/DeviceMemory.h>
|
||||
#include <faiss/gpu/utils/DeviceUtils.h>
|
||||
#include <faiss/gpu/utils/Float16.cuh>
|
||||
#include <faiss/gpu/utils/DeviceTensor.cuh>
|
||||
#include <faiss/gpu/utils/HostTensor.cuh>
|
||||
|
||||
namespace faiss { namespace gpu {
|
||||
|
||||
template <typename T>
|
||||
struct CublasGemm {
|
||||
};
|
||||
|
||||
template <>
|
||||
struct CublasGemm<float> {
|
||||
static cublasStatus_t gemm(cublasHandle_t handle,
|
||||
cublasOperation_t transa,
|
||||
cublasOperation_t transb,
|
||||
int m,
|
||||
int n,
|
||||
int k,
|
||||
float fAlpha,
|
||||
const float *A,
|
||||
int lda,
|
||||
const float *B,
|
||||
int ldb,
|
||||
float fBeta,
|
||||
float *C,
|
||||
int ldc) {
|
||||
return cublasSgemm(handle, transa, transb, m, n, k,
|
||||
&fAlpha, A, lda, B, ldb, &fBeta, C, ldc);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct CublasGemm<half> {
|
||||
static cublasStatus_t gemm(cublasHandle_t handle,
|
||||
cublasOperation_t transa,
|
||||
cublasOperation_t transb,
|
||||
int m,
|
||||
int n,
|
||||
int k,
|
||||
const float fAlpha,
|
||||
const half *A,
|
||||
int lda,
|
||||
const half *B,
|
||||
int ldb,
|
||||
const float fBeta,
|
||||
float *C,
|
||||
int ldc) {
|
||||
// Always accumulate in f32
|
||||
return cublasSgemmEx(handle, transa, transb, m, n, k,
|
||||
&fAlpha, A, CUDA_R_16F, lda,
|
||||
B, CUDA_R_16F, ldb,
|
||||
&fBeta,
|
||||
C, CUDA_R_32F, ldc);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
runMatrixMult(Tensor<float, 2, true>& c, bool transC,
|
||||
Tensor<T, 2, true>& a, bool transA,
|
||||
Tensor<T, 2, true>& b, bool transB,
|
||||
float alpha,
|
||||
float beta,
|
||||
cublasHandle_t handle,
|
||||
cudaStream_t stream) {
|
||||
cublasSetStream(handle, stream);
|
||||
|
||||
// Check that we have (m x k) * (k x n) = (m x n)
|
||||
// using the input row-major layout
|
||||
int aM = transA ? a.getSize(1) : a.getSize(0);
|
||||
int aK = transA ? a.getSize(0) : a.getSize(1);
|
||||
|
||||
int bK = transB ? b.getSize(1) : b.getSize(0);
|
||||
int bN = transB ? b.getSize(0) : b.getSize(1);
|
||||
|
||||
int cM = transC ? c.getSize(1) : c.getSize(0);
|
||||
int cN = transC ? c.getSize(0) : c.getSize(1);
|
||||
|
||||
FAISS_ASSERT(aM == cM);
|
||||
FAISS_ASSERT(aK == bK);
|
||||
FAISS_ASSERT(bN == cN);
|
||||
|
||||
FAISS_ASSERT(a.getStride(1) == 1);
|
||||
FAISS_ASSERT(b.getStride(1) == 1);
|
||||
FAISS_ASSERT(c.getStride(1) == 1);
|
||||
|
||||
// Now, we have to represent the matrix multiplication in
|
||||
// column-major layout
|
||||
T* pA = transC ? a.data() : b.data();
|
||||
T* pB = transC ? b.data() : a.data();
|
||||
float* pC = c.data();
|
||||
|
||||
int m = c.getSize(1); // stride 1 size
|
||||
int n = c.getSize(0); // other size
|
||||
int k = transA ? a.getSize(0) : a.getSize(1);
|
||||
|
||||
int lda = transC ? a.getStride(0) : b.getStride(0);
|
||||
int ldb = transC ? b.getStride(0) : a.getStride(0);
|
||||
int ldc = c.getStride(0);
|
||||
|
||||
auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
|
||||
if (transC) {
|
||||
gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T;
|
||||
gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T;
|
||||
}
|
||||
|
||||
auto err = CublasGemm<T>::gemm(handle,
|
||||
gemmTrA, gemmTrB,
|
||||
m, n, k, alpha,
|
||||
pA, lda, pB, ldb, beta,
|
||||
pC, ldc);
|
||||
|
||||
FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS,
|
||||
"cublas failed (%d): "
|
||||
"(%d, %d)%s x (%d, %d)%s = (%d, %d)%s",
|
||||
(int) err,
|
||||
a.getSize(0), a.getSize(1), transA ? "'" : "",
|
||||
b.getSize(0), b.getSize(1), transB ? "'" : "",
|
||||
c.getSize(0), c.getSize(1), transC ? "'" : "");
|
||||
CUDA_TEST_ERROR();
|
||||
}
|
||||
|
||||
void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
|
||||
Tensor<float, 2, true>& a, bool transA,
|
||||
Tensor<float, 2, true>& b, bool transB,
|
||||
float alpha,
|
||||
float beta,
|
||||
cublasHandle_t handle,
|
||||
cudaStream_t stream) {
|
||||
return runMatrixMult<float>(c, transC, a, transA, b, transB,
|
||||
alpha, beta, handle, stream);
|
||||
}
|
||||
|
||||
void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
|
||||
Tensor<half, 2, true>& a, bool transA,
|
||||
Tensor<half, 2, true>& b, bool transB,
|
||||
float alpha,
|
||||
float beta,
|
||||
cublasHandle_t handle,
|
||||
cudaStream_t stream) {
|
||||
return runMatrixMult<half>(c, transC, a, transA, b, transB,
|
||||
alpha, beta, handle, stream);
|
||||
}
|
||||
|
||||
void
|
||||
runIteratedMatrixMult(Tensor<float, 3, true>& c, bool transC,
|
||||
Tensor<float, 3, true>& a, bool transA,
|
||||
Tensor<float, 3, true>& b, bool transB,
|
||||
float alpha,
|
||||
float beta,
|
||||
cublasHandle_t handle,
|
||||
cudaStream_t stream) {
|
||||
FAISS_ASSERT(c.getSize(0) == a.getSize(0));
|
||||
FAISS_ASSERT(a.getSize(0) == b.getSize(0));
|
||||
|
||||
for (int i = 0; i < a.getSize(0); ++i) {
|
||||
auto cView = c[i].view();
|
||||
auto aView = a[i].view();
|
||||
auto bView = b[i].view();
|
||||
|
||||
runMatrixMult(cView, transC,
|
||||
aView, transA,
|
||||
bView, transB,
|
||||
alpha, beta, handle, stream);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
|
||||
Tensor<float, 3, true>& a, bool transA,
|
||||
|
|
|
@@ -10,6 +10,9 @@
|
|||
|
||||
#include <cublas_v2.h>
|
||||
#include <faiss/gpu/utils/Tensor.cuh>
|
||||
#include <faiss/gpu/utils/DeviceTensor.cuh>
|
||||
#include <faiss/gpu/utils/HostTensor.cuh>
|
||||
#include <faiss/gpu/utils/Float16.cuh>
|
||||
|
||||
namespace faiss { namespace gpu {
|
||||
|
||||
|
@@ -17,30 +20,23 @@ class DeviceMemory;
|
|||
|
||||
/// C = alpha * A * B + beta * C
|
||||
/// Expects row major layout, not fortran/blas column major!
|
||||
void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
|
||||
Tensor<float, 2, true>& a, bool transA,
|
||||
Tensor<float, 2, true>& b, bool transB,
|
||||
float alpha,
|
||||
float beta,
|
||||
cublasHandle_t handle,
|
||||
cudaStream_t stream);
|
||||
|
||||
/// C = alpha * A * B + beta * C
|
||||
/// Expects row major layout, not fortran/blas column major!
|
||||
void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
|
||||
Tensor<half, 2, true>& a, bool transA,
|
||||
Tensor<half, 2, true>& b, bool transB,
|
||||
float alpha,
|
||||
float beta,
|
||||
cublasHandle_t handle,
|
||||
cudaStream_t stream);
|
||||
template <typename AT, typename BT>
|
||||
void
|
||||
runMatrixMult(Tensor<float, 2, true>& c, bool transC,
|
||||
Tensor<AT, 2, true>& a, bool transA,
|
||||
Tensor<BT, 2, true>& b, bool transB,
|
||||
float alpha,
|
||||
float beta,
|
||||
cublasHandle_t handle,
|
||||
cudaStream_t stream);
|
||||
|
||||
/// C_i = alpha * A_i * B_i + beta * C_i
|
||||
/// where `i` is the outermost dimension, via iterated gemm
|
||||
/// Expects row major layout, not fortran/blas column major!
|
||||
template <typename AT, typename BT>
|
||||
void runIteratedMatrixMult(Tensor<float, 3, true>& c, bool transC,
|
||||
Tensor<float, 3, true>& a, bool transA,
|
||||
Tensor<float, 3, true>& b, bool transB,
|
||||
Tensor<AT, 3, true>& a, bool transA,
|
||||
Tensor<BT, 3, true>& b, bool transB,
|
||||
float alpha,
|
||||
float beta,
|
||||
cublasHandle_t handle,
|
||||
|
@@ -59,3 +55,5 @@ void runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
|
|||
cudaStream_t stream);
|
||||
|
||||
} } // namespace
|
||||
|
||||
#include <faiss/gpu/utils/MatrixMult-inl.cuh>
|
||||
|
|
|
@@ -51,9 +51,7 @@ struct RangeSearchResult {
};


/**

 Encapsulates a set of ids to remove. */
/** Encapsulates a set of ids to remove. */
struct IDSelector {
    typedef Index::idx_t idx_t;
    virtual bool is_member (idx_t id) const = 0;
@@ -123,15 +123,15 @@ struct PolysemousTraining: SimulatedAnnealingParameters {
    enum Optimization_type_t {
        OT_None,
        OT_ReproduceDistances_affine,  ///< default
        OT_Ranking_weighted_diff  /// same as _2, but use rank of y+ - rank of y-
        OT_Ranking_weighted_diff  ///< same as _2, but use rank of y+ - rank of y-
    };
    Optimization_type_t optimization_type;

    // use 1/4 of the training points for the optimization, with
    // max. ntrain_permutation. If ntrain_permutation == 0: train on
    // centroids
    /** use 1/4 of the training points for the optimization, with
     * max. ntrain_permutation. If ntrain_permutation == 0: train on
     * centroids */
    int ntrain_permutation;
    double dis_weight_factor; // decay of exp that weights distance loss
    double dis_weight_factor; ///< decay of exp that weights distance loss

    // filename pattern for the logging of iterations
    std::string log_pattern;
@@ -19,6 +19,7 @@
|
|||
|
||||
#include <faiss/impl/FaissAssert.h>
|
||||
#include <faiss/impl/io.h>
|
||||
#include <faiss/utils/hamming.h>
|
||||
|
||||
#include <faiss/IndexFlat.h>
|
||||
#include <faiss/VectorTransform.h>
|
||||
|
@@ -41,6 +42,7 @@
|
|||
#include <faiss/IndexBinaryFromFloat.h>
|
||||
#include <faiss/IndexBinaryHNSW.h>
|
||||
#include <faiss/IndexBinaryIVF.h>
|
||||
#include <faiss/IndexBinaryHash.h>
|
||||
|
||||
|
||||
|
||||
|
@@ -752,6 +754,56 @@ static void read_binary_ivf_header (
|
|||
read_direct_map (&ivf->direct_map, f);
|
||||
}
|
||||
|
||||
static void read_binary_hash_invlists (
|
||||
IndexBinaryHash::InvertedListMap &invlists,
|
||||
int b, IOReader *f)
|
||||
{
|
||||
size_t sz;
|
||||
READ1 (sz);
|
||||
int il_nbit = 0;
|
||||
READ1 (il_nbit);
|
||||
// buffer for bitstrings
|
||||
std::vector<uint8_t> buf((b + il_nbit) * sz);
|
||||
READVECTOR (buf);
|
||||
BitstringReader rd (buf.data(), buf.size());
|
||||
invlists.reserve (sz);
|
||||
for (size_t i = 0; i < sz; i++) {
|
||||
uint64_t hash = rd.read(b);
|
||||
uint64_t ilsz = rd.read(il_nbit);
|
||||
auto & il = invlists[hash];
|
||||
READVECTOR (il.ids);
|
||||
FAISS_THROW_IF_NOT (il.ids.size() == ilsz);
|
||||
READVECTOR (il.vecs);
|
||||
}
|
||||
}
|
||||
|
||||
static void read_binary_multi_hash_map(
|
||||
IndexBinaryMultiHash::Map &map,
|
||||
int b, size_t ntotal,
|
||||
IOReader *f)
|
||||
{
|
||||
int id_bits;
|
||||
size_t sz;
|
||||
READ1 (id_bits);
|
||||
READ1 (sz);
|
||||
std::vector<uint8_t> buf;
|
||||
READVECTOR (buf);
|
||||
size_t nbit = (b + id_bits) * sz + ntotal * id_bits;
|
||||
FAISS_THROW_IF_NOT (buf.size() == (nbit + 7) / 8);
|
||||
BitstringReader rd (buf.data(), buf.size());
|
||||
map.reserve (sz);
|
||||
for (size_t i = 0; i < sz; i++) {
|
||||
uint64_t hash = rd.read(b);
|
||||
uint64_t ilsz = rd.read(id_bits);
|
||||
auto & il = map[hash];
|
||||
for (size_t j = 0; j < ilsz; j++) {
|
||||
il.push_back (rd.read (id_bits));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
IndexBinary *read_index_binary (IOReader *f, int io_flags) {
|
||||
IndexBinary * idx = nullptr;
|
||||
uint32_t h;
|
||||
|
@@ -793,6 +845,28 @@ IndexBinary *read_index_binary (IOReader *f, int io_flags) {
|
|||
static_cast<IndexBinaryIDMap2*>(idxmap)->construct_rev_map ();
|
||||
}
|
||||
idx = idxmap;
|
||||
} else if(h == fourcc("IBHh")) {
|
||||
IndexBinaryHash *idxh = new IndexBinaryHash ();
|
||||
read_index_binary_header (idxh, f);
|
||||
READ1 (idxh->b);
|
||||
READ1 (idxh->nflip);
|
||||
read_binary_hash_invlists(idxh->invlists, idxh->b, f);
|
||||
idx = idxh;
|
||||
} else if(h == fourcc("IBHm")) {
|
||||
IndexBinaryMultiHash* idxmh = new IndexBinaryMultiHash ();
|
||||
read_index_binary_header (idxmh, f);
|
||||
idxmh->storage = dynamic_cast<IndexBinaryFlat*> (read_index_binary (f));
|
||||
FAISS_THROW_IF_NOT(idxmh->storage && idxmh->storage->ntotal == idxmh->ntotal);
|
||||
idxmh->own_fields = true;
|
||||
READ1 (idxmh->b);
|
||||
READ1 (idxmh->nhash);
|
||||
READ1 (idxmh->nflip);
|
||||
idxmh->maps.resize (idxmh->nhash);
|
||||
for (int i = 0; i < idxmh->nhash; i++) {
|
||||
read_binary_multi_hash_map(
|
||||
idxmh->maps[i], idxmh->b, idxmh->ntotal, f);
|
||||
}
|
||||
idx = idxmh;
|
||||
} else {
|
||||
FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
|
||||
idx = nullptr;
|
||||
|
|
|
@@ -19,6 +19,7 @@
|
|||
|
||||
#include <faiss/impl/FaissAssert.h>
|
||||
#include <faiss/impl/io.h>
|
||||
#include <faiss/utils/hamming.h>
|
||||
|
||||
#include <faiss/IndexFlat.h>
|
||||
#include <faiss/VectorTransform.h>
|
||||
|
@@ -41,6 +42,7 @@
|
|||
#include <faiss/IndexBinaryFromFloat.h>
|
||||
#include <faiss/IndexBinaryHNSW.h>
|
||||
#include <faiss/IndexBinaryIVF.h>
|
||||
#include <faiss/IndexBinaryHash.h>
|
||||
|
||||
|
||||
|
||||
|
@@ -515,6 +517,67 @@ static void write_binary_ivf_header (const IndexBinaryIVF *ivf, IOWriter *f) {
|
|||
write_direct_map (&ivf->direct_map, f);
|
||||
}
|
||||
|
||||
static void write_binary_hash_invlists (
|
||||
const IndexBinaryHash::InvertedListMap &invlists,
|
||||
int b, IOWriter *f)
|
||||
{
|
||||
size_t sz = invlists.size();
|
||||
WRITE1 (sz);
|
||||
size_t maxil = 0;
|
||||
for (auto it = invlists.begin(); it != invlists.end(); ++it) {
|
||||
if(it->second.ids.size() > maxil) {
|
||||
maxil = it->second.ids.size();
|
||||
}
|
||||
}
|
||||
int il_nbit = 0;
|
||||
while(maxil >= ((uint64_t)1 << il_nbit)) {
|
||||
il_nbit++;
|
||||
}
|
||||
WRITE1(il_nbit);
|
||||
|
||||
// first write sizes then data, may be useful if we want to
|
||||
// memmap it at some point
|
||||
|
||||
// buffer for bitstrings
|
||||
std::vector<uint8_t> buf (((b + il_nbit) * sz + 7) / 8);
|
||||
BitstringWriter wr (buf.data(), buf.size());
|
||||
for (auto it = invlists.begin(); it != invlists.end(); ++it) {
|
||||
wr.write (it->first, b);
|
||||
wr.write (it->second.ids.size(), il_nbit);
|
||||
}
|
||||
WRITEVECTOR (buf);
|
||||
|
||||
for (auto it = invlists.begin(); it != invlists.end(); ++it) {
|
||||
WRITEVECTOR (it->second.ids);
|
||||
WRITEVECTOR (it->second.vecs);
|
||||
}
|
||||
}
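As a sanity check on the header layout written above (numbers illustrative): the bitstring packs, per inverted list, the b-bit hash followed by the il_nbit-bit list size, so the buffer is ceil((b + il_nbit) * sz / 8) bytes; for sz = 1000 lists with b = 32 and a longest list of 1000 entries (hence il_nbit = 10), that is ceil(42000 / 8) = 5250 bytes, after which the ids and vectors of each list follow as plain vectors.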
|
||||
|
||||
static void write_binary_multi_hash_map(
|
||||
const IndexBinaryMultiHash::Map &map,
|
||||
int b, size_t ntotal,
|
||||
IOWriter *f)
|
||||
{
|
||||
int id_bits = 0;
|
||||
while ((ntotal > ((Index::idx_t)1 << id_bits))) {
|
||||
id_bits++;
|
||||
}
|
||||
WRITE1(id_bits);
|
||||
size_t sz = map.size();
|
||||
WRITE1(sz);
|
||||
size_t nbit = (b + id_bits) * sz + ntotal * id_bits;
|
||||
std::vector<uint8_t> buf((nbit + 7) / 8);
|
||||
BitstringWriter wr (buf.data(), buf.size());
|
||||
for (auto it = map.begin(); it != map.end(); ++it) {
|
||||
wr.write(it->first, b);
|
||||
wr.write(it->second.size(), id_bits);
|
||||
for (auto id : it->second) {
|
||||
wr.write(id, id_bits);
|
||||
}
|
||||
}
|
||||
WRITEVECTOR (buf);
|
||||
}
|
||||
|
||||
void write_index_binary (const IndexBinary *idx, IOWriter *f) {
|
||||
if (const IndexBinaryFlat *idxf =
|
||||
dynamic_cast<const IndexBinaryFlat *> (idx)) {
|
||||
|
@@ -551,6 +614,27 @@ void write_index_binary (const IndexBinary *idx, IOWriter *f) {
|
|||
write_index_binary_header (idxmap, f);
|
||||
write_index_binary (idxmap->index, f);
|
||||
WRITEVECTOR (idxmap->id_map);
|
||||
} else if (const IndexBinaryHash *idxh =
|
||||
dynamic_cast<const IndexBinaryHash *> (idx)) {
|
||||
uint32_t h = fourcc ("IBHh");
|
||||
WRITE1 (h);
|
||||
write_index_binary_header (idxh, f);
|
||||
WRITE1 (idxh->b);
|
||||
WRITE1 (idxh->nflip);
|
||||
write_binary_hash_invlists(idxh->invlists, idxh->b, f);
|
||||
} else if (const IndexBinaryMultiHash *idxmh =
|
||||
dynamic_cast<const IndexBinaryMultiHash *> (idx)) {
|
||||
uint32_t h = fourcc ("IBHm");
|
||||
WRITE1 (h);
|
||||
write_index_binary_header (idxmh, f);
|
||||
write_index_binary (idxmh->storage, f);
|
||||
WRITE1 (idxmh->b);
|
||||
WRITE1 (idxmh->nhash);
|
||||
WRITE1 (idxmh->nflip);
|
||||
for (int i = 0; i < idxmh->nhash; i++) {
|
||||
write_binary_multi_hash_map(
|
||||
idxmh->maps[i], idxmh->b, idxmh->ntotal, f);
|
||||
}
|
||||
} else {
|
||||
FAISS_THROW_MSG ("don't know how to serialize this type of index");
|
||||
}
|
||||
|
|
112
impl/io.cpp
|
@@ -37,7 +37,6 @@ int IOWriter::fileno ()
|
|||
***********************************************************************/
|
||||
|
||||
|
||||
|
||||
size_t VectorIOWriter::operator()(
|
||||
const void *ptr, size_t size, size_t nitems)
|
||||
{
|
||||
|
@@ -132,6 +131,117 @@ int FileIOWriter::fileno() {
|
|||
return ::fileno (f);
|
||||
}
|
||||
|
||||
/***********************************************************************
|
||||
* IO buffer
|
||||
***********************************************************************/
|
||||
|
||||
BufferedIOReader::BufferedIOReader(IOReader *reader, size_t bsz, size_t totsz):
|
||||
reader(reader), bsz(bsz), totsz(totsz), ofs(0), b0(0), b1(0), buffer(bsz)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
size_t BufferedIOReader::operator()(void *ptr, size_t unitsize, size_t nitems)
|
||||
{
|
||||
size_t size = unitsize * nitems;
|
||||
if (size == 0) return 0;
|
||||
char * dst = (char*)ptr;
|
||||
size_t nb;
|
||||
|
||||
{ // first copy available bytes
|
||||
nb = std::min(b1 - b0, size);
|
||||
memcpy (dst, buffer.data() + b0, nb);
|
||||
b0 += nb;
|
||||
dst += nb;
|
||||
size -= nb;
|
||||
}
|
||||
|
||||
if (size > totsz - ofs) {
|
||||
size = totsz - ofs;
|
||||
}
|
||||
// while we would like to have more data
|
||||
while (size > 0) {
|
||||
assert (b0 == b1); // buffer empty on input
|
||||
// try to read from main reader
|
||||
b0 = 0;
|
||||
b1 = (*reader)(buffer.data(), 1, std::min(bsz, size));
|
||||
|
||||
if (b1 == 0) {
|
||||
// no more bytes available
|
||||
break;
|
||||
}
|
||||
ofs += b1;
|
||||
|
||||
// copy remaining bytes
|
||||
size_t nb2 = std::min(b1, size);
|
||||
memcpy (dst, buffer.data(), nb2);
|
||||
b0 = nb2;
|
||||
nb += nb2;
|
||||
dst += nb2;
|
||||
size -= nb2;
|
||||
}
|
||||
return nb / unitsize;
|
||||
}
|
||||
|
||||
|
||||
BufferedIOWriter::BufferedIOWriter(IOWriter *writer, size_t bsz):
|
||||
writer(writer), bsz(bsz), b0(0), buffer(bsz)
|
||||
{
|
||||
}
|
||||
|
||||
size_t BufferedIOWriter::operator()(const void *ptr, size_t unitsize, size_t nitems)
|
||||
{
|
||||
size_t size = unitsize * nitems;
|
||||
if (size == 0) return 0;
|
||||
const char * src = (const char*)ptr;
|
||||
size_t nb;
|
||||
|
||||
{ // copy as many bytes as possible to buffer
|
||||
nb = std::min(bsz - b0, size);
|
||||
memcpy (buffer.data() + b0, src, nb);
|
||||
b0 += nb;
|
||||
src += nb;
|
||||
size -= nb;
|
||||
}
|
||||
while (size > 0) {
|
||||
assert(b0 == bsz);
|
||||
// now we need to flush to add more bytes
|
||||
size_t ofs = 0;
|
||||
do {
|
||||
assert (ofs < 10000000);
|
||||
size_t written = (*writer)(buffer.data() + ofs, 1, bsz - ofs);
|
||||
FAISS_THROW_IF_NOT(written > 0);
|
||||
ofs += written;
|
||||
} while(ofs != bsz);
|
||||
|
||||
// copy src to buffer
|
||||
size_t nb1 = std::min(bsz, size);
|
||||
memcpy (buffer.data(), src, nb1);
|
||||
b0 = nb1;
|
||||
nb += nb1;
|
||||
src += nb1;
|
||||
size -= nb1;
|
||||
}
|
||||
|
||||
return nb / unitsize;
|
||||
}
|
||||
|
||||
BufferedIOWriter::~BufferedIOWriter()
|
||||
{
|
||||
size_t ofs = 0;
|
||||
while(ofs != b0) {
|
||||
printf("Destructor write %ld \n", b0 - ofs);
|
||||
size_t written = (*writer)(buffer.data() + ofs, 1, b0 - ofs);
|
||||
FAISS_THROW_IF_NOT(written > 0);
|
||||
ofs += written;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
uint32_t fourcc (const char sx[4]) {
|
||||
assert(4 == strlen(sx));
|
||||
const unsigned char *x = (unsigned char*)sx;
|
||||
|
|
impl/io.h (38 lines changed)
@@ -9,6 +9,9 @@
/***********************************************************
 * Abstract I/O objects
 *
 * I/O is always sequential, seek does not need to be supported
 * (indexes could be read or written to a pipe).
 ***********************************************************/

#pragma once

@@ -92,6 +95,41 @@ struct FileIOWriter: IOWriter {
    int fileno() override;
};

/*******************************************************
 * Buffered reader + writer
 *******************************************************/

/** wraps an IOReader so that data is fetched in large buffered blocks,
 *  avoiding many small reads */
struct BufferedIOReader: IOReader {

    IOReader *reader;
    size_t bsz, totsz, ofs;
    size_t b0, b1; ///< range of available bytes in the buffer
    std::vector<char> buffer;

    BufferedIOReader(IOReader *reader, size_t bsz,
                     size_t totsz=(size_t)(-1));

    size_t operator()(void *ptr, size_t size, size_t nitems) override;
};

struct BufferedIOWriter: IOWriter {

    IOWriter *writer;
    size_t bsz, ofs;
    size_t b0; ///< amount of data in buffer
    std::vector<char> buffer;

    BufferedIOWriter(IOWriter *writer, size_t bsz);

    size_t operator()(const void *ptr, size_t size, size_t nitems) override;

    // flushes the remaining buffered data to the wrapped writer
    ~BufferedIOWriter();
};

/// cast a 4-character string to a uint32_t that can be written and read easily
uint32_t fourcc (const char sx[4]);
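A minimal usage sketch (not in the header itself): wrap any IOReader, here a FileIOReader, so that the many small reads issued during index loading are served from one large buffer. The I/O tests added later in this patch exercise the same pattern; the file name below is only an example.

    import faiss

    reader = faiss.BufferedIOReader(faiss.FileIOReader("/tmp/big_index.faiss"), 64 * 1024)
    index = faiss.read_index(reader)     # reads go through the 64 KiB buffer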
@@ -283,6 +283,18 @@ def handle_IndexBinary(the_class):
                             swig_ptr(labels))
        return distances, labels

    def replacement_range_search(self, x, thresh):
        n, d = x.shape
        assert d * 8 == self.d
        res = RangeSearchResult(n)
        self.range_search_c(n, swig_ptr(x), thresh, res)
        # get pointers and copy them
        lims = rev_swig_ptr(res.lims, n + 1).copy()
        nd = int(lims[-1])
        D = rev_swig_ptr(res.distances, nd).copy()
        I = rev_swig_ptr(res.labels, nd).copy()
        return lims, D, I

    def replacement_remove_ids(self, x):
        if isinstance(x, IDSelector):
            sel = x
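The (lims, D, I) triplet returned above uses the usual faiss range-search layout; a short sketch of slicing it per query (index, xq and radius are assumed to be defined by the caller):

    lims, D, I = index.range_search(xq, radius)
    for q in range(len(xq)):
        ids = I[lims[q]:lims[q + 1]]     # neighbors of query q
        dis = D[lims[q]:lims[q + 1]]     # their Hamming distances, all < radius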
@@ -295,6 +307,7 @@ def handle_IndexBinary(the_class):
    replace_method(the_class, 'add_with_ids', replacement_add_with_ids)
    replace_method(the_class, 'train', replacement_train)
    replace_method(the_class, 'search', replacement_search)
    replace_method(the_class, 'range_search', replacement_range_search)
    replace_method(the_class, 'reconstruct', replacement_reconstruct)
    replace_method(the_class, 'remove_ids', replacement_remove_ids)

@@ -461,6 +474,9 @@ add_ref_in_constructor(IndexBinaryIDMap2, 0)
add_ref_in_method(IndexReplicas, 'addIndex', 0)
add_ref_in_method(IndexBinaryReplicas, 'addIndex', 0)

add_ref_in_constructor(BufferedIOWriter, 0)
add_ref_in_constructor(BufferedIOReader, 0)

# seems really marginal...
# remove_ref_from_method(IndexReplicas, 'removeIndex', 0)

@@ -751,9 +767,24 @@ def deserialize_index(data):
    copy_array_to_vector(data, reader.data)
    return read_index(reader)


def serialize_index_binary(index):
    """ convert an index to a numpy uint8 array """
    writer = VectorIOWriter()
    write_index_binary(index, writer)
    return vector_to_array(writer.data)


def deserialize_index_binary(data):
    reader = VectorIOReader()
    copy_array_to_vector(data, reader.data)
    return read_index_binary(reader)

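These helpers mirror the existing serialize_index / deserialize_index for float indexes; a round-trip sketch (xb is assumed to be a packed uint8 dataset with 64-bit codes):

    index = faiss.IndexBinaryFlat(64)
    index.add(xb)
    blob = faiss.serialize_index_binary(index)       # numpy uint8 array
    index2 = faiss.deserialize_index_binary(blob)    # equivalent index rebuilt from the array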
###########################################
# ResultHeap
###########################################

class ResultHeap:
    """Combine query results from a sliced dataset. The final result will
    """Accumulate query results from a sliced dataset. The final result will
    be in self.D, self.I."""

    def __init__(self, nq, k):
@@ -32,7 +32,7 @@ are implemented on the GPU. It is developed by Facebook AI Research.
"""
setup(
    name='faiss',
    version='1.6.2',
    version='1.6.3',
    description='A library for efficient similarity search and clustering of dense vectors',
    long_description=long_description,
    url='https://github.com/facebookresearch/faiss',
@@ -93,6 +93,7 @@ extern "C" {
#include <faiss/IndexBinaryIVF.h>
#include <faiss/IndexBinaryFromFloat.h>
#include <faiss/IndexBinaryHNSW.h>
#include <faiss/IndexBinaryHash.h>

#include <faiss/impl/io.h>
#include <faiss/index_io.h>

@@ -359,6 +360,7 @@ void gpu_sync_all_devices()
%include <faiss/IndexBinaryIVF.h>
%include <faiss/IndexBinaryFromFloat.h>
%include <faiss/IndexBinaryHNSW.h>
%include <faiss/IndexBinaryHash.h>

@@ -979,6 +981,124 @@ struct MapLong2Long {

%}

/*******************************************************************
 * Support I/O to arbitrary functions
 *******************************************************************/

%inline %{

#ifdef SWIGPYTHON

struct PyCallbackIOWriter: faiss::IOWriter {

    PyObject * callback;
    size_t bs; // maximum write size

    PyCallbackIOWriter(PyObject *callback,
                       size_t bs = 1024 * 1024):
        callback(callback), bs(bs) {
        Py_INCREF(callback);
        name = "PyCallbackIOWriter";
    }

    size_t operator()(const void *ptrv, size_t size, size_t nitems) override {
        size_t ws = size * nitems;
        const char *ptr = (const char*)ptrv;
        PyGILState_STATE gstate;
        gstate = PyGILState_Ensure();
        while(ws > 0) {
            size_t wi = ws > bs ? bs : ws;
            PyObject* bo = PyBytes_FromStringAndSize(ptr, wi);
            PyObject *arglist = Py_BuildValue("(N)", bo);
            if(!arglist) {
                PyGILState_Release(gstate);
                return 0;
            }
            ptr += wi;
            ws -= wi;
            PyObject * result = PyObject_CallObject(callback, arglist);
            Py_DECREF(arglist);
            if (result == NULL) {
                PyGILState_Release(gstate);
                return 0;
            }
            Py_DECREF(result);
        }
        PyGILState_Release(gstate);
        return nitems;
    }

    ~PyCallbackIOWriter() {
        Py_DECREF(callback);
    }

};

struct PyCallbackIOReader: faiss::IOReader {

    PyObject * callback;
    size_t bs; // maximum buffer size

    PyCallbackIOReader(PyObject *callback,
                       size_t bs = 1024 * 1024):
        callback(callback), bs(bs) {
        Py_INCREF(callback);
        name = "PyCallbackIOReader";
    }

    size_t operator()(void *ptrv, size_t size, size_t nitems) override {
        size_t rs = size * nitems;
        char *ptr = (char*)ptrv;
        PyGILState_STATE gstate;
        gstate = PyGILState_Ensure();
        while(rs > 0) {
            size_t ri = rs > bs ? bs : rs;
            PyObject *arglist = Py_BuildValue("(n)", ri);
            PyObject * result = PyObject_CallObject(callback, arglist);
            Py_DECREF(arglist);
            if (result == NULL) {
                PyGILState_Release(gstate);
                return 0;
            }
            if(!PyBytes_Check(result)) {
                Py_DECREF(result);
                PyErr_SetString(PyExc_RuntimeError,
                                "read callback did not return a bytes object");
                PyGILState_Release(gstate);
                throw faiss::FaissException("reader error");
            }
            size_t sz = PyBytes_Size(result);
            if (sz == 0 || sz > rs) {
                Py_DECREF(result);
                PyErr_Format(PyExc_RuntimeError,
                             "read callback returned %ld bytes (asked %ld)",
                             sz, rs);
                PyGILState_Release(gstate);
                throw faiss::FaissException("reader error");
            }
            memcpy(ptr, PyBytes_AsString(result), sz);
            Py_DECREF(result);
            ptr += sz;
            rs -= sz;
        }
        PyGILState_Release(gstate);
        return nitems;
    }

    ~PyCallbackIOReader() {
        Py_DECREF(callback);
    }

};

#endif

%}

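A small sketch of the intended use (the same pattern as the BytesIO and pipe tests added later in this patch): any Python callable that consumes or produces bytes can act as the storage target. The index variable is assumed to be an already-built float index.

    import io
    import faiss

    buf = io.BytesIO()
    faiss.write_index(index, faiss.PyCallbackIOWriter(buf.write))
    buf.seek(0)
    index2 = faiss.read_index(faiss.PyCallbackIOReader(buf.read))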
%inline %{
void wait() {
    // in gdb, use return to get out of this function

@@ -97,3 +97,32 @@ def get_dataset_2(d, nt, nb, nq):
    x = np.sin(x)
    x = x.astype('float32')
    return x[:nt], x[nt:nt + nb], x[nt + nb:]


def make_binary_dataset(d, nt, nb, nq):
    assert d % 8 == 0
    rs = np.random.RandomState(123)
    x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8')
    return x[:nt], x[nt:-nq], x[-nq:]


def compare_binary_result_lists(D1, I1, D2, I2):
    """comparing result lists is difficult because there are many
    ties. Here we sort by (distance, index) pairs and ignore the largest
    distance of each result. Compatible result lists should pass this."""
    assert D1.shape == I1.shape == D2.shape == I2.shape
    n, k = D1.shape
    ndiff = (D1 != D2).sum()
    assert ndiff == 0, '%d differences in distance matrix %s' % (
        ndiff, D1.shape)

    def normalize_DI(D, I):
        norm = I.max() + 1.0
        Dr = D.astype('float64') + I / norm
        # ignore -1s and elements on last column
        Dr[I1 == -1] = 1e20
        Dr[D == D[:, -1:]] = 1e20
        Dr.sort(axis=1)
        return Dr
    ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum()
    assert ndiff == 0, '%d differences in normalized D matrix' % ndiff
@@ -0,0 +1,183 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

#!/usr/bin/env python3

import unittest
import numpy as np
import faiss

from common import make_binary_dataset


def bitvec_shuffle(a, order):
    n, d = a.shape
    db, = order.shape
    b = np.empty((n, db // 8), dtype='uint8')
    faiss.bitvec_shuffle(
        n, d * 8, db,
        faiss.swig_ptr(order),
        faiss.swig_ptr(a), faiss.swig_ptr(b))
    return b


class TestSmallFuncs(unittest.TestCase):

    def test_shuffle(self):
        d = 256
        n = 1000
        rs = np.random.RandomState(123)
        o = rs.permutation(d).astype('int32')

        x = rs.randint(256, size=(n, d // 8)).astype('uint8')

        y1 = bitvec_shuffle(x, o[:128])
        y2 = bitvec_shuffle(x, o[128:])
        y = np.hstack((y1, y2))

        oinv = np.empty(d, dtype='int32')
        oinv[o] = np.arange(d)
        z = bitvec_shuffle(y, oinv)

        np.testing.assert_array_equal(x, z)


class TestRange(unittest.TestCase):

    def test_hash(self):
        d = 128
        nq = 100
        nb = 2000

        (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)

        index_ref = faiss.IndexBinaryFlat(d)
        index_ref.add(xb)

        radius = 55

        Lref, Dref, Iref = index_ref.range_search(xq, radius)

        print("nb res: ", Lref[-1])

        index = faiss.IndexBinaryHash(d, 10)
        index.add(xb)
        # index.display()
        nfound = []
        ndis = []
        stats = faiss.cvar.indexBinaryHash_stats
        for n_bitflips in range(index.b + 1):
            index.nflip = n_bitflips
            stats.reset()
            Lnew, Dnew, Inew = index.range_search(xq, radius)
            for i in range(nq):
                ref = Iref[Lref[i]:Lref[i + 1]]
                new = Inew[Lnew[i]:Lnew[i + 1]]
                snew = set(new)
                # no duplicates
                self.assertTrue(len(new) == len(snew))
                # subset of real results
                self.assertTrue(snew <= set(ref))
            nfound.append(Lnew[-1])
            ndis.append(stats.ndis)
        print('nfound=', nfound)
        print('ndis=', ndis)
        nfound = np.array(nfound)
        self.assertTrue(nfound[-1] == Lref[-1])
        self.assertTrue(np.all(nfound[1:] >= nfound[:-1]))

    def test_multihash(self):
        d = 128
        nq = 100
        nb = 2000

        (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)

        index_ref = faiss.IndexBinaryFlat(d)
        index_ref.add(xb)

        radius = 55

        Lref, Dref, Iref = index_ref.range_search(xq, radius)

        print("nb res: ", Lref[-1])

        nfound = []
        ndis = []

        for nh in 1, 3, 5:
            index = faiss.IndexBinaryMultiHash(d, nh, 10)
            index.add(xb)
            # index.display()
            stats = faiss.cvar.indexBinaryHash_stats
            index.nflip = 2
            stats.reset()
            Lnew, Dnew, Inew = index.range_search(xq, radius)
            for i in range(nq):
                ref = Iref[Lref[i]:Lref[i + 1]]
                new = Inew[Lnew[i]:Lnew[i + 1]]
                snew = set(new)
                # no duplicates
                self.assertTrue(len(new) == len(snew))
                # subset of real results
                self.assertTrue(snew <= set(ref))
            nfound.append(Lnew[-1])
            ndis.append(stats.ndis)
        print('nfound=', nfound)
        print('ndis=', ndis)
        nfound = np.array(nfound)
        # self.assertTrue(nfound[-1] == Lref[-1])
        self.assertTrue(np.all(nfound[1:] >= nfound[:-1]))


class TestKnn(unittest.TestCase):

    def test_hash_and_multihash(self):
        d = 128
        nq = 100
        nb = 2000

        (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)

        index_ref = faiss.IndexBinaryFlat(d)
        index_ref.add(xb)
        k = 10
        Dref, Iref = index_ref.search(xq, k)

        nfound = {}
        for nh in 0, 1, 3, 5:

            for nbit in 4, 7:
                if nh == 0:
                    index = faiss.IndexBinaryHash(d, nbit)
                else:
                    index = faiss.IndexBinaryMultiHash(d, nh, nbit)
                index.add(xb)
                index.nflip = 2
                Dnew, Inew = index.search(xq, k)
                nf = 0
                for i in range(nq):
                    ref = Iref[i]
                    new = Inew[i]
                    snew = set(new)
                    # no duplicates
                    self.assertTrue(len(new) == len(snew))
                    nf += len(set(ref) & snew)
                print('nfound', nh, nbit, nf)
                nfound[(nh, nbit)] = nf
            self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)])

            # test serialization
            index2 = faiss.deserialize_index_binary(
                faiss.serialize_index_binary(index))

            D2, I2 = index2.search(xq, k)
            np.testing.assert_array_equal(Inew, I2)
            np.testing.assert_array_equal(Dnew, D2)

        print('nfound=', nfound)
        self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)]))
        self.assertGreater(nfound[(3, 7)], nfound[(1, 7)])
        self.assertGreater(nfound[(5, 7)], nfound[(3, 7)])
@@ -13,7 +13,7 @@ import faiss
import tempfile
import os
import re

import warnings

from common import get_dataset, get_dataset_2

@@ -24,7 +24,6 @@ class TestModuleInterface(unittest.TestCase):
        assert re.match('^\\d+\\.\\d+\\.\\d+$', faiss.__version__)



class EvalIVFPQAccuracy(unittest.TestCase):

    def test_IndexIVFPQ(self):

@@ -506,37 +505,6 @@ class TestHNSW(unittest.TestCase):
        assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0])


class TestIOError(unittest.TestCase):

    def test_io_error(self):
        d, n = 32, 1000
        x = np.random.uniform(size=(n, d)).astype('float32')
        index = faiss.IndexFlatL2(d)
        index.add(x)
        _, fname = tempfile.mkstemp()
        try:
            faiss.write_index(index, fname)

            # should be fine
            faiss.read_index(fname)

            # now damage file
            data = open(fname, 'rb').read()
            data = data[:int(len(data) / 2)]
            open(fname, 'wb').write(data)

            # should make a nice readable exception that mentions the filename
            try:
                faiss.read_index(fname)
            except RuntimeError as e:
                if fname not in str(e):
                    raise
            else:
                raise

        finally:
            if os.path.exists(fname):
                os.unlink(fname)


class TestDistancesPositive(unittest.TestCase):

@@ -10,12 +10,8 @@ import numpy as np
import unittest
import faiss

from common import compare_binary_result_lists, make_binary_dataset

def make_binary_dataset(d, nt, nb, nq):
    assert d % 8 == 0
    rs = np.random.RandomState(123)
    x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8')
    return x[:nt], x[nt:-nq], x[-nq:]


def binary_to_float(x):

@@ -124,6 +120,29 @@ class TestBinaryFlat(unittest.TestCase):
        assert(np.all(Iflat == -1))
        assert(np.all(Dflat == 2147483647))  # NOTE(hoss): int32_t max

    def test_range_search(self):
        d = self.xq.shape[1] * 8

        index = faiss.IndexBinaryFlat(d)
        index.add(self.xb)
        D, I = index.search(self.xq, 10)
        thresh = int(np.median(D[:, -1]))

        lims, D2, I2 = index.range_search(self.xq, thresh)
        nt1 = nt2 = 0
        for i in range(len(self.xq)):
            range_res = I2[lims[i]:lims[i + 1]]
            if thresh > D[i, -1]:
                self.assertTrue(set(I[i]) <= set(range_res))
                nt1 += 1
            elif thresh < D[i, -1]:
                self.assertTrue(set(range_res) <= set(I[i]))
                nt2 += 1
            # in case of equality we have a problem with ties
        print('nb tests', nt1, nt2)
        # nb tests is actually low...
        self.assertTrue(nt1 > 19 and nt2 > 19)


class TestBinaryIVF(unittest.TestCase):

@@ -166,6 +185,29 @@ class TestBinaryIVF(unittest.TestCase):

        self.assertEqual((self.Dref == Divfflat).sum(), 4122)

    def test_ivf_range(self):
        d = self.xq.shape[1] * 8

        quantizer = faiss.IndexBinaryFlat(d)
        index = faiss.IndexBinaryIVF(quantizer, d, 8)
        index.cp.min_points_per_centroid = 5    # quiet warning
        index.nprobe = 4
        index.train(self.xt)
        index.add(self.xb)
        D, I = index.search(self.xq, 10)

        radius = int(np.median(D[:, -1]) + 1)
        Lr, Dr, Ir = index.range_search(self.xq, radius)

        for i in range(len(self.xq)):
            res = Ir[Lr[i]:Lr[i + 1]]
            if D[i, -1] < radius:
                self.assertTrue(set(I[i]) <= set(res))
            else:
                subset = I[i, D[i, :] < radius]
                self.assertTrue(set(subset) == set(res))


    def test_ivf_flat_empty(self):
        d = self.xq.shape[1] * 8

@@ -257,27 +299,6 @@ class TestHNSW(unittest.TestCase):
        self.assertTrue((Dref == Dbin).all())


def compare_binary_result_lists(D1, I1, D2, I2):
    """comparing result lists is difficult because there are many
    ties. Here we sort by (distance, index) pairs and ignore the largest
    distance of each result. Compatible result lists should pass this."""
    assert D1.shape == I1.shape == D2.shape == I2.shape
    n, k = D1.shape
    ndiff = (D1 != D2).sum()
    assert ndiff == 0, '%d differences in distance matrix %s' % (
        ndiff, D1.shape)

    def normalize_DI(D, I):
        norm = I.max() + 1.0
        Dr = D.astype('float64') + I / norm
        # ignore -1s and elements on last column
        Dr[I1 == -1] = 1e20
        Dr[D == D[:, -1:]] = 1e20
        Dr.sort(axis=1)
        return Dr
    ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum()
    assert ndiff == 0, '%d differences in normalized D matrix' % ndiff


class TestReplicasAndShards(unittest.TestCase):

@@ -0,0 +1,220 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

#!/usr/bin/env python3

import numpy as np
import unittest
import faiss
import tempfile
import os
import io
import sys
import warnings
from multiprocessing.dummy import Pool as ThreadPool

from common import get_dataset, get_dataset_2


class TestIOVariants(unittest.TestCase):

    def test_io_error(self):
        d, n = 32, 1000
        x = np.random.uniform(size=(n, d)).astype('float32')
        index = faiss.IndexFlatL2(d)
        index.add(x)
        _, fname = tempfile.mkstemp()
        try:
            faiss.write_index(index, fname)

            # should be fine
            faiss.read_index(fname)

            # now damage file
            data = open(fname, 'rb').read()
            data = data[:int(len(data) / 2)]
            open(fname, 'wb').write(data)

            # should make a nice readable exception that mentions the filename
            try:
                faiss.read_index(fname)
            except RuntimeError as e:
                if fname not in str(e):
                    raise
            else:
                raise

        finally:
            if os.path.exists(fname):
                os.unlink(fname)


class TestCallbacks(unittest.TestCase):

    def do_write_callback(self, bsz):
        d, n = 32, 1000
        x = np.random.uniform(size=(n, d)).astype('float32')
        index = faiss.IndexFlatL2(d)
        index.add(x)

        f = io.BytesIO()
        # test with small block size
        writer = faiss.PyCallbackIOWriter(f.write, 1234)

        if bsz > 0:
            writer = faiss.BufferedIOWriter(writer, bsz)

        faiss.write_index(index, writer)
        del writer    # make sure all writes committed

        if sys.version_info[0] < 3:
            buf = f.getvalue()
        else:
            buf = f.getbuffer()

        index2 = faiss.deserialize_index(np.frombuffer(buf, dtype='uint8'))

        self.assertEqual(index.d, index2.d)
        self.assertTrue(np.all(
            faiss.vector_to_array(index.xb) == faiss.vector_to_array(index2.xb)
        ))

        # This is not a callable function: should raise an exception
        writer = faiss.PyCallbackIOWriter("blabla")
        self.assertRaises(
            Exception,
            faiss.write_index, index, writer
        )

    def test_buf_read(self):
        x = np.random.uniform(size=20)

        _, fname = tempfile.mkstemp()
        try:
            x.tofile(fname)

            f = open(fname, 'rb')
            reader = faiss.PyCallbackIOReader(f.read, 1234)

            bsz = 123
            reader = faiss.BufferedIOReader(reader, bsz)

            y = np.zeros_like(x)
            print('nbytes=', y.nbytes)
            reader(faiss.swig_ptr(y), y.nbytes, 1)

            np.testing.assert_array_equal(x, y)
        finally:
            if os.path.exists(fname):
                os.unlink(fname)

    def do_read_callback(self, bsz):
        d, n = 32, 1000
        x = np.random.uniform(size=(n, d)).astype('float32')
        index = faiss.IndexFlatL2(d)
        index.add(x)

        _, fname = tempfile.mkstemp()
        try:
            faiss.write_index(index, fname)

            f = open(fname, 'rb')

            reader = faiss.PyCallbackIOReader(f.read, 1234)

            if bsz > 0:
                reader = faiss.BufferedIOReader(reader, bsz)

            index2 = faiss.read_index(reader)

            self.assertEqual(index.d, index2.d)
            np.testing.assert_array_equal(
                faiss.vector_to_array(index.xb),
                faiss.vector_to_array(index2.xb)
            )

            # This is not a callable function: should raise an exception
            reader = faiss.PyCallbackIOReader("blabla")
            self.assertRaises(
                Exception,
                faiss.read_index, reader
            )
        finally:
            if os.path.exists(fname):
                os.unlink(fname)

    def test_write_callback(self):
        self.do_write_callback(0)

    def test_write_buffer(self):
        self.do_write_callback(123)
        self.do_write_callback(2345)

    def test_read_callback(self):
        self.do_read_callback(0)

    def test_read_callback_buffered(self):
        self.do_read_callback(123)
        self.do_read_callback(12345)

    def test_read_buffer(self):
        d, n = 32, 1000
        x = np.random.uniform(size=(n, d)).astype('float32')
        index = faiss.IndexFlatL2(d)
        index.add(x)

        _, fname = tempfile.mkstemp()
        try:
            faiss.write_index(index, fname)

            reader = faiss.BufferedIOReader(
                faiss.FileIOReader(fname), 1234)

            index2 = faiss.read_index(reader)

            self.assertEqual(index.d, index2.d)
            np.testing.assert_array_equal(
                faiss.vector_to_array(index.xb),
                faiss.vector_to_array(index2.xb)
            )

        finally:
            if os.path.exists(fname):
                os.unlink(fname)


    def test_transfer_pipe(self):
        """ transfer an index through a Unix pipe """

        d, n = 32, 1000
        x = np.random.uniform(size=(n, d)).astype('float32')
        index = faiss.IndexFlatL2(d)
        index.add(x)
        Dref, Iref = index.search(x, 10)

        rf, wf = os.pipe()

        # start thread that will deserialize the index

        def index_from_pipe():
            reader = faiss.PyCallbackIOReader(lambda size: os.read(rf, size))
            return faiss.read_index(reader)

        fut = ThreadPool(1).apply_async(index_from_pipe, ())

        # write to pipe
        writer = faiss.PyCallbackIOWriter(lambda b: os.write(wf, b))
        faiss.write_index(index, writer)

        index2 = fut.get()

        # closing is not really useful but it does not hurt
        os.close(wf)
        os.close(rf)

        Dnew, Inew = index2.search(x, 10)

        np.testing.assert_array_equal(Iref, Inew)
        np.testing.assert_array_equal(Dref, Dnew)
@@ -34,6 +34,7 @@
#include <faiss/utils/Heap.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/AuxIndexStructures.h>

static const size_t BLOCKSIZE_QUERY = 8192;

@@ -484,6 +485,30 @@ void bitvec_print (const uint8_t * b, size_t d)
}


void bitvec_shuffle (size_t n, size_t da, size_t db,
                     const int *order,
                     const uint8_t *a,
                     uint8_t *b)
{
    for (size_t j = 0; j < db; j++) {
        FAISS_THROW_IF_NOT (order[j] >= 0 && order[j] < da);
    }
    size_t lda = (da + 7) / 8;
    size_t ldb = (db + 7) / 8;

#pragma omp parallel for if(n > 10000)
    for (size_t i = 0; i < n; i++) {
        const uint8_t *ai = a + i * lda;
        uint8_t *bi = b + i * ldb;
        memset (bi, 0, ldb);
        for (size_t j = 0; j < db; j++) {
            int o = order[j];
            uint8_t the_bit = (ai[o >> 3] >> (o & 7)) & 1;
            bi[j >> 3] |= the_bit << (j & 7);
        }
    }

}


@@ -527,6 +552,7 @@ void hammings_knn(
{
    hammings_knn_hc(ha, a, b, nb, ncodes, order);
}

void hammings_knn_hc (
        int_maxheap_array_t * ha,
        const uint8_t * a,

@@ -610,7 +636,66 @@ void hammings_knn_mc(
        }
    }
}

template <class HammingComputer>
static
void hamming_range_search_template (
        const uint8_t * a,
        const uint8_t * b,
        size_t na,
        size_t nb,
        int radius,
        size_t code_size,
        RangeSearchResult *res)
{

#pragma omp parallel
    {
        RangeSearchPartialResult pres (res);

#pragma omp for
        for (size_t i = 0; i < na; i++) {
            HammingComputer hc (a + i * code_size, code_size);
            const uint8_t * yi = b;
            RangeQueryResult & qres = pres.new_result (i);

            for (size_t j = 0; j < nb; j++) {
                int dis = hc.hamming (yi);
                if (dis < radius) {
                    qres.add(dis, j);
                }
                yi += code_size;
            }
        }
        pres.finalize ();
    }
}

void hamming_range_search (
        const uint8_t * a,
        const uint8_t * b,
        size_t na,
        size_t nb,
        int radius,
        size_t code_size,
        RangeSearchResult *result)
{

#define HC(name) hamming_range_search_template<name> (a, b, na, nb, radius, code_size, result)

    switch(code_size) {
    case 4:  HC(HammingComputer4);  break;
    case 8:  HC(HammingComputer8);  break;
    case 16: HC(HammingComputer16); break;
    case 32: HC(HammingComputer32); break;
    default:
        if (code_size % 8 == 0) {
            HC(HammingComputerM8);
        } else {
            HC(HammingComputerDefault);
        }
    }
#undef HC
}

@@ -39,6 +39,7 @@ namespace faiss {
 * General bit vector functions
 **************************************************/

struct RangeSearchResult;

void bitvec_print (const uint8_t * b, size_t d);

@@ -65,6 +66,14 @@ void bitvecs2fvecs (

void fvec2bitvec (const float * x, uint8_t * b, size_t d);

/** Shuffle the bits: for each vector, output bit j takes the value of
 *  input bit order[j], i.e. b(i, j) := a(i, order[j]) */
void bitvec_shuffle (size_t n, size_t da, size_t db,
                     const int *order,
                     const uint8_t *a,
                     uint8_t *b);

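A small illustration of the b(i, j) := a(i, order[j]) convention through the Python wrapper (a is assumed to be a packed uint8 array of shape (n, d // 8), with d a multiple of 8; order must be int32):

    import numpy as np
    import faiss

    order = np.arange(d, dtype='int32')[::-1].copy()   # output bit j = input bit d-1-j
    b = np.empty((n, d // 8), dtype='uint8')
    faiss.bitvec_shuffle(n, d, d,
                         faiss.swig_ptr(order),
                         faiss.swig_ptr(a), faiss.swig_ptr(b))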
/***********************************************
 * Generic reader/writer for bit strings
 ***********************************************/

@@ -171,6 +180,17 @@ void hammings_knn_mc (
        int32_t *distances,
        int64_t *labels);

/** same as hammings_knn except we are doing a range search with radius */
void hamming_range_search (
        const uint8_t * a,
        const uint8_t * b,
        size_t na,
        size_t nb,
        int radius,
        size_t ncodes,
        RangeSearchResult *result);

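This is the kernel behind the IndexBinaryFlat::range_search added in this patch; as in the implementation above, the comparison is strict, so only distances < radius are reported. A sketch of the user-facing path (d, xb, xq and radius assumed defined):

    index = faiss.IndexBinaryFlat(d)
    index.add(xb)                                   # packed uint8 codes
    lims, D, I = index.range_search(xq, radius)     # D holds Hamming distances < radius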
/* Counting the number of matches or of cross-matches (without returning them)
   For use with functions that assume pre-allocated memory */
void hamming_count_thres (