faiss/demos/demo_imi_pq.cpp

/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <vector>
#include <sys/time.h>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexPQ.h>
#include <faiss/index_io.h>

double elapsed ()
{
    struct timeval tv;
    gettimeofday (&tv, nullptr);
    return tv.tv_sec + tv.tv_usec * 1e-6;
}

int main ()
{
    double t0 = elapsed();

    // dimension of the vectors to index
    int d = 64;

    // size of the database we plan to index
    size_t nb = 1000 * 1000;
    size_t add_bs = 10000; // size of the blocks to add

    // make a set of nt training vectors in the unit cube
    // (could be the database)
    size_t nt = 100 * 1000;

    //---------------------------------------------------------------
    // Define the core quantizer
    //
    // We choose a multiple inverted index for faster training with less data
    // and because it usually offers the best accuracy/speed trade-off.
    //
    // We here assume that the lifespan of this coarse quantizer will cover
    // the lifespan of the inverted-file index IndexIVFPQ below.
    // With dynamic allocation, one may give the responsibility to free the
    // quantizer to the inverted-file index (with attribute own_fields).
    //
    // Note: a regular clustering algorithm would be defined as:
    //       faiss::IndexFlatL2 coarse_quantizer (d);
    //
    // Use nhash=2 subquantizers to define the product coarse quantizer.
    // Number of bits: we will have 2^nbits_subq centroids per subquantizer,
    //                 meaning (2^nbits_subq)^nhash distinct inverted lists.
    //
    // The parameter bytes_per_code is determined by the memory
    // constraint: the dataset will use nb * (bytes_per_code + 8)
    // bytes.
    //
    // The parameter nbits_subq is determined by the size of the dataset
    // to index.
    //
    size_t nhash = 2;
    size_t nbits_subq = 9;
    size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
    int bytes_per_code = 16;
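    // with these settings, ncentroids = (2^9)^2 = 262144 inverted lists, and
    // the codes + ids stored for the dataset take about nb * (16 + 8) = 24 MB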

    faiss::MultiIndexQuantizer coarse_quantizer (d, nhash, nbits_subq);

    printf ("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)\n",
            nhash, nbits_subq, ncentroids, nb);

    // the coarse quantizer should not be deallocated before the index
    // bytes_per_code = nb of bytes per PQ code (d must be a multiple of this)
    // 8 = nb of bits per sub-code (almost always 8)
    faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
    faiss::IndexIVFPQ index (&coarse_quantizer, d, ncentroids,
                             bytes_per_code, 8);
    index.quantizer_trains_alone = true;

    // define the number of probes. 2048 is for high-dim, overkill in practice
    // (use 4-1024 depending on the speed/accuracy trade-off that you want)
    index.nprobe = 2048;
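    // (with 262144 inverted lists, nprobe = 2048 scans well under 1% of the
    // lists for each query)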

    { // training.

        // The distribution of the training vectors should be the same
        // as the database vectors. It could be a sub-sample of the
        // database vectors, if sampling is not biased. Here we just
        // randomly generate the vectors.

        printf ("[%.3f s] Generating %ld vectors in %dD for training\n",
                elapsed() - t0, nt, d);

        std::vector <float> trainvecs (nt * d);
        for (size_t i = 0; i < nt; i++) {
            for (size_t j = 0; j < d; j++) {
                trainvecs[i * d + j] = drand48();
            }
        }

        printf ("[%.3f s] Training the index\n", elapsed() - t0);
        index.verbose = true;
        index.train (nt, trainvecs.data());
    }
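
    // at this point index.is_trained is true: the IMI coarse quantizer and
    // the PQ codebooks have both been trained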

    // the index can be re-loaded later with
    // faiss::Index * idx = faiss::read_index("/tmp/trained_index.faissindex");
    faiss::write_index(&index, "/tmp/trained_index.faissindex");
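
    // Sketch (not part of this demo): to reuse the trained index elsewhere,
    // one could read it back and tune search-time parameters, e.g.
    //
    //   faiss::Index *idx = faiss::read_index("/tmp/trained_index.faissindex");
    //   if (auto *ivfpq = dynamic_cast<faiss::IndexIVFPQ*>(idx)) {
    //       ivfpq->nprobe = 256; // pick a speed/accuracy trade-off
    //   }
    //   ... add vectors, search ...
    //   delete idx;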

    size_t nq;
    std::vector<float> queries;

    { // populating the database
        printf ("[%.3f s] Building a dataset of %ld vectors to index\n",
                elapsed() - t0, nb);

        std::vector <float> database (nb * d);
        std::vector <faiss::Index::idx_t> ids (nb);
        for (size_t i = 0; i < nb; i++) {
            for (size_t j = 0; j < d; j++) {
                database[i * d + j] = drand48();
            }
            ids[i] = 8760000000L + i;
        }
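        // these ids are arbitrary 64-bit values; search() reports them
        // (rather than sequential positions) for the nearest neighbors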
printf ("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
for (size_t begin = 0; begin < nb; begin += add_bs) {
size_t end = std::min (begin + add_bs, nb);
index.add_with_ids (end - begin,
database.data() + d * begin,
ids.data() + begin);
}
// remember a few elements from the database as queries
int i0 = 1234;
int i1 = 1244;
nq = i1 - i0;
queries.resize (nq * d);
for (int i = i0; i < i1; i++) {
for (int j = 0; j < d; j++) {
queries [(i - i0) * d + j] = database [i * d + j];
}
}
}

    // A few notes on the internal format of the index:
    //
    // - the posting lists for PQ codes are index.codes, which is a
    //   std::vector < std::vector<uint8_t> >
    //   if n is the length of posting list #i, codes[i] has length
    //   bytes_per_code * n
    //
    // - the corresponding ids are stored in index.ids
    //
    // - given a vector float *x, finding which k centroids are
    //   closest to it (ie to find the nearest neighbors) can be done with
    //
    //   long *centroid_ids = new long[k];
    //   float *distances = new float[k];
    //   index.quantizer->search (1, x, k, distances, centroid_ids);
    //
    faiss::write_index(&index, "/tmp/populated_index.faissindex");

    { // searching the database
        int k = 5;

        printf ("[%.3f s] Searching the %d nearest neighbors "
                "of %ld vectors in the index\n",
                elapsed() - t0, k, nq);

        std::vector<faiss::Index::idx_t> nns (k * nq);
        std::vector<float>               dis (k * nq);

        index.search (nq, queries.data(), k, dis.data(), nns.data());
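
        // the queries are exact copies of database vectors, so each query's
        // own id (8760000000 + 1234 + i) should come back first, with a small
        // but non-zero distance since PQ-encoded distances are approximate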
printf ("[%.3f s] Query results (vector ids, then distances):\n",
elapsed() - t0);
for (int i = 0; i < nq; i++) {
printf ("query %2d: ", i);
for (int j = 0; j < k; j++) {
printf ("%7ld ", nns[j + i * k]);
}
printf ("\n dis: ");
for (int j = 0; j < k; j++) {
printf ("%7g ", dis[j + i * k]);
}
printf ("\n");
}
}
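
    // coarse_quantizer is declared before index, so it is destroyed after it;
    // this satisfies the lifespan requirement noted above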

    return 0;
}