200 lines
5.6 KiB
C++
200 lines
5.6 KiB
C++
/**
|
|
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
*
|
|
* This source code is licensed under the MIT license found in the
|
|
* LICENSE file in the root directory of this source tree.
|
|
*/
|
|
|
|
// -*- c++ -*-
|
|
|
|
#ifndef FAISS_INDEX_PQ_H
|
|
#define FAISS_INDEX_PQ_H
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <vector>
|
|
|
|
#include <faiss/Index.h>
|
|
#include <faiss/impl/ProductQuantizer.h>
|
|
#include <faiss/impl/PolysemousTraining.h>
|
|
|
|
namespace faiss {
|
|
|
|
|
|
/** Index based on a product quantizer. Stored vectors are
|
|
* approximated by PQ codes. */
|
|
struct IndexPQ: Index {
|
|
|
|
/// The product quantizer used to encode the vectors
|
|
ProductQuantizer pq;
|
|
|
|
/// Codes. Size ntotal * pq.code_size
|
|
std::vector<uint8_t> codes;
|
|
|
|
/** Constructor.
|
|
*
|
|
* @param d dimensionality of the input vectors
|
|
* @param M number of subquantizers
|
|
* @param nbits number of bit per subvector index
|
|
*/
|
|
IndexPQ (int d, ///< dimensionality of the input vectors
|
|
size_t M, ///< number of subquantizers
|
|
size_t nbits, ///< number of bit per subvector index
|
|
MetricType metric = METRIC_L2);
|
|
|
|
IndexPQ ();
|
|
|
|
void train(idx_t n, const float* x) override;
|
|
|
|
void add(idx_t n, const float* x) override;
|
|
|
|
void search(
|
|
idx_t n,
|
|
const float* x,
|
|
idx_t k,
|
|
float* distances,
|
|
idx_t* labels) const override;
|
|
|
|
void reset() override;
|
|
|
|
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
|
|
|
|
void reconstruct(idx_t key, float* recons) const override;
|
|
|
|
size_t remove_ids(const IDSelector& sel) override;
|
|
|
|
/* The standalone codec interface */
|
|
size_t sa_code_size () const override;
|
|
|
|
void sa_encode (idx_t n, const float *x,
|
|
uint8_t *bytes) const override;
|
|
|
|
void sa_decode (idx_t n, const uint8_t *bytes,
|
|
float *x) const override;
|
|
|
|
|
|
DistanceComputer * get_distance_computer() const override;
|
|
|
|
/******************************************************
|
|
* Polysemous codes implementation
|
|
******************************************************/
|
|
bool do_polysemous_training; ///< false = standard PQ
|
|
|
|
/// parameters used for the polysemous training
|
|
PolysemousTraining polysemous_training;
|
|
|
|
/// how to perform the search in search_core
|
|
enum Search_type_t {
|
|
ST_PQ, ///< asymmetric product quantizer (default)
|
|
ST_HE, ///< Hamming distance on codes
|
|
ST_generalized_HE, ///< nb of same codes
|
|
ST_SDC, ///< symmetric product quantizer (SDC)
|
|
ST_polysemous, ///< HE filter (using ht) + PQ combination
|
|
ST_polysemous_generalize, ///< Filter on generalized Hamming
|
|
};
|
|
|
|
Search_type_t search_type;
|
|
|
|
// just encode the sign of the components, instead of using the PQ encoder
|
|
// used only for the queries
|
|
bool encode_signs;
|
|
|
|
/// Hamming threshold used for polysemy
|
|
int polysemous_ht;
|
|
|
|
// actual polysemous search
|
|
void search_core_polysemous (idx_t n, const float *x, idx_t k,
|
|
float *distances, idx_t *labels) const;
|
|
|
|
/// prepare query for a polysemous search, but instead of
|
|
/// computing the result, just get the histogram of Hamming
|
|
/// distances. May be computed on a provided dataset if xb != NULL
|
|
/// @param dist_histogram (M * nbits + 1)
|
|
void hamming_distance_histogram (idx_t n, const float *x,
|
|
idx_t nb, const float *xb,
|
|
int64_t *dist_histogram);
|
|
|
|
/** compute pairwise distances between queries and database
|
|
*
|
|
* @param n nb of query vectors
|
|
* @param x query vector, size n * d
|
|
* @param dis output distances, size n * ntotal
|
|
*/
|
|
void hamming_distance_table (idx_t n, const float *x,
|
|
int32_t *dis) const;
|
|
|
|
};
|
|
|
|
|
|
/// statistics are robust to internal threading, but not if
|
|
/// IndexPQ::search is called by multiple threads
|
|
struct IndexPQStats {
|
|
size_t nq; // nb of queries run
|
|
size_t ncode; // nb of codes visited
|
|
|
|
size_t n_hamming_pass; // nb of passed Hamming distance tests (for polysemy)
|
|
|
|
IndexPQStats () {reset (); }
|
|
void reset ();
|
|
};
|
|
|
|
extern IndexPQStats indexPQ_stats;
|
|
|
|
|
|
|
|
/** Quantizer where centroids are virtual: they are the Cartesian
|
|
* product of sub-centroids. */
|
|
struct MultiIndexQuantizer: Index {
|
|
ProductQuantizer pq;
|
|
|
|
MultiIndexQuantizer (int d, ///< dimension of the input vectors
|
|
size_t M, ///< number of subquantizers
|
|
size_t nbits); ///< number of bit per subvector index
|
|
|
|
void train(idx_t n, const float* x) override;
|
|
|
|
void search(
|
|
idx_t n, const float* x, idx_t k,
|
|
float* distances, idx_t* labels) const override;
|
|
|
|
/// add and reset will crash at runtime
|
|
void add(idx_t n, const float* x) override;
|
|
void reset() override;
|
|
|
|
MultiIndexQuantizer () {}
|
|
|
|
void reconstruct(idx_t key, float* recons) const override;
|
|
};
|
|
|
|
|
|
/** MultiIndexQuantizer where the PQ assignmnet is performed by sub-indexes
|
|
*/
|
|
struct MultiIndexQuantizer2: MultiIndexQuantizer {
|
|
|
|
/// M Indexes on d / M dimensions
|
|
std::vector<Index*> assign_indexes;
|
|
bool own_fields;
|
|
|
|
MultiIndexQuantizer2 (
|
|
int d, size_t M, size_t nbits,
|
|
Index **indexes);
|
|
|
|
MultiIndexQuantizer2 (
|
|
int d, size_t nbits,
|
|
Index *assign_index_0,
|
|
Index *assign_index_1);
|
|
|
|
void train(idx_t n, const float* x) override;
|
|
|
|
void search(
|
|
idx_t n, const float* x, idx_t k,
|
|
float* distances, idx_t* labels) const override;
|
|
|
|
};
|
|
|
|
|
|
} // namespace faiss
|
|
|
|
|
|
#endif
|