167 lines
4.8 KiB
C++
167 lines
4.8 KiB
C++
|
|
/**
|
|
* Copyright (c) 2015-present, Facebook, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under the CC-by-NC license found in the
|
|
* LICENSE file in the root directory of this source tree.
|
|
*/
|
|
|
|
// Copyright 2004-present Facebook. All Rights Reserved.
|
|
// -*- c++ -*-
|
|
|
|
#ifndef FAISS_INDEX_PQ_H
|
|
#define FAISS_INDEX_PQ_H
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <vector>
|
|
|
|
#include "Index.h"
|
|
#include "ProductQuantizer.h"
|
|
#include "PolysemousTraining.h"
|
|
|
|
namespace faiss {
|
|
|
|
|
|
/** Index based on a product quantizer. Stored vectors are
|
|
* approximated by PQ codes. */
|
|
struct IndexPQ: Index {
|
|
|
|
/// The product quantizer used to encode the vectors
|
|
ProductQuantizer pq;
|
|
|
|
/// Codes. Size ntotal * pq.code_size
|
|
std::vector<uint8_t> codes;
|
|
|
|
/** Constructor.
|
|
*
|
|
* @param d dimensionality of the input vectors
|
|
* @param M number of subquantizers
|
|
* @param nbits number of bit per subvector index
|
|
*/
|
|
IndexPQ (int d, ///< dimensionality of the input vectors
|
|
size_t M, ///< number of subquantizers
|
|
size_t nbits, ///< number of bit per subvector index
|
|
MetricType metric = METRIC_L2);
|
|
|
|
IndexPQ ();
|
|
|
|
virtual void set_typename () override;
|
|
|
|
virtual void train (idx_t n, const float *x) override;
|
|
|
|
virtual void add (idx_t n, const float *x) override;
|
|
|
|
virtual void search (
|
|
idx_t n, const float *x, idx_t k,
|
|
float *distances, idx_t *labels) const override;
|
|
|
|
virtual void reset() override;
|
|
|
|
virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons)
|
|
const override;
|
|
|
|
virtual void reconstruct (idx_t key, float * recons)
|
|
const override;
|
|
|
|
/******************************************************
|
|
* Polysemous codes implementation
|
|
******************************************************/
|
|
bool do_polysemous_training; ///< false = standard PQ
|
|
|
|
/// parameters used for the polysemous training
|
|
PolysemousTraining polysemous_training;
|
|
|
|
/// how to perform the search in search_core
|
|
enum Search_type_t {
|
|
ST_PQ, ///< asymmetric product quantizer (default)
|
|
ST_HE, ///< Hamming distance on codes
|
|
ST_generalized_HE, ///< nb of same codes
|
|
ST_SDC, ///< symmetric product quantizer (SDC)
|
|
ST_polysemous, ///< HE filter (using ht) + PQ combination
|
|
ST_polysemous_generalize, ///< Filter on generalized Hamming
|
|
};
|
|
|
|
Search_type_t search_type;
|
|
|
|
// just encode the sign of the components, instead of using the PQ encoder
|
|
// used only for the queries
|
|
bool encode_signs;
|
|
|
|
/// Hamming threshold used for polysemy
|
|
int polysemous_ht;
|
|
|
|
// actual polysemous search
|
|
void search_core_polysemous (idx_t n, const float *x, idx_t k,
|
|
float *distances, idx_t *labels) const;
|
|
|
|
/// prepare query for a polysemous search, but instead of
|
|
/// computing the result, just get the histogram of Hamming
|
|
/// distances. May be computed on a provided dataset if xb != NULL
|
|
/// @param dist_histogram (M * nbits + 1)
|
|
void hamming_distance_histogram (idx_t n, const float *x,
|
|
idx_t nb, const float *xb,
|
|
long *dist_histogram);
|
|
|
|
/** compute pairwise distances between queries and database
|
|
*
|
|
* @param n nb of query vectors
|
|
* @param x query vector, size n * d
|
|
* @param dis output distances, size n * ntotal
|
|
*/
|
|
void hamming_distance_table (idx_t n, const float *x,
|
|
int32_t *dis) const;
|
|
|
|
};
|
|
|
|
|
|
/// Search statistics for IndexPQ. They are robust to internal
/// threading, but not to IndexPQ::search being called from
/// multiple threads.
struct IndexPQStats {
    size_t nq;       ///< nb of queries run
    size_t ncode;    ///< nb of codes visited

    /// nb of passed Hamming distance tests (for polysemy)
    size_t n_hamming_pass;

    /// Calls reset () on construction; reset () itself is implemented
    /// outside this header.
    IndexPQStats () {
        reset ();
    }

    void reset ();
};
|
|
|
|
/// Global IndexPQStats object; this is only the declaration, the
/// definition is not part of this header.
extern IndexPQStats indexPQ_stats;
|
|
|
|
|
|
|
|
/** Quantizer where centroids are virtual: they are the Cartesian
|
|
* product of sub-centroids. */
|
|
struct MultiIndexQuantizer: Index {
|
|
ProductQuantizer pq;
|
|
|
|
MultiIndexQuantizer (int d, ///< dimension of the input vectors
|
|
size_t M, ///< number of subquantizers
|
|
size_t nbits); ///< number of bit per subvector index
|
|
|
|
virtual void set_typename ();
|
|
|
|
virtual void train (idx_t n, const float *x);
|
|
|
|
|
|
virtual void search (idx_t n, const float *x, idx_t k,
|
|
float *distances, idx_t *labels) const;
|
|
|
|
/// add and reset will crash at runtime
|
|
virtual void add (idx_t n, const float *x);
|
|
virtual void reset ();
|
|
|
|
MultiIndexQuantizer () {}
|
|
|
|
virtual void reconstruct (idx_t key, float * recons) const;
|
|
};
|
|
|
|
|
|
} // namespace faiss
|
|
|
|
|
|
|
|
#endif
|