2017-02-23 06:26:44 +08:00
|
|
|
/**
|
|
|
|
* Copyright (c) 2015-present, Facebook, Inc.
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
2017-07-30 15:18:45 +08:00
|
|
|
* This source code is licensed under the BSD+Patents license found in the
|
2017-02-23 06:26:44 +08:00
|
|
|
* LICENSE file in the root directory of this source tree.
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Copyright 2004-present Facebook. All Rights Reserved.
|
|
|
|
// -*- c++ -*-
|
|
|
|
|
|
|
|
#ifndef FAISS_INDEX_IVF_H
|
|
|
|
#define FAISS_INDEX_IVF_H
|
|
|
|
|
|
|
|
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
|
|
|
|
#include "Index.h"
|
|
|
|
#include "Clustering.h"
|
|
|
|
#include "Heap.h"
|
|
|
|
|
|
|
|
|
|
|
|
namespace faiss {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** Index based on a inverted file (IVF)
|
|
|
|
*
|
|
|
|
* In the inverted file, the quantizer (an Index instance) provides a
|
|
|
|
* quantization index for each vector to be added. The quantization
|
|
|
|
* index maps to a list (aka inverted list or posting list), where the
|
|
|
|
* id of the vector is then stored.
|
|
|
|
*
|
|
|
|
* At search time, the vector to be searched is also quantized, and
|
|
|
|
* only the list corresponding to the quantization index is
|
|
|
|
* searched. This speeds up the search by making it
|
|
|
|
* non-exhaustive. This can be relaxed using multi-probe search: a few
|
|
|
|
* (nprobe) quantization indices are selected and several inverted
|
|
|
|
* lists are visited.
|
|
|
|
*
|
|
|
|
* Sub-classes implement a post-filtering of the index that refines
|
|
|
|
* the distance estimation from the query to databse vectors.
|
|
|
|
*/
|
|
|
|
struct IndexIVF: Index {
|
|
|
|
size_t nlist; ///< number of possible key values
|
|
|
|
size_t nprobe; ///< number of probes at query time
|
|
|
|
|
|
|
|
Index * quantizer; ///< quantizer that maps vectors to inverted lists
|
|
|
|
bool quantizer_trains_alone; ///< just pass over the trainset to quantizer
|
|
|
|
bool own_fields; ///< whether object owns the quantizer
|
|
|
|
|
|
|
|
ClusteringParameters cp; ///< to override default clustering params
|
|
|
|
|
|
|
|
std::vector < std::vector<long> > ids; ///< Inverted lists for indexes
|
|
|
|
|
|
|
|
/// map for direct access to the elements. Enables reconstruct().
|
|
|
|
bool maintain_direct_map;
|
|
|
|
std::vector <long> direct_map;
|
|
|
|
|
|
|
|
/** The Inverted file takes a quantizer (an Index) on input,
|
|
|
|
* which implements the function mapping a vector to a list
|
|
|
|
* identifier. The pointer is borrowed: the quantizer should not
|
|
|
|
* be deleted while the IndexIVF is in use.
|
|
|
|
*/
|
|
|
|
IndexIVF (Index * quantizer, size_t d, size_t nlist,
|
|
|
|
MetricType metric = METRIC_INNER_PRODUCT);
|
|
|
|
|
2017-06-21 21:54:28 +08:00
|
|
|
void reset() override;
|
2017-02-23 06:26:44 +08:00
|
|
|
|
|
|
|
/// Trains the quantizer and calls train_residual to train sub-quantizers
|
2017-06-21 21:54:28 +08:00
|
|
|
void train(idx_t n, const float* x) override;
|
2017-02-23 06:26:44 +08:00
|
|
|
|
|
|
|
/// Quantizes x and calls add_with_key
|
2017-06-21 21:54:28 +08:00
|
|
|
void add(idx_t n, const float* x) override;
|
2017-02-23 06:26:44 +08:00
|
|
|
|
|
|
|
/// Sub-classes that encode the residuals can train their encoders here
|
|
|
|
/// does nothing by default
|
|
|
|
virtual void train_residual (idx_t n, const float *x);
|
|
|
|
|
|
|
|
/** moves the entries from another dataset to self. On output,
|
|
|
|
* other is empty. add_id is added to all moved ids (for
|
|
|
|
* sequential ids, this would be this->ntotal */
|
|
|
|
virtual void merge_from (IndexIVF &other, idx_t add_id);
|
|
|
|
|
|
|
|
/** implemented by sub-classes */
|
|
|
|
virtual void merge_from_residuals (IndexIVF &other) = 0;
|
|
|
|
|
2017-06-21 21:54:28 +08:00
|
|
|
~IndexIVF() override;
|
|
|
|
|
|
|
|
size_t get_list_size (size_t list_no) const
|
|
|
|
{ return ids[list_no].size(); }
|
|
|
|
|
2017-07-18 17:51:27 +08:00
|
|
|
/** intialize a direct map
|
|
|
|
*
|
|
|
|
* @param new_maintain_direct_map if true, create a direct map,
|
|
|
|
* else clear it
|
|
|
|
*/
|
|
|
|
void make_direct_map (bool new_maintain_direct_map=true);
|
2017-02-23 06:26:44 +08:00
|
|
|
|
|
|
|
/// 1= perfectly balanced, >1: imbalanced
|
|
|
|
double imbalance_factor () const;
|
|
|
|
|
|
|
|
/// display some stats about the inverted lists
|
|
|
|
void print_stats () const;
|
|
|
|
|
|
|
|
IndexIVF ();
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2017-04-06 19:33:41 +08:00
|
|
|
struct IndexIVFFlatStats {
|
|
|
|
size_t nq; // nb of queries run
|
|
|
|
size_t nlist; // nb of inverted lists scanned
|
|
|
|
size_t ndis; // nb of distancs computed
|
|
|
|
size_t npartial; // nb of bound computations (IndexIVFFlatIPBounds)
|
|
|
|
|
|
|
|
IndexIVFFlatStats () {reset (); }
|
|
|
|
void reset ();
|
|
|
|
};
|
|
|
|
|
|
|
|
// global var that collects them all
|
|
|
|
extern IndexIVFFlatStats indexIVFFlat_stats;
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-02-23 06:26:44 +08:00
|
|
|
|
|
|
|
|
|
|
|
/** Inverted file with stored vectors. Here the inverted file
|
|
|
|
* pre-selects the vectors to be searched, but they are not otherwise
|
|
|
|
* encoded.
|
|
|
|
*/
|
|
|
|
struct IndexIVFFlat: IndexIVF {
|
|
|
|
/** Inverted list of original vectors. Each list is a nl * d
|
|
|
|
* matrix, where nl is the nb of vectors stored in the list. */
|
|
|
|
std::vector < std::vector<float> > vecs;
|
|
|
|
|
|
|
|
IndexIVFFlat (
|
|
|
|
Index * quantizer, size_t d, size_t nlist_,
|
|
|
|
MetricType = METRIC_INNER_PRODUCT);
|
|
|
|
|
|
|
|
/// same as add_with_ids, with precomputed coarse quantizer
|
2017-04-06 19:33:41 +08:00
|
|
|
virtual void add_core (idx_t n, const float * x, const long *xids,
|
2017-02-23 06:26:44 +08:00
|
|
|
const long *precomputed_idx);
|
|
|
|
|
|
|
|
/// implemented for all IndexIVF* classes
|
2017-06-21 21:54:28 +08:00
|
|
|
void add_with_ids(idx_t n, const float* x, const long* xids) override;
|
|
|
|
|
|
|
|
void search(
|
|
|
|
idx_t n,
|
|
|
|
const float* x,
|
|
|
|
idx_t k,
|
|
|
|
float* distances,
|
|
|
|
idx_t* labels) const override;
|
|
|
|
|
|
|
|
/// perform search, without computing the assignment to the quantizer
|
|
|
|
void search_preassigned (idx_t n, const float *x, idx_t k,
|
|
|
|
const idx_t *assign,
|
|
|
|
float *distances, idx_t *labels) const;
|
|
|
|
|
|
|
|
void range_search(
|
|
|
|
idx_t n,
|
|
|
|
const float* x,
|
|
|
|
float radius,
|
|
|
|
RangeSearchResult* result) const override;
|
2017-02-23 06:26:44 +08:00
|
|
|
|
|
|
|
/** copy a subset of the entries index to the other index
|
|
|
|
*
|
|
|
|
* if subset_type == 0: copies ids in [a1, a2)
|
|
|
|
* if subset_type == 1: copies ids if id % a1 == a2
|
|
|
|
*/
|
|
|
|
void copy_subset_to (IndexIVFFlat & other, int subset_type,
|
|
|
|
long a1, long a2) const;
|
|
|
|
|
2017-06-21 21:54:28 +08:00
|
|
|
void reset() override;
|
2017-02-23 06:26:44 +08:00
|
|
|
|
2017-06-21 21:54:28 +08:00
|
|
|
long remove_ids(const IDSelector& sel) override;
|
2017-02-23 06:26:44 +08:00
|
|
|
|
|
|
|
/// Implementation of the search for the inner product metric
|
|
|
|
void search_knn_inner_product (
|
|
|
|
size_t nx, const float * x,
|
|
|
|
const long * keys,
|
|
|
|
float_minheap_array_t * res) const;
|
|
|
|
|
|
|
|
/// Implementation of the search for the L2 metric
|
|
|
|
void search_knn_L2sqr (
|
|
|
|
size_t nx, const float * x,
|
|
|
|
const long * keys,
|
|
|
|
float_maxheap_array_t * res) const;
|
|
|
|
|
2017-07-18 17:51:27 +08:00
|
|
|
/** Update a subset of vectors.
|
|
|
|
*
|
|
|
|
* The index must have a direct_map
|
|
|
|
*
|
|
|
|
* @param nv nb of vectors to update
|
|
|
|
* @param idx vector indices to update, size nv
|
|
|
|
* @param v vectors of new values, size nv*d
|
|
|
|
*/
|
|
|
|
void update_vectors (int nv, idx_t *idx, const float *v);
|
|
|
|
|
2017-06-21 21:54:28 +08:00
|
|
|
void reconstruct(idx_t key, float* recons) const override;
|
2017-02-23 06:26:44 +08:00
|
|
|
|
2017-06-21 21:54:28 +08:00
|
|
|
void merge_from_residuals(IndexIVF& other) override;
|
2017-02-23 06:26:44 +08:00
|
|
|
|
|
|
|
IndexIVFFlat () {}
|
|
|
|
};
|
|
|
|
|
2017-04-06 19:33:41 +08:00
|
|
|
struct IndexIVFFlatIPBounds: IndexIVFFlat {
|
|
|
|
|
|
|
|
/// nb of dimensions of pre-filter
|
|
|
|
size_t fsize;
|
|
|
|
|
|
|
|
/// norm of remainder (dimensions fsize:d)
|
|
|
|
std::vector<std::vector<float> > part_norms;
|
|
|
|
|
|
|
|
IndexIVFFlatIPBounds (
|
|
|
|
Index * quantizer, size_t d, size_t nlist,
|
|
|
|
size_t fsize);
|
2017-02-23 06:26:44 +08:00
|
|
|
|
2017-06-21 21:54:28 +08:00
|
|
|
void add_core(
|
|
|
|
idx_t n,
|
|
|
|
const float* x,
|
|
|
|
const long* xids,
|
|
|
|
const long* precomputed_idx) override;
|
|
|
|
|
|
|
|
void search(
|
|
|
|
idx_t n,
|
|
|
|
const float* x,
|
|
|
|
idx_t k,
|
|
|
|
float* distances,
|
|
|
|
idx_t* labels) const override;
|
2017-04-06 19:33:41 +08:00
|
|
|
};
|
2017-02-23 06:26:44 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} // namespace faiss
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#endif
|