// faiss/IndexIVFPQ.cpp
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include "IndexIVFPQ.h"
#include <cmath>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <algorithm>
#include "Heap.h"
#include "utils.h"
#include "Clustering.h"
#include "IndexFlat.h"
#include "hamming.h"
#include "FaissAssert.h"
#include "AuxIndexStructures.h"
namespace faiss {
/*****************************************
* IndexIVFPQ implementation
******************************************/
IndexIVFPQ::IndexIVFPQ (Index * quantizer, size_t d, size_t nlist,
size_t M, size_t nbits_per_idx):
IndexIVF (quantizer, d, nlist, 0, METRIC_L2),
pq (d, M, nbits_per_idx)
{
FAISS_THROW_IF_NOT (nbits_per_idx <= 8);
code_size = pq.code_size;
invlists->code_size = code_size;
is_trained = false;
by_residual = true;
use_precomputed_table = 0;
scan_table_threshold = 0;
polysemous_training = nullptr;
do_polysemous_training = false;
polysemous_ht = 0;
}
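/* Typical usage, as a minimal sketch (the dimensions and parameters
* below are illustrative, not prescribed by this file):
*
*   IndexFlatL2 coarse (128);                      // coarse quantizer
*   IndexIVFPQ index (&coarse, 128, 1024, 16, 8);  // d, nlist, M, nbits
*   index.train (nt, xt);    // trains the coarse quantizer and the PQ
*   index.add (nb, xb);
*   index.nprobe = 8;        // inherited from IndexIVF
*   index.search (nq, xq, 10, D, I);
*/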
/****************************************************************
* training */
void IndexIVFPQ::train_residual (idx_t n, const float *x)
{
train_residual_o (n, x, nullptr);
}
void IndexIVFPQ::train_residual_o (idx_t n, const float *x, float *residuals_2)
{
const float * x_in = x;
x = fvecs_maybe_subsample (
d, (size_t*)&n, pq.cp.max_points_per_centroid * pq.ksub,
x, verbose, pq.cp.seed);
ScopeDeleter<float> del_x (x_in == x ? nullptr : x);
const float *trainset;
ScopeDeleter<float> del_residuals;
if (by_residual) {
if(verbose) printf("computing residuals\n");
idx_t * assign = new idx_t [n]; // assignment to coarse centroids
ScopeDeleter<idx_t> del (assign);
quantizer->assign (n, x, assign);
float *residuals = new float [n * d];
del_residuals.set (residuals);
for (idx_t i = 0; i < n; i++)
quantizer->compute_residual (x + i * d, residuals+i*d, assign[i]);
trainset = residuals;
} else {
trainset = x;
}
if (verbose)
printf ("training %zdx%zd product quantizer on %ld vectors in %dD\n",
pq.M, pq.ksub, n, d);
pq.verbose = verbose;
pq.train (n, trainset);
if (do_polysemous_training) {
if (verbose)
printf("doing polysemous training for PQ\n");
PolysemousTraining default_pt;
PolysemousTraining *pt = polysemous_training;
if (!pt) pt = &default_pt;
pt->optimize_pq_for_hamming (pq, n, trainset);
}
// prepare second-level residuals for refine PQ
if (residuals_2) {
uint8_t *train_codes = new uint8_t [pq.code_size * n];
ScopeDeleter<uint8_t> del (train_codes);
pq.compute_codes (trainset, train_codes, n);
for (idx_t i = 0; i < n; i++) {
const float *xx = trainset + i * d;
float * res = residuals_2 + i * d;
pq.decode (train_codes + i * pq.code_size, res);
for (int j = 0; j < d; j++)
res[j] = xx[j] - res[j];
}
}
if (by_residual) {
precompute_table ();
}
}
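/* After training, a database vector x is stored (see add_core_o) as the
* PQ code of its residual: code(x) = pq(x - centroid[assign(x)]).
* A sketch of the equivalent per-vector arithmetic, assuming by_residual:
*
*   idx_t list_no;
*   quantizer->assign (1, x, &list_no);
*   std::vector<float> r (d);
*   quantizer->compute_residual (x, r.data(), list_no);
*   std::vector<uint8_t> code (pq.code_size);
*   pq.compute_code (r.data(), code.data());
*/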
/****************************************************************
* IVFPQ as codec */
/* encode one vector as a PQ code; with by_residual, the residual
 * w.r.t. the coarse centroid `key` is what gets encoded */
void IndexIVFPQ::encode (long key, const float * x, uint8_t * code) const
{
if (by_residual) {
float residual_vec[d];
quantizer->compute_residual (x, residual_vec, key);
pq.compute_code (residual_vec, code);
}
else pq.compute_code (x, code);
}
void IndexIVFPQ::encode_multiple (size_t n, long *keys,
const float * x, uint8_t * xcodes,
bool compute_keys) const
{
if (compute_keys)
quantizer->assign (n, x, keys);
encode_vectors (n, x, keys, xcodes);
}
void IndexIVFPQ::decode_multiple (size_t n, const long *keys,
const uint8_t * xcodes, float * x) const
{
pq.decode (xcodes, x, n);
if (by_residual) {
std::vector<float> centroid (d);
for (size_t i = 0; i < n; i++) {
quantizer->reconstruct (keys[i], centroid.data());
float *xi = x + i * d;
for (size_t j = 0; j < d; j++) {
xi [j] += centroid [j];
}
}
}
}
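/* Codec round-trip, as a sketch: decode_multiple is the approximate
* inverse of encode_multiple, up to the PQ quantization error
* (buffer names below are illustrative):
*
*   std::vector<long> keys (n);
*   std::vector<uint8_t> codes (n * index.code_size);
*   index.encode_multiple (n, keys.data(), x, codes.data(), true);
*   std::vector<float> x_approx (n * index.d);
*   index.decode_multiple (n, keys.data(), codes.data(), x_approx.data());
*/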
/****************************************************************
* add */
void IndexIVFPQ::add_with_ids (idx_t n, const float * x, const long *xids)
{
add_core_o (n, x, xids, nullptr);
}
static float * compute_residuals (
const Index *quantizer,
Index::idx_t n, const float* x,
const Index::idx_t *list_nos)
{
size_t d = quantizer->d;
float *residuals = new float [n * d];
// TODO: parallelize?
for (size_t i = 0; i < n; i++) {
if (list_nos[i] < 0)
memset (residuals + i * d, 0, sizeof(*residuals) * d);
else
quantizer->compute_residual (
x + i * d, residuals + i * d, list_nos[i]);
}
return residuals;
}
void IndexIVFPQ::encode_vectors(idx_t n, const float* x,
const idx_t *list_nos,
uint8_t * codes) const
{
if (by_residual) {
float *to_encode = compute_residuals (quantizer, n, x, list_nos);
ScopeDeleter<float> del (to_encode);
pq.compute_codes (to_encode, codes, n);
} else {
pq.compute_codes (x, codes, n);
}
}
void IndexIVFPQ::add_core_o (idx_t n, const float * x, const long *xids,
float *residuals_2, const long *precomputed_idx)
{
idx_t bs = 32768;
if (n > bs) {
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min(i0 + bs, n);
if (verbose) {
printf("IndexIVFPQ::add_core_o: adding %ld:%ld / %ld\n",
i0, i1, n);
}
add_core_o (i1 - i0, x + i0 * d,
xids ? xids + i0 : nullptr,
residuals_2 ? residuals_2 + i0 * d : nullptr,
precomputed_idx ? precomputed_idx + i0 : nullptr);
}
return;
}
FAISS_THROW_IF_NOT (is_trained);
double t0 = getmillisecs ();
const long * idx;
ScopeDeleter<long> del_idx;
if (precomputed_idx) {
idx = precomputed_idx;
} else {
long * idx0 = new long [n];
del_idx.set (idx0);
quantizer->assign (n, x, idx0);
idx = idx0;
}
double t1 = getmillisecs ();
uint8_t * xcodes = new uint8_t [n * code_size];
ScopeDeleter<uint8_t> del_xcodes (xcodes);
const float *to_encode = nullptr;
ScopeDeleter<float> del_to_encode;
if (by_residual) {
to_encode = compute_residuals (quantizer, n, x, idx);
del_to_encode.set (to_encode);
} else {
to_encode = x;
}
pq.compute_codes (to_encode, xcodes, n);
double t2 = getmillisecs ();
// TODO: parallelize?
size_t n_ignore = 0;
for (size_t i = 0; i < n; i++) {
idx_t key = idx[i];
if (key < 0) {
n_ignore ++;
if (residuals_2) // zero out the entry for this ignored vector
memset (residuals_2 + i * d, 0, sizeof(*residuals_2) * d);
continue;
}
idx_t id = xids ? xids[i] : ntotal + i;
uint8_t *code = xcodes + i * code_size;
size_t offset = invlists->add_entry (key, id, code);
if (residuals_2) {
float *res2 = residuals_2 + i * d;
const float *xi = to_encode + i * d;
pq.decode (code, res2);
for (int j = 0; j < d; j++)
res2[j] = xi[j] - res2[j];
}
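// direct_map stores, for each id, the inverted list and the offset
// within it, packed into one long as key << 32 | offset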
if (maintain_direct_map)
direct_map.push_back (key << 32 | offset);
}
double t3 = getmillisecs ();
if(verbose) {
char comment[100] = {0};
if (n_ignore > 0)
snprintf (comment, 100, "(%ld vectors ignored)", n_ignore);
printf(" add_core times: %.3f %.3f %.3f %s\n",
t1 - t0, t2 - t1, t3 - t2, comment);
}
ntotal += n;
}
void IndexIVFPQ::reconstruct_from_offset (long list_no, long offset,
float* recons) const
{
const uint8_t* code = invlists->get_single_code (list_no, offset);
if (by_residual) {
std::vector<float> centroid(d);
quantizer->reconstruct (list_no, centroid.data());
pq.decode (code, recons);
for (int i = 0; i < d; ++i) {
recons[i] += centroid[i];
}
} else {
pq.decode (code, recons);
}
}
/// 2G by default, accommodates tables up to PQ32 w/ 65536 centroids
size_t IndexIVFPQ::precomputed_table_max_bytes = ((size_t)1) << 31;
/** Precomputed tables for residuals
*
* During IVFPQ search with by_residual, we compute
*
* d = || x - y_C - y_R ||^2
*
* where x is the query vector, y_C the coarse centroid, y_R the
* refined PQ centroid. The expression can be decomposed as:
*
* d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
*     ---------------   ---------------------------   -----------
*          term 1                  term 2                term 3
*
* When using multiprobe, we use the following decomposition:
* - term 1 is the distance to the coarse centroid, that is computed
* during the 1st stage search.
* - term 2 can be precomputed, as it does not involve x. However,
* because of the PQ, it needs nlist * M * ksub storage. This is why
* use_precomputed_table is off by default
* - term 3 is the classical non-residual distance table.
*
* Since y_R is defined by a product quantizer, it is split across
* subvectors and stored separately for each subvector. If the coarse
* quantizer is a MultiIndexQuantizer then the table can be stored
* more compactly.
*
* At search time, the tables for term 2 and term 3 are added up. This
* is faster when the length of the lists is > ksub * M.
*/
void IndexIVFPQ::precompute_table ()
{
if (use_precomputed_table == -1)
return;
if (use_precomputed_table == 0) { // then choose the type of table
if (quantizer->metric_type == METRIC_INNER_PRODUCT) {
if (verbose) {
printf("IndexIVFPQ::precompute_table: precomputed "
"tables not needed for inner product quantizers\n");
}
return;
}
const MultiIndexQuantizer *miq =
dynamic_cast<const MultiIndexQuantizer *> (quantizer);
if (miq && pq.M % miq->pq.M == 0)
use_precomputed_table = 2;
else {
size_t table_size = pq.M * pq.ksub * nlist * sizeof(float);
if (table_size > precomputed_table_max_bytes) {
if (verbose) {
printf(
"IndexIVFPQ::precompute_table: not precomputing table, "
"it would be too big: %ld bytes (max %ld)\n",
table_size, precomputed_table_max_bytes);
}
use_precomputed_table = 0; // stay on the non-precomputed code path
return;
}
use_precomputed_table = 1;
}
} // otherwise assume user has set appropriate flag on input
if (verbose) {
printf ("precomputing IVFPQ tables type %d\n",
use_precomputed_table);
}
// squared norms of the PQ centroids
std::vector<float> r_norms (pq.M * pq.ksub, NAN);
for (int m = 0; m < pq.M; m++)
for (int j = 0; j < pq.ksub; j++)
r_norms [m * pq.ksub + j] =
fvec_norm_L2sqr (pq.get_centroids (m, j), pq.dsub);
if (use_precomputed_table == 1) {
precomputed_table.resize (nlist * pq.M * pq.ksub);
std::vector<float> centroid (d);
for (size_t i = 0; i < nlist; i++) {
quantizer->reconstruct (i, centroid.data());
float *tab = &precomputed_table[i * pq.M * pq.ksub];
pq.compute_inner_prod_table (centroid.data(), tab);
fvec_madd (pq.M * pq.ksub, r_norms.data(), 2.0, tab, tab);
}
} else if (use_precomputed_table == 2) {
const MultiIndexQuantizer *miq =
dynamic_cast<const MultiIndexQuantizer *> (quantizer);
FAISS_THROW_IF_NOT (miq);
const ProductQuantizer &cpq = miq->pq;
FAISS_THROW_IF_NOT (pq.M % cpq.M == 0);
precomputed_table.resize(cpq.ksub * pq.M * pq.ksub);
// reorder PQ centroid table
std::vector<float> centroids (d * cpq.ksub, NAN);
for (int m = 0; m < cpq.M; m++) {
for (size_t i = 0; i < cpq.ksub; i++) {
memcpy (centroids.data() + i * d + m * cpq.dsub,
cpq.get_centroids (m, i),
sizeof (*centroids.data()) * cpq.dsub);
}
}
pq.compute_inner_prod_tables (cpq.ksub, centroids.data (),
precomputed_table.data ());
for (size_t i = 0; i < cpq.ksub; i++) {
float *tab = &precomputed_table[i * pq.M * pq.ksub];
fvec_madd (pq.M * pq.ksub, r_norms.data(), 2.0, tab, tab);
}
}
}
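/* Worked example of the type-1 table size (illustrative numbers):
* nlist = 1024, M = 16, ksub = 256 gives
* 1024 * 16 * 256 * sizeof(float) = 16 MiB, well below the 2 GiB cap. */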
namespace {
using idx_t = Index::idx_t;
static uint64_t get_cycles () {
#ifdef __x86_64__
uint32_t high, low;
asm volatile("rdtsc \n\t"
: "=a" (low),
"=d" (high));
return ((uint64_t)high << 32) | (low);
#else
return 0;
#endif
}
#define TIC t0 = get_cycles()
#define TOC get_cycles () - t0
/** QueryTables manages the various ways of searching an
* IndexIVFPQ. The code contains a lot of branches, depending on:
* - metric_type: are we computing L2 or Inner product similarity?
* - by_residual: do we encode raw vectors or residuals?
* - use_precomputed_table: are x_R|x_C tables precomputed?
* - polysemous_ht: are we filtering with polysemous codes?
*/
struct QueryTables {
/*****************************************************
* General data from the IVFPQ
*****************************************************/
const IndexIVFPQ & ivfpq;
const IVFSearchParameters *params;
// copied from IndexIVFPQ for easier access
int d;
const ProductQuantizer & pq;
MetricType metric_type;
bool by_residual;
int use_precomputed_table;
int polysemous_ht;
// pre-allocated data buffers
float * sim_table, * sim_table_2;
float * residual_vec, *decoded_vec;
// single data buffer
std::vector<float> mem;
// for table pointers
std::vector<const float *> sim_table_ptrs;
explicit QueryTables (const IndexIVFPQ & ivfpq,
const IVFSearchParameters *params):
ivfpq(ivfpq),
d(ivfpq.d),
pq (ivfpq.pq),
metric_type (ivfpq.metric_type),
by_residual (ivfpq.by_residual),
use_precomputed_table (ivfpq.use_precomputed_table)
{
mem.resize (pq.ksub * pq.M * 2 + d * 2);
sim_table = mem.data ();
sim_table_2 = sim_table + pq.ksub * pq.M;
residual_vec = sim_table_2 + pq.ksub * pq.M;
decoded_vec = residual_vec + d;
// for polysemous
polysemous_ht = ivfpq.polysemous_ht;
if (auto ivfpq_params =
dynamic_cast<const IVFPQSearchParameters *>(params)) {
polysemous_ht = ivfpq_params->polysemous_ht;
}
if (polysemous_ht != 0) {
q_code.resize (pq.code_size);
}
init_list_cycles = 0;
sim_table_ptrs.resize (pq.M);
}
/*****************************************************
* What we do when query is known
*****************************************************/
// field specific to query
const float * qi;
// query-specific initialization
void init_query (const float * qi) {
this->qi = qi;
if (metric_type == METRIC_INNER_PRODUCT)
init_query_IP ();
else
init_query_L2 ();
if (!by_residual && polysemous_ht != 0)
pq.compute_code (qi, q_code.data());
}
void init_query_IP () {
// precompute some tables specific to the query qi
pq.compute_inner_prod_table (qi, sim_table);
}
void init_query_L2 () {
if (!by_residual) {
pq.compute_distance_table (qi, sim_table);
} else if (use_precomputed_table) {
pq.compute_inner_prod_table (qi, sim_table_2);
}
}
/*****************************************************
* When inverted list is known: prepare computations
*****************************************************/
// fields specific to list
Index::idx_t key;
float coarse_dis;
std::vector<uint8_t> q_code;
uint64_t init_list_cycles;
/// once we know the query and the centroid, we can prepare the
/// sim_table that will be used for accumulation
/// and dis0, the initial value
float precompute_list_tables () {
float dis0 = 0;
uint64_t t0; TIC;
if (by_residual) {
if (metric_type == METRIC_INNER_PRODUCT)
dis0 = precompute_list_tables_IP ();
else
dis0 = precompute_list_tables_L2 ();
}
init_list_cycles += TOC;
return dis0;
}
float precompute_list_table_pointers () {
float dis0 = 0;
uint64_t t0; TIC;
if (by_residual) {
if (metric_type == METRIC_INNER_PRODUCT)
FAISS_THROW_MSG ("not implemented");
else
dis0 = precompute_list_table_pointers_L2 ();
}
init_list_cycles += TOC;
return dis0;
}
/*****************************************************
* compute tables for inner prod
*****************************************************/
float precompute_list_tables_IP ()
{
// prepare the sim_table that will be used for accumulation
// and dis0, the initial value
ivfpq.quantizer->reconstruct (key, decoded_vec);
// decoded_vec = centroid
float dis0 = fvec_inner_product (qi, decoded_vec, d);
if (polysemous_ht) {
for (int i = 0; i < d; i++) {
residual_vec [i] = qi[i] - decoded_vec[i];
}
pq.compute_code (residual_vec, q_code.data());
}
return dis0;
}
/*****************************************************
* compute tables for L2 distance
*****************************************************/
float precompute_list_tables_L2 ()
{
float dis0 = 0;
if (use_precomputed_table == 0 || use_precomputed_table == -1) {
ivfpq.quantizer->compute_residual (qi, residual_vec, key);
pq.compute_distance_table (residual_vec, sim_table);
if (polysemous_ht != 0) {
pq.compute_code (residual_vec, q_code.data());
}
} else if (use_precomputed_table == 1) {
dis0 = coarse_dis;
fvec_madd (pq.M * pq.ksub,
&ivfpq.precomputed_table [key * pq.ksub * pq.M],
-2.0, sim_table_2,
sim_table);
if (polysemous_ht != 0) {
ivfpq.quantizer->compute_residual (qi, residual_vec, key);
pq.compute_code (residual_vec, q_code.data());
}
} else if (use_precomputed_table == 2) {
dis0 = coarse_dis;
const MultiIndexQuantizer *miq =
dynamic_cast<const MultiIndexQuantizer *> (ivfpq.quantizer);
FAISS_THROW_IF_NOT (miq);
const ProductQuantizer &cpq = miq->pq;
int Mf = pq.M / cpq.M;
const float *qtab = sim_table_2; // query-specific table
float *ltab = sim_table; // (output) list-specific table
long k = key;
for (int cm = 0; cm < cpq.M; cm++) {
// compute PQ index
int ki = k & ((uint64_t(1) << cpq.nbits) - 1);
k >>= cpq.nbits;
// get corresponding table
const float *pc = &ivfpq.precomputed_table
[(ki * pq.M + cm * Mf) * pq.ksub];
if (polysemous_ht == 0) {
// sum up with query-specific table
fvec_madd (Mf * pq.ksub,
pc,
-2.0, qtab,
ltab);
ltab += Mf * pq.ksub;
qtab += Mf * pq.ksub;
} else {
for (int m = cm * Mf; m < (cm + 1) * Mf; m++) {
q_code[m] = fvec_madd_and_argmin
(pq.ksub, pc, -2, qtab, ltab);
pc += pq.ksub;
ltab += pq.ksub;
qtab += pq.ksub;
}
}
}
}
return dis0;
}
float precompute_list_table_pointers_L2 ()
{
float dis0 = 0;
if (use_precomputed_table == 1) {
dis0 = coarse_dis;
const float * s = &ivfpq.precomputed_table [key * pq.ksub * pq.M];
for (int m = 0; m < pq.M; m++) {
sim_table_ptrs [m] = s;
s += pq.ksub;
}
} else if (use_precomputed_table == 2) {
dis0 = coarse_dis;
const MultiIndexQuantizer *miq =
dynamic_cast<const MultiIndexQuantizer *> (ivfpq.quantizer);
FAISS_THROW_IF_NOT (miq);
const ProductQuantizer &cpq = miq->pq;
int Mf = pq.M / cpq.M;
long k = key;
int m0 = 0;
for (int cm = 0; cm < cpq.M; cm++) {
int ki = k & ((uint64_t(1) << cpq.nbits) - 1);
k >>= cpq.nbits;
const float *pc = &ivfpq.precomputed_table
[(ki * pq.M + cm * Mf) * pq.ksub];
for (int m = m0; m < m0 + Mf; m++) {
sim_table_ptrs [m] = pc;
pc += pq.ksub;
}
m0 += Mf;
}
} else {
FAISS_THROW_MSG ("need precomputed tables");
}
if (polysemous_ht) {
FAISS_THROW_MSG ("not implemented");
// Not clear that it makes sense to implement this,
// because it costs M * ksub, which is what we wanted to
// avoid with the table pointers.
}
return dis0;
}
};
template<class C>
struct KnnSearchResults {
idx_t key;
const idx_t *ids;
// heap params
size_t k;
float * heap_sim;
long * heap_ids;
size_t nup;
inline void add (idx_t j, float dis) {
if (C::cmp (heap_sim[0], dis)) {
heap_pop<C> (k, heap_sim, heap_ids);
long id = ids ? ids[j] : (key << 32 | j);
heap_push<C> (k, heap_sim, heap_ids, dis, id);
nup++;
}
}
};
template<class C>
struct RangeSearchResults {
idx_t key;
const idx_t *ids;
// wrapped result structure
float radius;
RangeQueryResult & rres;
inline void add (idx_t j, float dis) {
if (C::cmp (radius, dis)) {
long id = ids ? ids[j] : (key << 32 | j);
rres.add (dis, id);
}
}
};
/*****************************************************
* Scanning the codes.
* The scanning functions call their favorite precompute_*
* function to precompute the tables they need.
*****************************************************/
template <typename IDType, MetricType METRIC_TYPE>
struct IVFPQScannerT: QueryTables {
const uint8_t * list_codes;
const IDType * list_ids;
size_t list_size;
IVFPQScannerT (const IndexIVFPQ & ivfpq, const IVFSearchParameters *params):
QueryTables (ivfpq, params)
{
FAISS_THROW_IF_NOT (pq.nbits == 8);
assert(METRIC_TYPE == metric_type);
}
float dis0;
void init_list (idx_t list_no, float coarse_dis,
int mode) {
this->key = list_no;
this->coarse_dis = coarse_dis;
if (mode == 2) {
dis0 = precompute_list_tables ();
} else if (mode == 1) {
dis0 = precompute_list_table_pointers ();
}
}
/*****************************************************
* Scanning the codes: simple PQ scan.
*****************************************************/
/// version of the scan where we use precomputed tables
template<class SearchResultType>
void scan_list_with_table (size_t ncode, const uint8_t *codes,
SearchResultType & res) const
{
for (size_t j = 0; j < ncode; j++) {
float dis = dis0;
const float *tab = sim_table;
for (size_t m = 0; m < pq.M; m++) {
dis += tab[*codes++];
tab += pq.ksub;
}
res.add(j, dis);
}
}
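/* The loop above is the classical ADC scan: with 8-bit sub-codes, the
* distance of a stored code c[0..M-1] to the query decomposes as
*
*   dis = dis0 + sum_m sim_table[m * ksub + c[m]]
*
* i.e. M table lookups and M additions per database vector. */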
/// tables are not precomputed, but pointers are provided to the
/// relevant x_R|x_C tables
template<class SearchResultType>
void scan_list_with_pointer (size_t ncode, const uint8_t *codes,
SearchResultType & res) const
{
for (size_t j = 0; j < ncode; j++) {
float dis = dis0;
const float *tab = sim_table_2;
for (size_t m = 0; m < pq.M; m++) {
int ci = *codes++;
dis += sim_table_ptrs [m][ci] - 2 * tab [ci];
tab += pq.ksub;
}
res.add (j, dis);
}
}
/// nothing is precomputed: access residuals on-the-fly
template<class SearchResultType>
void scan_on_the_fly_dist (size_t ncode, const uint8_t *codes,
SearchResultType &res) const
{
const float *dvec;
float dis0 = 0;
if (by_residual) {
if (METRIC_TYPE == METRIC_INNER_PRODUCT) {
ivfpq.quantizer->reconstruct (key, residual_vec);
dis0 = fvec_inner_product (residual_vec, qi, d);
} else {
ivfpq.quantizer->compute_residual (qi, residual_vec, key);
}
dvec = residual_vec;
} else {
dvec = qi;
dis0 = 0;
}
for (size_t j = 0; j < ncode; j++) {
pq.decode (codes, decoded_vec);
codes += pq.code_size;
float dis;
if (METRIC_TYPE == METRIC_INNER_PRODUCT) {
dis = dis0 + fvec_inner_product (decoded_vec, qi, d);
} else {
dis = fvec_L2sqr (decoded_vec, dvec, d);
}
res.add (j, dis);
}
}
/*****************************************************
* Scanning codes with polysemous filtering
*****************************************************/
template <class HammingComputer, class SearchResultType>
void scan_list_polysemous_hc (
size_t ncode, const uint8_t *codes,
SearchResultType & res) const
{
int ht = ivfpq.polysemous_ht;
size_t n_hamming_pass = 0;
int code_size = pq.code_size;
HammingComputer hc (q_code.data(), code_size);
for (size_t j = 0; j < ncode; j++) {
const uint8_t *b_code = codes;
int hd = hc.hamming (b_code);
if (hd < ht) {
n_hamming_pass ++;
float dis = dis0;
const float *tab = sim_table;
for (size_t m = 0; m < pq.M; m++) {
dis += tab[*b_code++];
tab += pq.ksub;
}
res.add (j, dis);
}
codes += code_size;
}
#pragma omp critical
{
indexIVFPQ_stats.n_hamming_pass += n_hamming_pass;
}
}
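/* Why this works: polysemous training (optimize_pq_for_hamming) reorders
* the PQ centroid indices so that the Hamming distance between codes
* correlates with the real distance. Codes at Hamming distance >= ht from
* the query code are rejected without the M table lookups of a full ADC
* evaluation; only the survivors are scored as in scan_list_with_table. */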
template<class SearchResultType>
void scan_list_polysemous (
size_t ncode, const uint8_t *codes,
SearchResultType &res) const
{
switch (pq.code_size) {
#define HANDLE_CODE_SIZE(cs) \
case cs: \
scan_list_polysemous_hc \
<HammingComputer ## cs, SearchResultType> \
(ncode, codes, res); \
break
HANDLE_CODE_SIZE(4);
HANDLE_CODE_SIZE(8);
HANDLE_CODE_SIZE(16);
HANDLE_CODE_SIZE(20);
HANDLE_CODE_SIZE(32);
HANDLE_CODE_SIZE(64);
#undef HANDLE_CODE_SIZE
default:
if (pq.code_size % 8 == 0)
scan_list_polysemous_hc
<HammingComputerM8, SearchResultType>
(ncode, codes, res);
else
scan_list_polysemous_hc
<HammingComputerM4, SearchResultType>
(ncode, codes, res);
break;
}
}
};
/* We put as many parameters as possible in the template. Hopefully the
* gain in runtime is worth the code bloat. C is the comparator < or
* >, it is directly related to METRIC_TYPE. precompute_mode is how
* much we precompute (2 = precompute distance tables, 1 = precompute
* pointers to distances, 0 = compute distances one by one).
* Currently only 2 is supported */
template<MetricType METRIC_TYPE, class C, int precompute_mode>
struct IVFPQScanner:
IVFPQScannerT<Index::idx_t, METRIC_TYPE>,
InvertedListScanner
{
bool store_pairs;
IVFPQScanner(const IndexIVFPQ & ivfpq, bool store_pairs):
IVFPQScannerT<Index::idx_t, METRIC_TYPE>(ivfpq, nullptr),
store_pairs(store_pairs)
{
}
void set_query (const float *query) override {
this->init_query (query);
}
void set_list (idx_t list_no, float coarse_dis) override {
this->init_list (list_no, coarse_dis, precompute_mode);
}
float distance_to_code (const uint8_t *code) const override {
assert(precompute_mode == 2);
float dis = this->dis0;
const float *tab = this->sim_table;
for (size_t m = 0; m < this->pq.M; m++) {
dis += tab[*code++];
tab += this->pq.ksub;
}
return dis;
}
size_t scan_codes (size_t ncode,
const uint8_t *codes,
const idx_t *ids,
float *heap_sim, idx_t *heap_ids,
size_t k) const override
{
KnnSearchResults<C> res = {
/* key */ this->key,
/* ids */ this->store_pairs ? nullptr : ids,
/* k */ k,
/* heap_sim */ heap_sim,
/* heap_ids */ heap_ids,
/* nup */ 0
};
if (this->polysemous_ht > 0) {
assert(precompute_mode == 2);
this->scan_list_polysemous (ncode, codes, res);
} else if (precompute_mode == 2) {
this->scan_list_with_table (ncode, codes, res);
} else if (precompute_mode == 1) {
this->scan_list_with_pointer (ncode, codes, res);
} else if (precompute_mode == 0) {
this->scan_on_the_fly_dist (ncode, codes, res);
} else {
FAISS_THROW_MSG("bad precomp mode");
}
return res.nup;
}
void scan_codes_range (size_t ncode,
const uint8_t *codes,
const idx_t *ids,
float radius,
RangeQueryResult & rres) const override
{
RangeSearchResults<C> res = {
/* key */ this->key,
/* ids */ this->store_pairs ? nullptr : ids,
/* radius */ radius,
/* rres */ rres
};
if (this->polysemous_ht > 0) {
assert(precompute_mode == 2);
this->scan_list_polysemous (ncode, codes, res);
} else if (precompute_mode == 2) {
this->scan_list_with_table (ncode, codes, res);
} else if (precompute_mode == 1) {
this->scan_list_with_pointer (ncode, codes, res);
} else if (precompute_mode == 0) {
this->scan_on_the_fly_dist (ncode, codes, res);
} else {
FAISS_THROW_MSG("bad precomp mode");
}
}
};
} // anonymous namespace
InvertedListScanner *
IndexIVFPQ::get_InvertedListScanner (bool store_pairs) const
{
if (metric_type == METRIC_INNER_PRODUCT) {
return new IVFPQScanner<METRIC_INNER_PRODUCT, CMin<float, long>, 2>
(*this, store_pairs);
} else if (metric_type == METRIC_L2) {
return new IVFPQScanner<METRIC_L2, CMax<float, long>, 2>
(*this, store_pairs);
}
return nullptr;
}
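/* Sketch of driving a scanner by hand, which the generic IndexIVF
* search loop normally does (variables are illustrative; for L2 the
* result heap must be initialized as a max-heap beforehand):
*
*   InvertedListScanner *scanner = index.get_InvertedListScanner (false);
*   scanner->set_query (xq);
*   scanner->set_list (list_no, coarse_dis);
*   scanner->scan_codes (list_size, codes, ids, heap_dis, heap_ids, k);
*   delete scanner;
*/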
IndexIVFPQStats indexIVFPQ_stats;
void IndexIVFPQStats::reset () {
memset (this, 0, sizeof (*this));
}
IndexIVFPQ::IndexIVFPQ ()
{
// initialize some runtime values
use_precomputed_table = 0;
scan_table_threshold = 0;
do_polysemous_training = false;
polysemous_ht = 0;
polysemous_training = nullptr;
}
struct CodeCmp {
const uint8_t *tab;
size_t code_size;
bool operator () (int a, int b) const {
return cmp (a, b) > 0;
}
int cmp (int a, int b) const {
return memcmp (tab + a * code_size, tab + b * code_size,
code_size);
}
};
size_t IndexIVFPQ::find_duplicates (idx_t *dup_ids, size_t *lims) const
{
size_t ngroup = 0;
lims[0] = 0;
for (size_t list_no = 0; list_no < nlist; list_no++) {
size_t n = invlists->list_size (list_no);
std::vector<int> ord (n);
for (int i = 0; i < n; i++) ord[i] = i;
InvertedLists::ScopedCodes codes (invlists, list_no);
CodeCmp cs = { codes.get(), code_size };
std::sort (ord.begin(), ord.end(), cs);
InvertedLists::ScopedIds list_ids (invlists, list_no);
int prev = -1; // all elements from prev to i-1 are equal
for (int i = 0; i < n; i++) {
if (prev >= 0 && cs.cmp (ord [prev], ord [i]) == 0) {
// same as previous => remember
if (prev + 1 == i) { // start new group
ngroup++;
lims[ngroup] = lims[ngroup - 1];
dup_ids [lims [ngroup]++] = list_ids [ord [prev]];
}
dup_ids [lims [ngroup]++] = list_ids [ord [i]];
} else { // not same as previous.
prev = i;
}
}
}
return ngroup;
}
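/* Usage sketch: buffer sizes are upper bounds, assuming every stored
* vector could be part of a duplicate group:
*
*   std::vector<idx_t> dup_ids (index.ntotal);
*   std::vector<size_t> lims (index.ntotal + 1);
*   size_t ngroup = index.find_duplicates (dup_ids.data(), lims.data());
*   // group g occupies dup_ids[lims[g] .. lims[g + 1] - 1]
*/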
/*****************************************
* IndexIVFPQR implementation
******************************************/
IndexIVFPQR::IndexIVFPQR (
Index * quantizer, size_t d, size_t nlist,
size_t M, size_t nbits_per_idx,
size_t M_refine, size_t nbits_per_idx_refine):
IndexIVFPQ (quantizer, d, nlist, M, nbits_per_idx),
refine_pq (d, M_refine, nbits_per_idx_refine),
k_factor (4)
{
by_residual = true;
}
IndexIVFPQR::IndexIVFPQR ():
k_factor (1)
{
by_residual = true;
}
void IndexIVFPQR::reset()
{
IndexIVFPQ::reset();
refine_codes.clear();
}
void IndexIVFPQR::train_residual (idx_t n, const float *x)
{
float * residual_2 = new float [n * d];
ScopeDeleter <float> del(residual_2);
train_residual_o (n, x, residual_2);
if (verbose)
printf ("training %zdx%zd 2nd level PQ quantizer on %ld %dD-vectors\n",
refine_pq.M, refine_pq.ksub, n, d);
refine_pq.cp.max_points_per_centroid = 1000;
refine_pq.cp.verbose = verbose;
refine_pq.train (n, residual_2);
}
void IndexIVFPQR::add_with_ids (idx_t n, const float *x, const long *xids) {
add_core (n, x, xids, nullptr);
}
void IndexIVFPQR::add_core (idx_t n, const float *x, const long *xids,
const long *precomputed_idx) {
float * residual_2 = new float [n * d];
ScopeDeleter <float> del(residual_2);
idx_t n0 = ntotal;
add_core_o (n, x, xids, residual_2, precomputed_idx);
refine_codes.resize (ntotal * refine_pq.code_size);
refine_pq.compute_codes (
residual_2, &refine_codes[n0 * refine_pq.code_size], n);
}
void IndexIVFPQR::search_preassigned (idx_t n, const float *x, idx_t k,
const idx_t *idx,
const float *L1_dis,
float *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params
) const
{
uint64_t t0;
TIC;
size_t k_coarse = long(k * k_factor);
idx_t *coarse_labels = new idx_t [k_coarse * n];
ScopeDeleter<idx_t> del1 (coarse_labels);
{ // query with quantizer levels 1 and 2.
float *coarse_distances = new float [k_coarse * n];
ScopeDeleter<float> del(coarse_distances);
IndexIVFPQ::search_preassigned (
n, x, k_coarse,
idx, L1_dis, coarse_distances, coarse_labels,
true, params);
}
indexIVFPQ_stats.search_cycles += TOC;
TIC;
// 3rd level refinement
size_t n_refine = 0;
#pragma omp parallel reduction(+ : n_refine)
{
// tmp buffers
float *residual_1 = new float [2 * d];
ScopeDeleter<float> del (residual_1);
float *residual_2 = residual_1 + d;
#pragma omp for
for (idx_t i = 0; i < n; i++) {
const float *xq = x + i * d;
const long * shortlist = coarse_labels + k_coarse * i;
float * heap_sim = distances + k * i;
long * heap_ids = labels + k * i;
maxheap_heapify (k, heap_sim, heap_ids);
for (int j = 0; j < k_coarse; j++) {
long sl = shortlist[j];
if (sl == -1) continue;
int list_no = sl >> 32;
int ofs = sl & 0xffffffff;
assert (list_no >= 0 && list_no < nlist);
assert (ofs >= 0 && ofs < invlists->list_size (list_no));
// 1st level residual
quantizer->compute_residual (xq, residual_1, list_no);
// 2nd level residual
const uint8_t * l2code =
invlists->get_single_code (list_no, ofs);
pq.decode (l2code, residual_2);
for (int l = 0; l < d; l++)
residual_2[l] = residual_1[l] - residual_2[l];
// 3rd level residual's approximation
idx_t id = invlists->get_single_id (list_no, ofs);
assert (0 <= id && id < ntotal);
refine_pq.decode (&refine_codes [id * refine_pq.code_size],
residual_1);
float dis = fvec_L2sqr (residual_1, residual_2, d);
if (dis < heap_sim[0]) {
maxheap_pop (k, heap_sim, heap_ids);
long id_or_pair = store_pairs ? sl : id;
maxheap_push (k, heap_sim, heap_ids, dis, id_or_pair);
}
n_refine ++;
}
maxheap_reorder (k, heap_sim, heap_ids);
}
}
indexIVFPQ_stats.nrefine += n_refine;
indexIVFPQ_stats.refine_cycles += TOC;
}
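/* The re-ranking above evaluates k_coarse = k * k_factor candidates per
* query with the refine PQ; e.g. with k = 10 and k_factor = 4 (the
* constructor default), 40 shortlist entries are refined and reduced to
* the final top-10. */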
void IndexIVFPQR::reconstruct_from_offset (long list_no, long offset,
float* recons) const
{
IndexIVFPQ::reconstruct_from_offset (list_no, offset, recons);
idx_t id = invlists->get_single_id (list_no, offset);
assert (0 <= id && id < ntotal);
std::vector<float> r3(d);
refine_pq.decode (&refine_codes [id * refine_pq.code_size], r3.data());
for (int i = 0; i < d; ++i) {
recons[i] += r3[i];
}
}
void IndexIVFPQR::merge_from (IndexIVF &other_in, idx_t add_id)
{
IndexIVFPQR *other = dynamic_cast<IndexIVFPQR *> (&other_in);
FAISS_THROW_IF_NOT(other);
IndexIVF::merge_from (other_in, add_id);
refine_codes.insert (refine_codes.end(),
other->refine_codes.begin(),
other->refine_codes.end());
other->refine_codes.clear();
}
long IndexIVFPQR::remove_ids(const IDSelector& /*sel*/) {
FAISS_THROW_MSG("not implemented");
return 0;
}
/*************************************
* Index2Layer implementation
*************************************/
Index2Layer::Index2Layer (Index * quantizer, size_t nlist,
int M,
MetricType metric):
Index (quantizer->d, metric),
q1 (quantizer, nlist),
pq (quantizer->d, M, 8)
{
is_trained = false;
for (int nbyte = 0; nbyte < 7; nbyte++) {
if ((1L << (8 * nbyte)) >= nlist) {
code_size_1 = nbyte;
break;
}
}
code_size_2 = pq.code_size;
code_size = code_size_1 + code_size_2;
}
Index2Layer::Index2Layer ()
{
code_size = code_size_1 = code_size_2 = 0;
}
Index2Layer::~Index2Layer ()
{}
void Index2Layer::train(idx_t n, const float* x)
{
if (verbose) {
printf ("training level-1 quantizer %ld vectors in %dD\n",
n, d);
}
q1.train_q1 (n, x, verbose, metric_type);
if (verbose) {
printf("computing residuals\n");
}
const float * x_in = x;
x = fvecs_maybe_subsample (
d, (size_t*)&n, pq.cp.max_points_per_centroid * pq.ksub,
x, verbose, pq.cp.seed);
ScopeDeleter<float> del_x (x_in == x ? nullptr : x);
std::vector<idx_t> assign(n); // assignment to coarse centroids
q1.quantizer->assign (n, x, assign.data());
std::vector<float> residuals(n * d);
for (idx_t i = 0; i < n; i++) {
q1.quantizer->compute_residual (
x + i * d, residuals.data() + i * d, assign[i]);
}
if (verbose)
printf ("training %zdx%zd product quantizer on %ld vectors in %dD\n",
pq.M, pq.ksub, n, d);
pq.verbose = verbose;
pq.train (n, residuals.data());
is_trained = true;
}
void Index2Layer::add(idx_t n, const float* x)
{
idx_t bs = 32768;
if (n > bs) {
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min(i0 + bs, n);
if (verbose) {
printf("Index2Layer::add: adding %ld:%ld / %ld\n",
i0, i1, n);
}
add (i1 - i0, x + i0 * d);
}
return;
}
std::vector<idx_t> codes1 (n);
q1.quantizer->assign (n, x, codes1.data());
std::vector<float> residuals(n * d);
for (idx_t i = 0; i < n; i++) {
q1.quantizer->compute_residual (
x + i * d, residuals.data() + i * d, codes1[i]);
}
std::vector<uint8_t> codes2 (n * code_size_2);
pq.compute_codes (residuals.data(), codes2.data(), n);
codes.resize ((ntotal + n) * code_size);
uint8_t *wp = &codes[ntotal * code_size];
{
int i = 0x11223344;
const char *ip = (char*)&i;
FAISS_THROW_IF_NOT_MSG (ip[0] == 0x44,
"works only on a little-endian CPU");
}
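// layout of one entry: code_size_1 bytes of little-endian list id,
// followed by the code_size_2-byte PQ code of the residual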
// copy to output table
for (idx_t i = 0; i < n; i++) {
memcpy (wp, &codes1[i], code_size_1);
wp += code_size_1;
memcpy (wp, &codes2[i * code_size_2], code_size_2);
wp += code_size_2;
}
ntotal += n;
}
void Index2Layer::search(
idx_t /*n*/,
const float* /*x*/,
idx_t /*k*/,
float* /*distances*/,
idx_t* /*labels*/) const {
FAISS_THROW_MSG("not implemented");
}
void Index2Layer::reconstruct_n(idx_t i0, idx_t ni, float* recons) const
{
float recons1[d];
FAISS_THROW_IF_NOT (i0 >= 0 && i0 + ni <= ntotal);
const uint8_t *rp = &codes[i0 * code_size];
for (idx_t i = 0; i < ni; i++) {
idx_t key = 0;
memcpy (&key, rp, code_size_1);
q1.quantizer->reconstruct (key, recons1);
rp += code_size_1;
pq.decode (rp, recons);
for (idx_t j = 0; j < d; j++) {
recons[j] += recons1[j];
}
rp += code_size_2;
recons += d;
}
}
void Index2Layer::transfer_to_IVFPQ (IndexIVFPQ & other) const
{
FAISS_THROW_IF_NOT (other.nlist == q1.nlist);
FAISS_THROW_IF_NOT (other.code_size == code_size_2);
FAISS_THROW_IF_NOT (other.ntotal == 0);
const uint8_t *rp = codes.data();
for (idx_t i = 0; i < ntotal; i++) {
idx_t key = 0;
memcpy (&key, rp, code_size_1);
rp += code_size_1;
other.invlists->add_entry (key, i, rp);
rp += code_size_2;
}
other.ntotal = ntotal;
}
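/* Conversion sketch: seed an IndexIVFPQ from a trained Index2Layer
* (assumes both were built with the same quantizer, nlist and M; the
* variable names are illustrative):
*
*   IndexIVFPQ ivfpq (index2l.q1.quantizer, index2l.d,
*                     index2l.q1.nlist, M, 8);
*   ivfpq.pq = index2l.pq;        // share the trained PQ
*   ivfpq.is_trained = true;
*   index2l.transfer_to_IVFPQ (ivfpq);
*/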
void Index2Layer::reconstruct(idx_t key, float* recons) const
{
reconstruct_n (key, 1, recons);
}
void Index2Layer::reset()
{
ntotal = 0;
codes.clear ();
}
} // namespace faiss