sync with FB version 2017-07-18

- implemented ScalarQuantizer (without IVF)
- implemented update for IndexIVFFlat
- implemented L2 normalization preprocessing
2017-07-18 02:51:27 -07:00 · 2017-07-18 02:51:27 -07:00 · f7aedbdfc0
parent 602debae7b
commit f7aedbdfc0
24 changed files with 3961 additions and 1318 deletions
--- a/AutoTune.cpp
+++ b/AutoTune.cpp
@ -22,7 +22,7 @@
 #include "IndexIVF.h"
 #include "IndexIVFPQ.h"
 #include "MetaIndexes.h"
-#include "IndexIVFScalarQuantizer.h"
+#include "IndexScalarQuantizer.h"


 namespace faiss {
@ -623,18 +623,28 @@ void ParameterSpace::explore (Index *index,
 * index_factory
 ***************************************************************/

+namespace {

+struct VTChain { // owns the queued transforms; deletes those still in chain
+    std::vector<VectorTransform *> chain;
+    VTChain () {}
+    VTChain (const VTChain &) = delete;             // a copy would delete
+    VTChain & operator= (const VTChain &) = delete; // every pointer twice
+    ~VTChain () {
+        for (size_t i = 0; i < chain.size(); i++) delete chain[i];
+    }
+};
+}

 Index *index_factory (int d, const char *description_in, MetricType metric)
 {
-    VectorTransform *vt = nullptr;
+    VTChain vts;
    Index *coarse_quantizer = nullptr;
    Index *index = nullptr;
    bool add_idmap = false;
    bool make_IndexRefineFlat = false;

    ScopeDeleter1<Index> del_coarse_quantizer, del_index;
-    ScopeDeleter1<VectorTransform> del_vt;

    char description[strlen(description_in) + 1];
    char *ptr;
@ -656,18 +666,27 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
        Index *index_1 = nullptr;

        // VectorTransforms
-        if (!vt && sscanf (tok, "PCA%d", &d_out) == 1) {
+        if (sscanf (tok, "PCA%d", &d_out) == 1) {
            vt_1 = new PCAMatrix (d, d_out);
            d = d_out;
-        } else if (!vt && sscanf (tok, "PCAR%d", &d_out) == 1) {
+        } else if (sscanf (tok, "PCAR%d", &d_out) == 1) {
            vt_1 = new PCAMatrix (d, d_out, 0, true);
            d = d_out;
-        } else if (!vt && sscanf (tok, "OPQ%d_%d", &opq_M, &d_out) == 2) {
+        } else if (sscanf (tok, "PCAW%d", &d_out) == 1) {
+            vt_1 = new PCAMatrix (d, d_out, -0.5, false);
+            d = d_out;
+        } else if (sscanf (tok, "PCAWR%d", &d_out) == 1) {
+            vt_1 = new PCAMatrix (d, d_out, -0.5, true);
+            d = d_out;
+        } else if (sscanf (tok, "OPQ%d_%d", &opq_M, &d_out) == 2) {
            vt_1 = new OPQMatrix (d, opq_M, d_out);
            d = d_out;
-        } else if (!vt && sscanf (tok, "OPQ%d", &opq_M) == 1) {
+        } else if (sscanf (tok, "OPQ%d", &opq_M) == 1) {
            vt_1 = new OPQMatrix (d, opq_M);
-            // coarse quantizers
+        } else if (stok == "L2norm") {
+            vt_1 = new NormalizationTransform (d, 2.0);
+
+        // coarse quantizers
        } else if (!coarse_quantizer &&
                   sscanf (tok, "IVF%d", &ncentroids) == 1) {
            if (metric == METRIC_L2) {
@ -698,28 +717,25 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                index_1 = index_ivf;
            } else {
                index_1 = new IndexFlat (d, metric);
-                if (add_idmap) {
-                    IndexIDMap *idmap = new IndexIDMap(index_1);
-                    idmap->own_fields = true;
-                    index_1 = idmap;
-                    add_idmap = false;
-                }
            }
        } else if (!index && (stok == "SQ8" || stok == "SQ4")) {
-            FAISS_THROW_IF_NOT_MSG(coarse_quantizer,
-                             "ScalarQuantizer works only with an IVF");
            ScalarQuantizer::QuantizerType qt =
                stok == "SQ8" ? ScalarQuantizer::QT_8bit :
                stok == "SQ4" ? ScalarQuantizer::QT_4bit :
                ScalarQuantizer::QT_4bit;
-            IndexIVFScalarQuantizer *index_ivf = new IndexIVFScalarQuantizer (
-                             coarse_quantizer, d, ncentroids, qt, metric);
-            index_ivf->quantizer_trains_alone =
-                dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
-                != nullptr;
-            del_coarse_quantizer.release ();
-            index_ivf->own_fields = true;
-            index_1 = index_ivf;
+            if (coarse_quantizer) {
+                IndexIVFScalarQuantizer *index_ivf =
+                    new IndexIVFScalarQuantizer (
+                      coarse_quantizer, d, ncentroids, qt, metric);
+                index_ivf->quantizer_trains_alone =
+                    dynamic_cast<MultiIndexQuantizer*>(coarse_quantizer)
+                    != nullptr;
+                del_coarse_quantizer.release ();
+                index_ivf->own_fields = true;
+                index_1 = index_ivf;
+            } else {
+                index_1 = new IndexScalarQuantizer (d, qt, metric);
+            }
        } else if (!index && sscanf (tok, "PQ%d+%d", &M, &M2) == 2) {
            FAISS_THROW_IF_NOT_MSG(coarse_quantizer,
                             "PQ with + works only with an IVF");
@ -750,13 +766,6 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                IndexPQ *index_pq = new IndexPQ (d, M, 8, metric);
                index_pq->do_polysemous_training = true;
                index_1 = index_pq;
-                if (add_idmap) {
-                    IndexIDMap *idmap = new IndexIDMap(index_1);
-                    del_index.set (idmap);
-                    idmap->own_fields = true;
-                    index_1 = idmap;
-                    add_idmap = false;
-                }
            }
        } else if (stok == "RFlat") {
            make_IndexRefineFlat = true;
@ -765,9 +774,16 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                             tok, description_in);
        }

+        if (index_1 && add_idmap) {
+            IndexIDMap *idmap = new IndexIDMap(index_1);
+            del_index.set (idmap);
+            idmap->own_fields = true;
+            index_1 = idmap;
+            add_idmap = false;
+        }
+
        if (vt_1)  {
-            vt = vt_1;
-            del_vt.set (vt);
+            vts.chain.push_back (vt_1);
        }

        if (coarse_quantizer_1) {
@ -793,10 +809,14 @@ Index *index_factory (int d, const char *description_in, MetricType metric)
                "IDMap option not used\n");
    }

-    if (vt) {
-        IndexPreTransform *index_pt = new IndexPreTransform (vt, index);
-        del_vt.release ();
+    if (vts.chain.size() > 0) {
+        IndexPreTransform *index_pt = new IndexPreTransform (index);
        index_pt->own_fields = true;
+        // add from back
+        while (vts.chain.size() > 0) {
+            index_pt->prepend_transform (vts.chain.back());
+            vts.chain.pop_back ();
+        }
        index = index_pt;
    }

--- a/AuxIndexStructures.cpp
+++ b/AuxIndexStructures.cpp
@ -158,6 +158,10 @@ void RangeSearchPartialResult::set_result (bool incremental)
 }


+/***********************************************************************
+ * IDSelectorRange
+ ***********************************************************************/
+
 IDSelectorRange::IDSelectorRange (idx_t imin, idx_t imax):
    imin (imin), imax (imax)
 {
@ -169,6 +173,9 @@ bool IDSelectorRange::is_member (idx_t id) const
 }


+/***********************************************************************
+ * IDSelectorBatch
+ ***********************************************************************/

 IDSelectorBatch::IDSelectorBatch (long n, const idx_t *indices)
 {
--- a/AuxIndexStructures.h
+++ b/AuxIndexStructures.h
@ -15,12 +15,7 @@
 #define FAISS_AUX_INDEX_STRUCTURES_H

 #include <vector>
-
-#if __cplusplus >= 201103L
 #include <unordered_set>
-#endif
-
-#include <set>


 #include "Index.h"
@ -80,11 +75,7 @@ struct IDSelectorRange: IDSelector {
 * hash collisions if lsb's are always the same */
 struct IDSelectorBatch: IDSelector {

-#if __cplusplus >= 201103L
    std::unordered_set<idx_t> set;
-#else
-    std::set<idx_t> set;
-#endif

    typedef unsigned char uint8_t;
    std::vector<uint8_t> bloom; // assumes low bits of id are a good hash value
--- a/FaissException.cpp
+++ b/FaissException.cpp
@ -9,7 +9,6 @@
 // Copyright 2004-present Facebook. All Rights Reserved.

 #include "FaissException.h"
-#include <cstdio>

 namespace faiss {

@ -28,4 +27,9 @@ FaissException::FaissException(const std::string& m,
           funcName, file, line, m.c_str());
 }

+const char*
+FaissException::what() const noexcept {
+  return msg.c_str();  // points into msg: valid only while this exception is alive and msg is unmodified
+}
+
 }
--- a/FaissException.h
+++ b/FaissException.h
@ -27,9 +27,7 @@ class FaissException : public std::exception {
                 int line);

  /// from std::exception
-  const char* what() const noexcept override
-  {  return msg.c_str(); }
-  ~FaissException () noexcept override {}
+  const char* what() const noexcept override;

  std::string msg;
 };
--- a/IndexIVF.cpp
+++ b/IndexIVF.cpp
@ -65,21 +65,28 @@ void IndexIVF::add (idx_t n, const float * x)
    add_with_ids (n, x, nullptr);
 }

-void IndexIVF::make_direct_map ()
+void IndexIVF::make_direct_map (bool new_maintain_direct_map)  // true: build the id -> (list, offset) map; false: drop it
 {
-    if (maintain_direct_map) return;
+    // nothing to do: the map is already in the requested state
+    if (new_maintain_direct_map == maintain_direct_map)
+        return;

-    direct_map.resize (ntotal, -1);
-    for (size_t key = 0; key < nlist; key++) {
-        const std::vector<long> & idlist = ids[key];
+    if (new_maintain_direct_map) {
+        direct_map.resize (ntotal, -1);  // newly created slots start at -1
+        for (size_t key = 0; key < nlist; key++) {
+            const std::vector<long> & idlist = ids[key];

-        for (long ofs = 0; ofs < idlist.size(); ofs++) {
-            direct_map [idlist [ofs]] =
-                key << 32 | ofs;
+            for (long ofs = 0; ofs < idlist.size(); ofs++) {
+                FAISS_THROW_IF_NOT_MSG (
+                       0 <= idlist [ofs] && idlist[ofs] < ntotal,
+                       "direct map supported only for seuquential ids");
+                direct_map [idlist [ofs]] = key << 32 | ofs;  // list number above bit 32, offset in the low 32 bits
+            }
         }
+    } else {
+        direct_map.clear ();  // map disabled: drop every entry
     }
-
-    maintain_direct_map = true;
+    maintain_direct_map = new_maintain_direct_map;  // record the state now in effect
 }


@ -183,7 +190,6 @@ void IndexIVF::merge_from (IndexIVF &other, idx_t add_id)



-
 IndexIVF::~IndexIVF()
 {
    if (own_fields) delete quantizer;
@ -217,6 +223,8 @@ void IndexIVFFlat::add_core (idx_t n, const float * x, const long *xids,

 {
    FAISS_THROW_IF_NOT (is_trained);
+    FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids),
+                            "cannot have direct map and add with ids");
    const long * idx;
    ScopeDeleter<long> del;

@ -477,6 +485,49 @@ void IndexIVFFlat::copy_subset_to (IndexIVFFlat & other, int subset_type,
    }
 }

+/* Overwrite the vectors stored under ids new_ids[0..n) with the n rows of x
+ * (n * d floats). Each id and its destination list are checked before that
+ * vector's lists are modified, so bad input throws instead of corrupting. */
+void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
+{
+    FAISS_THROW_IF_NOT (maintain_direct_map);
+    FAISS_THROW_IF_NOT (is_trained);
+    std::vector<idx_t> assign (n);
+    quantizer->assign (n, x, assign.data());
+    for (int i = 0; i < n; i++) {
+        idx_t id = new_ids[i];
+        FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal,
+                                "id to update out of range");
+        long dm = direct_map[id];   // packed (list << 32 | offset), or -1
+        long new_il = assign[i];    // destination list for the new vector
+        FAISS_THROW_IF_NOT_MSG (dm >= 0, "id to update not in direct map");
+        FAISS_THROW_IF_NOT_MSG (0 <= new_il && new_il < (long)nlist,
+                                "vector could not be assigned to a list");
+        { // remove old one: the list's last entry is moved into the hole
+            long ofs = dm & 0xffffffff;
+            long il = dm >> 32;
+            size_t l = ids[il].size();
+            if (ofs != l - 1) {
+                long id2 = ids[il].back();
+                ids[il][ofs] = id2;
+                direct_map[id2] = (il << 32) | ofs;
+                memcpy (vecs[il].data() + ofs * d,
+                        vecs[il].data() + (l - 1) * d,
+                        d * sizeof(vecs[il][0]));
+            }
+            ids[il].pop_back();
+            vecs[il].resize((l - 1) * d);
+        }
+        { // insert new one at the end of its (possibly different) list
+            size_t l = ids[new_il].size();
+            direct_map[id] = (new_il << 32) | l;
+            ids[new_il].push_back (id);
+            vecs[new_il].resize((l + 1) * d);
+            memcpy (vecs[new_il].data() + l * d, x + (size_t)i * d,
+                    d * sizeof(vecs[new_il][0]));
+        }
+    }
+}


 void IndexIVFFlat::reset()
--- a/IndexIVF.h
+++ b/IndexIVF.h
@ -91,9 +91,12 @@ struct IndexIVF: Index {
    size_t get_list_size (size_t list_no) const
    { return ids[list_no].size(); }

-
-    /// intialize a direct map
-    void make_direct_map ();
+    /** initialize a direct map
+     *
+     * @param new_maintain_direct_map    if true, create a direct map,
+     *                                   else clear it
+     */
+    void make_direct_map (bool new_maintain_direct_map=true);

    /// 1= perfectly balanced, >1: imbalanced
    double imbalance_factor () const;
@ -184,6 +187,16 @@ struct IndexIVFFlat: IndexIVF {
            const long * keys,
            float_maxheap_array_t * res) const;

+    /** Update a subset of vectors.
+     *
+     * The index must have a direct_map
+     *
+     * @param nv     nb of vectors to update
+     * @param idx    vector indices to update, size nv
+     * @param v      vectors of new values, size nv*d
+     */
+    void update_vectors (int nv, idx_t *idx, const float *v);
+
    void reconstruct(idx_t key, float* recons) const override;

    void merge_from_residuals(IndexIVF& other) override;
--- a/IndexIVFScalarQuantizer.cpp
+++ b/IndexIVFScalarQuantizer.cpp
@ -1,895 +0,0 @@
-/**
- * Copyright (c) 2015-present, Facebook, Inc.
- * All rights reserved.
- *
- * This source code is licensed under the CC-by-NC license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include "IndexIVFScalarQuantizer.h"
-
-#include <cstdio>
-#include <algorithm>
-
-#include <omp.h>
-
-#include <immintrin.h>
-
-#include "utils.h"
-
-#include "FaissAssert.h"
-
-namespace faiss {
-
-/*******************************************************************
- * IndexIVFScalarQuantizer implementation
- *
- * The main source of complexity is to support combinations of 4
- * variants without incurring runtime tests or virtual function calls:
- *
- * - 4 / 8 bits per code component
- * - uniform / non-uniform
- * - IP / L2 distance search
- * - scalar / AVX distance computation
- *
- * The appropriate Quantizer object is returned via select_quantizer
- * that hides the template mess.
- ********************************************************************/
-
-#ifdef __AVX__
-#define USE_AVX
-#endif
-
-
-namespace {
-
-typedef Index::idx_t idx_t;
-typedef ScalarQuantizer::QuantizerType QuantizerType;
-typedef ScalarQuantizer::RangeStat RangeStat;
-
-
-/*******************************************************************
- * Codec: converts between values in [0, 1] and an index in a code
- * array. The "i" parameter is the vector component index (not byte
- * index).
- */
-
-struct Codec8bit {
-
-    static void encode_component (float x, uint8_t *code, int i) {
-        code[i] = (int)(255 * x);
-    }
-
-    static float decode_component (const uint8_t *code, int i) {
-        return (code[i] + 0.5f) / 255.0f;
-    }
-
-#ifdef USE_AVX
-    static __m256 decode_8_components (const uint8_t *code, int i) {
-        uint64_t c8 = *(uint64_t*)(code + i);
-        __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8));
-        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32));
-        // __m256i i8 = _mm256_set_m128i(c4lo, c4hi);
-        __m256i i8 = _mm256_castsi128_si256 (c4lo);
-        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
-        __m256 f8 = _mm256_cvtepi32_ps (i8);
-        __m256 half = _mm256_set1_ps (0.5f);
-        f8 += half;
-        __m256 one_255 = _mm256_set1_ps (1.f / 255.f);
-        return f8 * one_255;
-    }
-#endif
-};
-
-
-struct Codec4bit {
-
-    static void encode_component (float x, uint8_t *code, int i) {
-        code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
-    }
-
-    static float decode_component (const uint8_t *code, int i) {
-        return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
-    }
-
-
-#ifdef USE_AVX
-    static __m256 decode_8_components (const uint8_t *code, int i) {
-        uint32_t c4 = *(uint32_t*)(code + (i >> 1));
-        uint32_t mask = 0x0f0f0f0f;
-        uint32_t c4ev = c4 & mask;
-        uint32_t c4od = (c4 >> 4) & mask;
-
-        // the 8 lower bytes of c8 contain the values
-        __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
-                                        _mm_set1_epi32(c4od));
-        __m128i c4lo = _mm_cvtepu8_epi32 (c8);
-        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
-        __m256i i8 = _mm256_castsi128_si256 (c4lo);
-        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
-        __m256 f8 = _mm256_cvtepi32_ps (i8);
-        __m256 half = _mm256_set1_ps (0.5f);
-        f8 += half;
-        __m256 one_255 = _mm256_set1_ps (1.f / 15.f);
-        return f8 * one_255;
-    }
-#endif
-};
-
-
-/*******************************************************************
- * Similarity: gets vector components and computes a similarity wrt. a
- * query vector stored in the object
- */
-
-struct SimilarityL2 {
-    const float *y, *yi;
-    explicit SimilarityL2 (const float * y): y(y) {}
-
-
-    /******* scalar accumulator *******/
-
-    float accu;
-
-    void begin () {
-        accu = 0;
-        yi = y;
-    }
-
-    void add_component (float x) {
-        float tmp = *yi++ - x;
-        accu += tmp * tmp;
-    }
-
-    float result () {
-        return accu;
-    }
-
-#ifdef USE_AVX
-    /******* AVX accumulator *******/
-
-    __m256 accu8;
-
-    void begin_8 () {
-        accu8 = _mm256_setzero_ps();
-        yi = y;
-    }
-
-    void add_8_components (__m256 x) {
-        __m256 yiv = _mm256_loadu_ps (yi);
-        yi += 8;
-        __m256 tmp = yiv - x;
-        accu8 += tmp * tmp;
-    }
-
-    float result_8 () {
-        __m256 sum = _mm256_hadd_ps(accu8, accu8);
-        __m256 sum2 = _mm256_hadd_ps(sum, sum);
-        // now add the 0th and 4th component
-        return
-            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
-            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
-    }
-#endif
-};
-
-struct SimilarityIP {
-    const float *y, *yi;
-    const float accu0;
-
-    /******* scalar accumulator *******/
-
-    float accu;
-
-    SimilarityIP (const float * y, float accu0):
-        y (y), accu0 (accu0) {}
-
-    void begin () {
-        accu = accu0;
-        yi = y;
-    }
-
-    void add_component (float x) {
-        accu +=  *yi++ * x;
-    }
-
-    float result () {
-        return accu;
-    }
-
-#ifdef USE_AVX
-    /******* AVX accumulator *******/
-
-    __m256 accu8;
-
-    void begin_8 () {
-        accu8 = _mm256_setzero_ps();
-        yi = y;
-    }
-
-    void add_8_components (__m256 x) {
-        __m256 yiv = _mm256_loadu_ps (yi);
-        yi += 8;
-        accu8 += yiv * x;
-    }
-
-    float result_8 () {
-        __m256 sum = _mm256_hadd_ps(accu8, accu8);
-        __m256 sum2 = _mm256_hadd_ps(sum, sum);
-        // now add the 0th and 4th component
-        return
-            accu0 +
-            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
-            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
-    }
-#endif
-};
-
-
-/*******************************************************************
- * templatized distance functions
- */
-
-
-template<class Quantizer, class Similarity>
-float compute_distance(const Quantizer & quant, Similarity & sim,
-                       const uint8_t *code)
-{
-    sim.begin();
-    for (size_t i = 0; i < quant.d; i++) {
-        float xi = quant.reconstruct_component (code, i);
-        sim.add_component (xi);
-    }
-    return sim.result();
-}
-
-#ifdef USE_AVX
-template<class Quantizer, class Similarity>
-float compute_distance_8(const Quantizer & quant, Similarity & sim,
-                         const uint8_t *code)
-{
-    sim.begin_8();
-    for (size_t i = 0; i < quant.d; i += 8) {
-        __m256 xi = quant.reconstruct_8_components (code, i);
-        sim.add_8_components (xi);
-    }
-    return sim.result_8();
-}
-#endif
-
-
-/*******************************************************************
- * Quantizer range training
- */
-
-static float sqr (float x) {
-    return x * x;
-}
-
-
-void train_Uniform(RangeStat rs, float rs_arg,
-                   idx_t n, int k, const float *x,
-                   std::vector<float> & trained)
-{
-    trained.resize (2);
-    float & vmin = trained[0];
-    float & vmax = trained[1];
-
-    if (rs == ScalarQuantizer::RS_minmax) {
-        vmin = HUGE_VAL; vmax = -HUGE_VAL;
-        for (size_t i = 0; i < n; i++) {
-            if (x[i] < vmin) vmin = x[i];
-            if (x[i] > vmax) vmax = x[i];
-        }
-        float vexp = (vmax - vmin) * rs_arg;
-        vmin -= vexp;
-        vmax += vexp;
-    } else if (rs == ScalarQuantizer::RS_meanstd) {
-        double sum = 0, sum2 = 0;
-        for (size_t i = 0; i < n; i++) {
-            sum += x[i];
-            sum2 += x[i] * x[i];
-        }
-        float mean = sum / n;
-        float var = sum2 / n - mean * mean;
-        float std = var <= 0 ? 1.0 : sqrt(var);
-
-        vmin = mean - std * rs_arg ;
-        vmax = mean + std * rs_arg ;
-    } else if (rs == ScalarQuantizer::RS_quantiles) {
-        std::vector<float> x_copy(n);
-        memcpy(x_copy.data(), x, n * sizeof(*x));
-        // TODO just do a qucikselect
-        std::sort(x_copy.begin(), x_copy.end());
-        int o = int(rs_arg * n);
-        if (o < 0) o = 0;
-        if (o > n - o) o = n / 2;
-        vmin = x_copy[o];
-        vmax = x_copy[n - 1 - o];
-
-    } else if (rs == ScalarQuantizer::RS_optim) {
-        float a, b;
-        float sx = 0;
-        {
-            vmin = HUGE_VAL, vmax = -HUGE_VAL;
-            for (size_t i = 0; i < n; i++) {
-                if (x[i] < vmin) vmin = x[i];
-                if (x[i] > vmax) vmax = x[i];
-                sx += x[i];
-            }
-            b = vmin;
-            a = (vmax - vmin) / (k - 1);
-        }
-        int verbose = false;
-        int niter = 2000;
-        float last_err = -1;
-        int iter_last_err = 0;
-        for (int it = 0; it < niter; it++) {
-            float sn = 0, sn2 = 0, sxn = 0, err1 = 0;
-
-            for (idx_t i = 0; i < n; i++) {
-                float xi = x[i];
-                float ni = floor ((xi - b) / a + 0.5);
-                if (ni < 0) ni = 0;
-                if (ni >= k) ni = k - 1;
-                err1 += sqr (xi - (ni * a + b));
-                sn  += ni;
-                sn2 += ni * ni;
-                sxn += ni * xi;
-            }
-
-            if (err1 == last_err) {
-                iter_last_err ++;
-                if (iter_last_err == 16) break;
-            } else {
-                last_err = err1;
-                iter_last_err = 0;
-            }
-
-            float det = sqr (sn) - sn2 * n;
-
-            b = (sn * sxn - sn2 * sx) / det;
-            a = (sn * sx - n * sxn) / det;
-            if (verbose) {
-                printf ("it %d, err1=%g            \r", it, err1);
-                fflush(stdout);
-            }
-        }
-        if (verbose) printf("\n");
-
-        vmin = b;
-        vmax = b + a * (k - 1);
-
-    } else {
-        FAISS_THROW_MSG ("Invalid qtype");
-    }
-    vmax -= vmin;
-}
-
-void train_NonUniform(RangeStat rs, float rs_arg,
-                      idx_t n, int d, int k, const float *x,
-                      std::vector<float> & trained)
-{
-    trained.resize (2 * d);
-    float * vmin = trained.data();
-    float * vmax = trained.data() + d;
-    if (rs == ScalarQuantizer::RS_minmax) {
-        memcpy (vmin, x, sizeof(*x) * d);
-        memcpy (vmax, x, sizeof(*x) * d);
-        for (size_t i = 1; i < n; i++) {
-            const float *xi = x + i * d;
-            for (size_t j = 0; j < d; j++) {
-                if (xi[j] < vmin[j]) vmin[j] = xi[j];
-                if (xi[j] > vmax[j]) vmax[j] = xi[j];
-            }
-        }
-        float *vdiff = vmax;
-        for (size_t j = 0; j < d; j++) {
-            float vexp = (vmax[j] - vmin[j]) * rs_arg;
-            vmin[j] -= vexp;
-            vmax[j] += vexp;
-            vdiff [j] = vmax[j] - vmin[j];
-        }
-    } else {
-        // transpose
-        std::vector<float> xt(n * d);
-        for (size_t i = 1; i < n; i++) {
-            const float *xi = x + i * d;
-            for (size_t j = 0; j < d; j++) {
-                xt[j * n + i] = xi[j];
-            }
-        }
-        std::vector<float> trained_d(2);
-#pragma omp parallel for
-        for (size_t j = 0; j < d; j++) {
-            train_Uniform(rs, rs_arg,
-                          n, k, xt.data() + j * n,
-                          trained_d);
-            vmin[j] = trained_d[0];
-            vmax[j] = trained_d[1];
-        }
-    }
-}
-
-
-/*******************************************************************
- * Quantizer: normalizes scalar vector components, then passes them
- * through a codec
- */
-
-
-
-struct Quantizer {
-    virtual void encode_vector(const float *x, uint8_t *code) const = 0;
-    virtual void decode_vector(const uint8_t *code, float *x) const = 0;
-
-    virtual float compute_distance_L2 (SimilarityL2 &sim,
-                                       const uint8_t * codes) const = 0;
-    virtual float compute_distance_IP (SimilarityIP &sim,
-                                       const uint8_t * codes) const = 0;
-
-    virtual ~Quantizer() {}
-};
-
-
-
-
-template<class Codec>
-struct QuantizerUniform: Quantizer {
-    const size_t d;
-    const float vmin, vdiff;
-
-    QuantizerUniform(size_t d, const std::vector<float> &trained):
-        d(d), vmin(trained[0]), vdiff(trained[1]) {
-    }
-
-    void encode_vector(const float* x, uint8_t* code) const override {
-      for (size_t i = 0; i < d; i++) {
-        float xi = (x[i] - vmin) / vdiff;
-        if (xi < 0)
-          xi = 0;
-        if (xi > 1.0)
-          xi = 1.0;
-        Codec::encode_component(xi, code, i);
-      }
-    }
-
-    void decode_vector(const uint8_t* code, float* x) const override {
-      for (size_t i = 0; i < d; i++) {
-        float xi = Codec::decode_component(code, i);
-        x[i] = vmin + xi * vdiff;
-      }
-    }
-
-    float reconstruct_component (const uint8_t * code, int i) const
-    {
-        float xi = Codec::decode_component (code, i);
-        return vmin + xi * vdiff;
-    }
-
-#ifdef USE_AVX
-    __m256 reconstruct_8_components (const uint8_t * code, int i) const
-    {
-        __m256 xi = Codec::decode_8_components (code, i);
-        return _mm256_set1_ps(vmin) + xi * _mm256_set1_ps (vdiff);
-    }
-#endif
-
-    float compute_distance_L2(SimilarityL2& sim, const uint8_t* codes)
-        const override {
-      return compute_distance(*this, sim, codes);
-    }
-
-    float compute_distance_IP(SimilarityIP& sim, const uint8_t* codes)
-        const override {
-      return compute_distance(*this, sim, codes);
-    }
-};
-
-#ifdef USE_AVX
-template<class Codec>
-struct QuantizerUniform8: QuantizerUniform<Codec> {
-
-    QuantizerUniform8 (size_t d, const std::vector<float> &trained):
-        QuantizerUniform<Codec> (d, trained) {}
-
-    float compute_distance_L2(SimilarityL2& sim, const uint8_t* codes)
-        const override {
-      return compute_distance_8(*this, sim, codes);
-    }
-
-    float compute_distance_IP(SimilarityIP& sim, const uint8_t* codes)
-        const override {
-      return compute_distance_8(*this, sim, codes);
-    }
-};
-#endif
-
-
-
-
-
-template<class Codec>
-struct QuantizerNonUniform: Quantizer {
-    const size_t d;
-    const float *vmin, *vdiff;
-
-    QuantizerNonUniform(size_t d, const std::vector<float> &trained):
-        d(d), vmin(trained.data()), vdiff(trained.data() + d) {}
-
-    void encode_vector(const float* x, uint8_t* code) const override {
-      for (size_t i = 0; i < d; i++) {
-        float xi = (x[i] - vmin[i]) / vdiff[i];
-        if (xi < 0)
-          xi = 0;
-        if (xi > 1.0)
-          xi = 1.0;
-        Codec::encode_component(xi, code, i);
-      }
-    }
-
-    void decode_vector(const uint8_t* code, float* x) const override {
-      for (size_t i = 0; i < d; i++) {
-        float xi = Codec::decode_component(code, i);
-        x[i] = vmin[i] + xi * vdiff[i];
-      }
-    }
-
-    float reconstruct_component (const uint8_t * code, int i) const
-    {
-        float xi = Codec::decode_component (code, i);
-        return vmin[i] + xi * vdiff[i];
-    }
-
-#ifdef USE_AVX
-    __m256 reconstruct_8_components (const uint8_t * code, int i) const
-    {
-        __m256 xi = Codec::decode_8_components (code, i);
-        return _mm256_loadu_ps(vmin + i) + xi * _mm256_loadu_ps (vdiff + i);
-    }
-#endif
-
-    float compute_distance_L2(SimilarityL2& sim, const uint8_t* codes)
-        const override {
-      return compute_distance(*this, sim, codes);
-    }
-
-    float compute_distance_IP(SimilarityIP& sim, const uint8_t* codes)
-        const override {
-      return compute_distance(*this, sim, codes);
-    }
-};
-
-#ifdef USE_AVX
-template<class Codec>
-struct QuantizerNonUniform8: QuantizerNonUniform<Codec> {
-
-    QuantizerNonUniform8 (size_t d, const std::vector<float> &trained):
-        QuantizerNonUniform<Codec> (d, trained) {}
-
-    float compute_distance_L2(SimilarityL2& sim, const uint8_t* codes)
-        const override {
-      return compute_distance_8(*this, sim, codes);
-    }
-
-    float compute_distance_IP(SimilarityIP& sim, const uint8_t* codes)
-        const override {
-      return compute_distance_8(*this, sim, codes);
-    }
-};
-#endif
-
-
-
-
-
-Quantizer *select_quantizer (
-       QuantizerType qtype,
-       size_t d, const std::vector<float> & trained)
-{
-#ifdef USE_AVX
-    if (d % 8 == 0) {
-        switch(qtype) {
-        case ScalarQuantizer::QT_8bit:
-            return new QuantizerNonUniform8<Codec8bit>(d, trained);
-        case ScalarQuantizer::QT_4bit:
-            return new QuantizerNonUniform8<Codec4bit>(d, trained);
-        case ScalarQuantizer::QT_8bit_uniform:
-            return new QuantizerUniform8<Codec8bit>(d, trained);
-        case ScalarQuantizer::QT_4bit_uniform:
-            return new QuantizerUniform8<Codec4bit>(d, trained);
-        }
-    } else
-#endif
-    {
-        switch(qtype) {
-        case ScalarQuantizer::QT_8bit:
-            return new QuantizerNonUniform<Codec8bit>(d, trained);
-        case ScalarQuantizer::QT_4bit:
-            return new QuantizerNonUniform<Codec4bit>(d, trained);
-        case ScalarQuantizer::QT_8bit_uniform:
-            return new QuantizerUniform<Codec8bit>(d, trained);
-        case ScalarQuantizer::QT_4bit_uniform:
-            return new QuantizerUniform<Codec4bit>(d, trained);
-        }
-    }
-    FAISS_THROW_MSG ("unknown qtype");
-    return nullptr;
-}
-
-Quantizer *select_quantizer (const ScalarQuantizer &sq)
-{
-    return select_quantizer (sq.qtype, sq.d, sq.trained);
-}
-
-
-} // anonymous namespace
-
-
-
-/*******************************************************************
- * ScalarQuantizer implementation
- ********************************************************************/
-
-ScalarQuantizer::ScalarQuantizer
-          (size_t d, QuantizerType qtype):
-              qtype (qtype), rangestat(RS_minmax), rangestat_arg(0), d (d)
-{
-    switch (qtype) {
-    case QT_8bit: case QT_8bit_uniform:
-        code_size = d;
-        break;
-    case QT_4bit: case QT_4bit_uniform:
-        code_size = (d + 1) / 2;
-        break;
-    }
-
-}
-
-ScalarQuantizer::ScalarQuantizer ():
-    qtype(QT_8bit),
-    rangestat(RS_minmax), rangestat_arg(0), d (0), code_size(0)
-{}
-
-void ScalarQuantizer::train (size_t n, const float *x)
-{
-    int bit_per_dim =
-        qtype == QT_4bit_uniform ? 4 :
-        qtype == QT_4bit ? 4 :
-        qtype == QT_8bit_uniform ? 8 :
-        qtype == QT_8bit ? 8 : -1;
-
-    switch (qtype) {
-    case QT_4bit_uniform: case QT_8bit_uniform:
-        train_Uniform (rangestat, rangestat_arg,
-                       n * d, 1 << bit_per_dim, x, trained);
-        break;
-    case QT_4bit: case QT_8bit:
-        train_NonUniform (rangestat, rangestat_arg,
-                          n, d, 1 << bit_per_dim, x, trained);
-        break;
-    }
-}
-
-void ScalarQuantizer::compute_codes (const float * x,
-                                     uint8_t * codes,
-                                     size_t n) const
-{
-    Quantizer *squant = select_quantizer (*this);
-#pragma omp parallel for
-    for (size_t i = 0; i < n; i++)
-        squant->encode_vector (x + i * d, codes + i * code_size);
-    delete squant;
-}
-
-void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
-{
-    Quantizer *squant = select_quantizer (*this);
-#pragma omp parallel for
-    for (size_t i = 0; i < n; i++)
-        squant->decode_vector (codes + i * code_size, x + i * d);
-    delete squant;
-}
-
-
-
-/*******************************************************************
- * IndexIVFScalarQuantizer implementation
- ********************************************************************/
-
-IndexIVFScalarQuantizer::IndexIVFScalarQuantizer
-          (Index *quantizer, size_t d, size_t nlist,
-           QuantizerType qtype, MetricType metric):
-              IndexIVF (quantizer, d, nlist, metric),
-              sq (d, qtype)
-{
-    code_size = sq.code_size;
-    codes.resize(nlist);
-}
-
-IndexIVFScalarQuantizer::IndexIVFScalarQuantizer ():
-      IndexIVF (), code_size (0)
-{}
-
-void IndexIVFScalarQuantizer::train_residual (idx_t n, const float *x)
-{
-    long * idx = new long [n];
-    ScopeDeleter<long> del (idx);
-    quantizer->assign (n, x, idx);
-    float *residuals = new float [n * d];
-    ScopeDeleter<float> del2 (residuals);
-
-#pragma omp parallel for
-    for (idx_t i = 0; i < n; i++) {
-        quantizer->compute_residual (x + i * d, residuals + i * d, idx[i]);
-    }
-
-    sq.train (n, residuals);
-
-}
-
-
-void IndexIVFScalarQuantizer::add_with_ids
-       (idx_t n, const float * x, const long *xids)
-{
-    FAISS_THROW_IF_NOT (is_trained);
-    long * idx = new long [n];
-    ScopeDeleter<long> del (idx);
-    quantizer->assign (n, x, idx);
-    size_t nadd = 0;
-    Quantizer *squant = select_quantizer (sq);
-    ScopeDeleter1<Quantizer> del2 (squant);
-
-#pragma omp parallel reduction(+: nadd)
-    {
-        std::vector<float> residual (d);
-        int nt = omp_get_num_threads();
-        int rank = omp_get_thread_num();
-
-        for (size_t i = 0; i < n; i++) {
-
-            long list_no = idx [i];
-            if (list_no >= 0 && list_no % nt == rank) {
-                long id = xids ? xids[i] : ntotal + i;
-
-                assert (list_no < nlist);
-
-                ids[list_no].push_back (id);
-                nadd++;
-                quantizer->compute_residual (
-                      x + i * d, residual.data(), list_no);
-
-                size_t cur_size = codes[list_no].size();
-                codes[list_no].resize (cur_size + code_size);
-
-                squant->encode_vector (residual.data(),
-                                       codes[list_no].data() + cur_size);
-            }
-        }
-    }
-    ntotal += nadd;
-}
-
-
-void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
-                            const float *x,
-                            const idx_t *cent_ids, const float *cent_dis,
-                            const Quantizer & quant,
-                            int k, float *simi, idx_t *idxi)
-{
-    int nprobe = index.nprobe;
-    size_t code_size = index.code_size;
-    size_t d = index.d;
-    std::vector<float> decoded(d);
-    minheap_heapify (k, simi, idxi);
-    for (int i = 0; i < nprobe; i++) {
-        idx_t list_no = cent_ids[i];
-        if (list_no < 0) break;
-        float accu0 = cent_dis[i];
-
-        const std::vector<idx_t> & ids = index.ids[list_no];
-        const uint8_t* codes = index.codes[list_no].data();
-
-        SimilarityIP sim(x, accu0);
-
-        for (size_t j = 0; j < ids.size(); j++) {
-
-            float accu = quant.compute_distance_IP(sim, codes);
-
-            if (accu > simi [0]) {
-                minheap_pop (k, simi, idxi);
-                minheap_push (k, simi, idxi, accu, ids[j]);
-            }
-            codes += code_size;
-        }
-
-    }
-    minheap_reorder (k, simi, idxi);
-}
-
-void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
-                            const float *x_in,
-                            const idx_t *cent_ids,
-                            const Index *quantizer,
-                            const Quantizer & quant,
-                            int k, float *simi, idx_t *idxi)
-{
-    int nprobe = index.nprobe;
-    size_t code_size = index.code_size;
-    size_t d = index.d;
-    std::vector<float> decoded(d), x(d);
-    maxheap_heapify (k, simi, idxi);
-    for (int i = 0; i < nprobe; i++) {
-        idx_t list_no = cent_ids[i];
-        if (list_no < 0) break;
-
-        const std::vector<idx_t> & ids = index.ids[list_no];
-        const uint8_t* codes = index.codes[list_no].data();
-
-        // shift of x_in wrt centroid
-        quantizer->compute_residual (x_in, x.data(), list_no);
-
-        SimilarityL2 sim(x.data());
-
-        for (size_t j = 0; j < ids.size(); j++) {
-
-            float dis = quant.compute_distance_L2 (sim, codes);
-
-            if (dis < simi [0]) {
-                maxheap_pop (k, simi, idxi);
-                maxheap_push (k, simi, idxi, dis, ids[j]);
-            }
-            codes += code_size;
-        }
-    }
-    maxheap_reorder (k, simi, idxi);
-}
-
-
-void IndexIVFScalarQuantizer::search (idx_t n, const float *x, idx_t k,
-                                      float *distances, idx_t *labels) const
-{
-    FAISS_THROW_IF_NOT (is_trained);
-    idx_t *idx = new idx_t [n * nprobe];
-    ScopeDeleter<idx_t> del (idx);
-    float *dis = new float [n * nprobe];
-    ScopeDeleter<float> del2 (dis);
-
-    quantizer->search (n, x, nprobe, dis, idx);
-
-    Quantizer *squant = select_quantizer (sq);
-    ScopeDeleter1<Quantizer> del3(squant);
-
-    if (metric_type == METRIC_INNER_PRODUCT) {
-#pragma omp parallel for
-        for (size_t i = 0; i < n; i++) {
-            search_with_probes_ip (*this, x + i * d,
-                                   idx + i * nprobe, dis + i * nprobe, *squant,
-                                   k, distances + i * k, labels + i * k);
-        }
-    } else {
-#pragma omp parallel for
-        for (size_t i = 0; i < n; i++) {
-            search_with_probes_L2 (*this, x + i * d,
-                                   idx + i * nprobe, quantizer, *squant,
-                                   k, distances + i * k, labels + i * k);
-        }
-    }
-
-}
-
-
-void IndexIVFScalarQuantizer::merge_from_residuals (IndexIVF & other_in) {
-    IndexIVFScalarQuantizer &other =
-        dynamic_cast<IndexIVFScalarQuantizer &> (other_in);
-    for (int i = 0; i < nlist; i++) {
-        std::vector<uint8_t> & src = other.codes[i];
-        std::vector<uint8_t> & dest = codes[i];
-        dest.insert (dest.end(), src.begin (), src.end ());
-        src.clear ();
-    }
-
-}
-
-
-}
--- a/IndexIVFScalarQuantizer.h
+++ b/IndexIVFScalarQuantizer.h
@ -1,118 +0,0 @@
-/**
- * Copyright (c) 2015-present, Facebook, Inc.
- * All rights reserved.
- *
- * This source code is licensed under the CC-by-NC license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#ifndef FAISS_INDEX_IVF_SCALAR_QUANTIZER_H
-#define FAISS_INDEX_IVF_SCALAR_QUANTIZER_H
-
-#include <stdint.h>
-
-
-#include <vector>
-
-
-#include "IndexIVF.h"
-
-
-namespace faiss {
-
-/** An IVF implementation where the components of the residuals are
- * encoded with a scalar uniform quantizer. All distance computations
- * are asymmetric, so the encoded vectors are decoded and approximate
- * distances are computed.
- *
- * The uniform quantizer has a range [vmin, vmax]. The range can be
- * the same for all dimensions (uniform) or specific per dimension
- * (default).
- */
-
-
-struct ScalarQuantizer {
-
-    enum QuantizerType {
-        QT_8bit,             ///< 8 bits per component
-        QT_4bit,             ///< 4 bits per component
-        QT_8bit_uniform,     ///< same, shared range for all dimensions
-        QT_4bit_uniform,
-    };
-
-    QuantizerType qtype;
-
-    /** The uniform encoder can estimate the range of representable
-     * values of the unform encoder using different statistics. Here
-     * rs = rangestat_arg */
-
-    // rangestat_arg.
-    enum RangeStat {
-        RS_minmax,           ///< [min - rs*(max-min), max + rs*(max-min)]
-        RS_meanstd,          ///< [mean - std * rs, mean + std * rs]
-        RS_quantiles,        ///< [Q(rs), Q(1-rs)]
-        RS_optim,            ///< alternate optimization of reconstruction error
-    };
-
-    RangeStat rangestat;
-    float rangestat_arg;
-
-    /// dimension of input vectors
-    size_t d;
-
-    /// bytes per vector
-    size_t code_size;
-
-    /// trained values (including the range)
-    std::vector<float> trained;
-
-    ScalarQuantizer (size_t d, QuantizerType qtype);
-    ScalarQuantizer ();
-
-    void train (size_t n, const float *x);
-
-
-    /// same as compute_code for several vectors
-    void compute_codes (const float * x,
-                        uint8_t * codes,
-                        size_t n) const ;
-
-    /// decode a vector from a given code (or n vectors if third argument)
-     void decode (const uint8_t *code, float *x, size_t n) const;
-
-};
-
-
-struct IndexIVFScalarQuantizer:IndexIVF {
-    ScalarQuantizer sq;
-
-    size_t code_size;
-
-    /// inverted list codes.
-    std::vector<std::vector<uint8_t> > codes;
-
-    IndexIVFScalarQuantizer(Index *quantizer, size_t d, size_t nlist,
-                            ScalarQuantizer::QuantizerType qtype,
-                            MetricType metric = METRIC_L2);
-
-    IndexIVFScalarQuantizer();
-
-    void train_residual(idx_t n, const float* x) override;
-
-    void add_with_ids(idx_t n, const float* x, const long* xids) override;
-
-    void search(
-        idx_t n,
-        const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels) const override;
-
-    void merge_from_residuals(IndexIVF& other) override;
-};
-
-
-}
-
-
-#endif
--- a/17
+++ b/17
@ -29,7 +29,7 @@ LIBOBJ=hamming.o  utils.o \
       Clustering.o Heap.o VectorTransform.o index_io.o \
       PolysemousTraining.o MetaIndexes.o Index.o \
       ProductQuantizer.o AutoTune.o AuxIndexStructures.o \
-       IndexIVFScalarQuantizer.o FaissException.o
+       IndexScalarQuantizer.o FaissException.o


 $(LIBNAME).a: $(LIBOBJ)
@ -71,7 +71,7 @@ tests/demo_sift1M: tests/demo_sift1M.cpp $(LIBNAME).a
 HFILES = IndexFlat.h Index.h IndexLSH.h IndexPQ.h IndexIVF.h \
    IndexIVFPQ.h VectorTransform.h index_io.h utils.h \
    PolysemousTraining.h Heap.h MetaIndexes.h AuxIndexStructures.h \
-    Clustering.h hamming.h AutoTune.h IndexIVFScalarQuantizer.h FaissException.h
+    Clustering.h hamming.h AutoTune.h IndexScalarQuantizer.h FaissException.h

 # also silently generates python/swigfaiss.py
 python/swigfaiss_wrap.cxx: swigfaiss.swig $(HFILES)
@ -89,11 +89,12 @@ _swigfaiss.so: python/_swigfaiss.so
 #############################
 # Dependencies

-# for i in *.cpp ; do gcc -I.. -MM $i -msse4; done
+# for i in *.cpp ; do g++ -std=c++11 -I.. -MM $i -msse4; done
+
 AutoTune.o: AutoTune.cpp AutoTune.h Index.h FaissAssert.h \
 FaissException.h utils.h Heap.h IndexFlat.h VectorTransform.h IndexLSH.h \
 IndexPQ.h ProductQuantizer.h Clustering.h PolysemousTraining.h \
- IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexIVFScalarQuantizer.h
+ IndexIVF.h IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
 AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h
 Clustering.o: Clustering.cpp Clustering.h Index.h utils.h Heap.h \
 FaissAssert.h FaissException.h IndexFlat.h
@ -106,7 +107,7 @@ IndexFlat.o: IndexFlat.cpp IndexFlat.h Index.h utils.h Heap.h \
 index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
 IndexFlat.h Index.h VectorTransform.h IndexLSH.h IndexPQ.h \
 ProductQuantizer.h Clustering.h Heap.h PolysemousTraining.h IndexIVF.h \
- IndexIVFPQ.h MetaIndexes.h IndexIVFScalarQuantizer.h
+ IndexIVFPQ.h MetaIndexes.h IndexScalarQuantizer.h
 IndexIVF.o: IndexIVF.cpp IndexIVF.h Index.h Clustering.h Heap.h utils.h \
 hamming.h FaissAssert.h FaissException.h IndexFlat.h \
 AuxIndexStructures.h
@ -114,13 +115,13 @@ IndexIVFPQ.o: IndexIVFPQ.cpp IndexIVFPQ.h IndexIVF.h Index.h Clustering.h \
 Heap.h IndexPQ.h ProductQuantizer.h PolysemousTraining.h utils.h \
 IndexFlat.h hamming.h FaissAssert.h FaissException.h \
 AuxIndexStructures.h
-IndexIVFScalarQuantizer.o: IndexIVFScalarQuantizer.cpp \
- IndexIVFScalarQuantizer.h IndexIVF.h Index.h Clustering.h Heap.h utils.h \
- FaissAssert.h FaissException.h
 IndexLSH.o: IndexLSH.cpp IndexLSH.h Index.h VectorTransform.h utils.h \
 Heap.h hamming.h FaissAssert.h FaissException.h
 IndexPQ.o: IndexPQ.cpp IndexPQ.h Index.h ProductQuantizer.h Clustering.h \
 Heap.h PolysemousTraining.h FaissAssert.h FaissException.h hamming.h
+IndexScalarQuantizer.o: IndexScalarQuantizer.cpp IndexScalarQuantizer.h \
+ IndexIVF.h Index.h Clustering.h Heap.h utils.h FaissAssert.h \
+ FaissException.h
 MetaIndexes.o: MetaIndexes.cpp MetaIndexes.h Index.h FaissAssert.h \
 FaissException.h Heap.h AuxIndexStructures.h
 PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
--- a/MetaIndexes.cpp
+++ b/MetaIndexes.cpp
@ -120,6 +120,48 @@ IndexIDMap::~IndexIDMap ()
    if (own_fields) delete index;
 }

+/*****************************************************
+ * IndexIDMap2 implementation
+ *******************************************************/
+
+IndexIDMap2::IndexIDMap2 (Index *index): IndexIDMap (index)
+{}  // nothing beyond the IndexIDMap constructor; rev_map is left default-constructed
+
+void IndexIDMap2::add_with_ids(idx_t n, const float* x, const long* xids)
+{
+    size_t prev_ntotal = ntotal;  // remember ntotal before delegating to IndexIDMap
+    IndexIDMap::add_with_ids (n, x, xids);
+    for (size_t i = prev_ntotal; i < ntotal; i++) {  // positions between old and new ntotal
+        rev_map [id_map [i]] = i;  // stored id -> position i
+    }
+}
+
+void IndexIDMap2::construct_rev_map ()
+{
+    rev_map.clear ();  // drop every existing entry before rebuilding
+    for (size_t i = 0; i < ntotal; i++) {
+        rev_map [id_map [i]] = i;  // a repeated id ends up mapped to the last position seen
+    }
+}
+
+
+long IndexIDMap2::remove_ids(const IDSelector& sel)
+{
+    // This is quite inefficient: the whole rev_map is rebuilt after every removal call
+    long nremove = IndexIDMap::remove_ids (sel);
+    construct_rev_map ();
+    return nremove;  // value returned by IndexIDMap::remove_ids, passed through unchanged
+}
+
+void IndexIDMap2::reconstruct (idx_t key, float * recons) const
+{
+    try {
+        index->reconstruct (rev_map.at (key), recons);  // at() throws std::out_of_range if key is absent
+    } catch (const std::out_of_range& e) {
+        FAISS_THROW_FMT ("key %ld not found", key);
+    }
+}
+


 /*****************************************************
--- a/MetaIndexes.h
+++ b/MetaIndexes.h
@ -14,6 +14,7 @@


 #include <vector>
+#include <unordered_map>


 #include "Index.h"
@ -54,6 +55,28 @@ struct IndexIDMap : Index {
    IndexIDMap () {own_fields=false; index=nullptr; }
 };

+/** same as IndexIDMap but also provides an efficient reconstruction
+    implementation via a 2-way index */
+struct IndexIDMap2 : IndexIDMap {
+
+    std::unordered_map<idx_t, idx_t> rev_map;  ///< id_map[i] -> i
+
+    explicit IndexIDMap2 (Index *index);
+
+    /// make the rev_map from scratch
+    void construct_rev_map ();
+
+    void add_with_ids(idx_t n, const float* x, const long* xids) override;
+
+    long remove_ids(const IDSelector& sel) override;
+
+    void reconstruct (idx_t key, float * recons) const override;
+
+    ~IndexIDMap2() override {}
+    IndexIDMap2 () {}  // used by read_index, which fills the fields after construction
+};
+
+
 /** Index that concatenates the results from several sub-indexes
 *
 */
--- a/VectorTransform.cpp
+++ b/VectorTransform.cpp
@ -711,6 +711,32 @@ void OPQMatrix::reverse_transform (idx_t n, const float * xt,
    transform_transpose (n, xt, x);
 }

+
+/*********************************************
+ * NormalizationTransform
+ *********************************************/
+
+NormalizationTransform::NormalizationTransform (int d, float norm):
+    VectorTransform (d, d), norm (norm)  // the same d is passed for both base-class dimensions
+{
+}
+
+NormalizationTransform::NormalizationTransform ():
+    VectorTransform (-1, -1), norm (-1)  // placeholders; read_VectorTransform sets norm afterwards
+{
+}
+
+void NormalizationTransform::apply_noalloc
+      (idx_t n, const float* x, float* xt) const
+{
+    if (norm == 2.0) {  // 2.0 is the only value handled; anything else throws below
+        memcpy (xt, x, sizeof (x[0]) * n * d_in);  // copy the n * d_in input floats into xt
+        fvec_renorm_L2 (d_in, n, xt);  // works on the copy in xt; presumably rescales each vector to unit L2 norm
+    } else {
+        FAISS_THROW_MSG ("not implemented");
+    }
+}
+
 /*********************************************
 * IndexPreTransform
 *********************************************/
@ -730,8 +756,6 @@ IndexPreTransform::IndexPreTransform (
 }


-
-
 IndexPreTransform::IndexPreTransform (
        VectorTransform * ltrans,
        Index * index):
@ -766,9 +790,16 @@ IndexPreTransform::~IndexPreTransform ()
 void IndexPreTransform::train (idx_t n, const float *x)
 {
    int last_untrained = 0;
-    for (int i = 0; i < chain.size(); i++)
-        if (!chain[i]->is_trained) last_untrained = i;
-    if (!index->is_trained) last_untrained = chain.size();
+    if (!index->is_trained) {  // index itself untrained: mark the position past the last transform
+        last_untrained = chain.size();
+    } else {  // index already trained: look for the last transform that is not
+        for (int i = chain.size() - 1; i >= 0; i--) {
+            if (!chain[i]->is_trained) {  // scanning backwards, so the first hit is the highest index
+                last_untrained = i;
+                break;
+            }
+        }
+    }
    const float *prev_x = x;
    ScopeDeleter<float> del;

--- a/VectorTransform.h
+++ b/VectorTransform.h
@ -76,7 +76,6 @@ struct VectorTransform {
 */
 struct LinearTransform: VectorTransform {

-
    bool have_bias; ///! whether to use the bias term

    /// Transformation matrix, size d_out * d_in
@ -85,7 +84,6 @@ struct LinearTransform: VectorTransform {
     /// bias vector, size d_out
    std::vector<float> b;

-
    /// both d_in > d_out and d_out < d_in are supported
    explicit LinearTransform (int d_in = 0, int d_out = 0,
                              bool have_bias = false);
@ -204,7 +202,6 @@ struct OPQMatrix: LinearTransform {
 * to compute it with matrix multiplies */
 struct RemapDimensionsTransform: VectorTransform {

-
    /// map from output dimension to input, size d_out
    /// -1 -> set output to 0
    std::vector<int> map;
@ -225,6 +222,18 @@ struct RemapDimensionsTransform: VectorTransform {
 };


+/** per-vector normalization */
+struct NormalizationTransform: VectorTransform {
+    float norm;  ///< norm selector; apply_noalloc only implements the value 2.0
+
+    explicit NormalizationTransform (int d, float norm = 2.0);
+    NormalizationTransform ();  ///< sets placeholder values (-1) for the dimensions and norm
+
+    void apply_noalloc(idx_t n, const float* x, float* xt) const override;
+};
+
+
+
 /** Index that applies a LinearTransform transform on vectors before
 *  handing them over to a sub-index */
 struct IndexPreTransform: Index {
--- a/faiss.py
+++ b/faiss.py
@ -34,8 +34,13 @@ except ImportError as e:
 ##################################################################


-def replace_method(the_class, name, replacement):
-    orig_method = getattr(the_class, name)
+def replace_method(the_class, name, replacement, ignore_missing=False):
+    try:
+        orig_method = getattr(the_class, name)
+    except AttributeError:
+        if ignore_missing:  # caller allows the_class to lack this method
+            return
+        raise  # otherwise propagate the AttributeError unchanged
    if orig_method.__name__ == 'replacement_' + name:
        # replacement was done in parent class
        return
@ -123,12 +128,31 @@ def handle_Index(the_class):
            sel = IDSelectorBatch(x.size, swig_ptr(x))
        return self.remove_ids_c(sel)

+    def replacement_reconstruct(self, key):
+        x = np.empty(self.d, dtype=np.float32)  # uninitialized buffer of self.d float32 values
+        self.reconstruct_c(key, swig_ptr(x))  # buffer is handed to the wrapped call by pointer
+        return x
+
+    def replacement_reconstruct_n(self, n0, ni):
+        x = np.empty((ni, self.d), dtype=np.float32)  # ni rows of self.d float32 values
+        self.reconstruct_n_c(n0, ni, swig_ptr(x))  # buffer is handed to the wrapped call by pointer
+        return x
+
+    def replacement_update_vectors(self, keys, x):
+        n = keys.size
+        assert keys.shape == (n, )  # keys must be one-dimensional
+        assert x.shape == (n, self.d)  # one row of self.d values per key
+        self.update_vectors_c(n, swig_ptr(keys), swig_ptr(x))
+
    replace_method(the_class, 'add', replacement_add)
    replace_method(the_class, 'add_with_ids', replacement_add_with_ids)
    replace_method(the_class, 'train', replacement_train)
    replace_method(the_class, 'search', replacement_search)
    replace_method(the_class, 'remove_ids', replacement_remove_ids)
-
+    replace_method(the_class, 'reconstruct', replacement_reconstruct)
+    replace_method(the_class, 'reconstruct_n', replacement_reconstruct_n)
+    replace_method(the_class, 'update_vectors', replacement_update_vectors,
+                   ignore_missing=True)

 def handle_VectorTransform(the_class):

@ -228,12 +252,13 @@ def vector_float_to_array(v):

 class Kmeans:

-    def __init__(self, d, k, niter=25, verbose=False):
+    def __init__(self, d, k, niter=25, verbose=False, spherical = False):
        self.d = d
        self.k = k
        self.cp = ClusteringParameters()
        self.cp.niter = niter
        self.cp.verbose = verbose
+        self.cp.spherical = spherical
        self.centroids = None

    def train(self, x):
@ -241,7 +266,10 @@ class Kmeans:
        n, d = x.shape
        assert d == self.d
        clus = Clustering(d, self.k, self.cp)
-        self.index = IndexFlatL2(d)
+        if self.cp.spherical:
+            self.index = IndexFlatIP(d)
+        else:
+            self.index = IndexFlatL2(d)
        clus.train(x, self.index)
        centroids = vector_float_to_array(clus.centroids)
        self.centroids = centroids.reshape(self.k, d)
--- a/index_io.cpp
+++ b/index_io.cpp
@ -24,7 +24,7 @@
 #include "IndexIVF.h"
 #include "IndexIVFPQ.h"
 #include "MetaIndexes.h"
-#include "IndexIVFScalarQuantizer.h"
+#include "IndexScalarQuantizer.h"

 /*************************************************************
 * The I/O format is the content of the class. For objects that are
@ -184,6 +184,11 @@ void write_VectorTransform (const VectorTransform *vt, FILE *f) {
        uint32_t h = fourcc ("RmDT");
        WRITE1 (h);
        WRITEVECTOR (rdt->map);
+    } else if (const NormalizationTransform *nt =
+               dynamic_cast<const NormalizationTransform *>(vt)) {
+        uint32_t h = fourcc ("VNrm");
+        WRITE1 (h);
+        WRITE1 (nt->norm);
    } else {
        FAISS_THROW_MSG ("cannot serialize this");
    }
@ -261,6 +266,13 @@ void write_index (const Index *idx, FILE *f) {
        WRITE1 (idxp->search_type);
        WRITE1 (idxp->encode_signs);
        WRITE1 (idxp->polysemous_ht);
+    } else if(const IndexScalarQuantizer * idxs =
+              dynamic_cast<const IndexScalarQuantizer *> (idx)) {
+        uint32_t h = fourcc ("IxSQ");
+        WRITE1 (h);
+        write_index_header (idx, f);
+        write_ScalarQuantizer (&idxs->sq, f);
+        WRITEVECTOR (idxs->codes);
    } else if(const IndexIVFFlat * ivfl =
              dynamic_cast<const IndexIVFFlat *> (idx)) {
        uint32_t h = fourcc ("IvFl");
@ -329,7 +341,10 @@ void write_index (const Index *idx, FILE *f) {
        WRITE1 (idxrf->k_factor);
    } else if(const IndexIDMap * idxmap =
              dynamic_cast<const IndexIDMap *> (idx)) {
-        uint32_t h = fourcc ("IxMp");
+        uint32_t h =
+            dynamic_cast<const IndexIDMap2 *> (idx) ? fourcc ("IxM2") :
+            fourcc ("IxMp");
+        // no need to store additional info for IndexIDMap2
        WRITE1 (h);
        write_index_header (idxmap, f);
        write_index (idxmap->index, f);
@ -400,6 +415,10 @@ VectorTransform* read_VectorTransform (FILE *f) {
        RemapDimensionsTransform *rdt = new RemapDimensionsTransform ();
        READVECTOR (rdt->map);
        vt = rdt;
+    } else if (h == fourcc ("VNrm")) {
+        NormalizationTransform *nt = new NormalizationTransform ();
+        READ1 (nt->norm);  // replaces the -1 placeholder set by the default constructor
+        vt = nt;
    } else {
        FAISS_THROW_MSG("fourcc not recognized");
    }
@ -582,6 +601,13 @@ Index *read_index (FILE * f, bool try_mmap) {
        for (size_t i = 0; i < ivfl->nlist; i++)
            READVECTOR (ivfl->vecs[i]);
        idx = ivfl;
+    } else if (h == fourcc ("IxSQ")) {
+        IndexScalarQuantizer * idxs = new IndexScalarQuantizer ();
+        read_index_header (idxs, f);
+        read_ScalarQuantizer (&idxs->sq, f);
+        READVECTOR (idxs->codes);
+        idxs->code_size = idxs->sq.code_size;  // taken from the quantizer just read, not read as a separate field in this branch
+        idx = idxs;
    } else if(h == fourcc ("IvSQ")) {
        IndexIVFScalarQuantizer * ivsc = new IndexIVFScalarQuantizer();
        read_ivf_header (ivsc, f);
@ -606,8 +632,9 @@ Index *read_index (FILE * f, bool try_mmap) {
        } else {
            READ1 (nt);
        }
-        for (int i = 0; i < nt; i++)
+        for (int i = 0; i < nt; i++) {
            ixpt->chain.push_back (read_VectorTransform (f));
+        }
        ixpt->index = read_index (f);
        idx = ixpt;
    } else if(h == fourcc ("Imiq")) {
@ -625,12 +652,16 @@ Index *read_index (FILE * f, bool try_mmap) {
        delete rf;
        READ1 (idxrf->k_factor);
        idx = idxrf;
-    } else if(h == fourcc ("IxMp")) {
-        IndexIDMap * idxmap = new IndexIDMap ();
+    } else if(h == fourcc ("IxMp") || h == fourcc ("IxM2")) {
+        bool is_map2 = h == fourcc ("IxM2");
+        IndexIDMap * idxmap = is_map2 ? new IndexIDMap2 () : new IndexIDMap ();
        read_index_header (idxmap, f);
        idxmap->index = read_index (f);
        idxmap->own_fields = true;
        READVECTOR (idxmap->id_map);
+        if (is_map2) {  // rev_map is not read from the file: rebuild it from id_map
+            static_cast<IndexIDMap2*>(idxmap)->construct_rev_map ();
+        }
        idx = idxmap;
    } else {
        fprintf (stderr, "Index type 0x%08x not supported\n", h);
@ -698,6 +729,7 @@ IndexIVF * Cloner::clone_IndexIVF (const IndexIVF *ivf)
    TRYCLONE (IndexIVFPQR, ivf)
    TRYCLONE (IndexIVFPQ, ivf)
    TRYCLONE (IndexIVFFlat, ivf)
+    TRYCLONE (IndexIVFScalarQuantizer, ivf)
    {
      FAISS_THROW_MSG("clone not supported for this type of IndexIVF");
    }
@ -711,6 +743,7 @@ Index *Cloner::clone_Index (const Index *index)
    TRYCLONE (IndexFlatL2, index)
    TRYCLONE (IndexFlatIP, index)
    TRYCLONE (IndexFlat, index)
+    TRYCLONE (IndexScalarQuantizer, index)
    TRYCLONE (MultiIndexQuantizer, index)
    if (const IndexIVF * ivf = dynamic_cast<const IndexIVF*>(index)) {
        IndexIVF *res = clone_IndexIVF (ivf);
--- a/python/swigfaiss.py
+++ b/python/swigfaiss.py
@ -1094,6 +1094,27 @@ class RemapDimensionsTransform(VectorTransform):
 RemapDimensionsTransform_swigregister = _swigfaiss.RemapDimensionsTransform_swigregister
 RemapDimensionsTransform_swigregister(RemapDimensionsTransform)

+class NormalizationTransform(VectorTransform):
+    __swig_setmethods__ = {}
+    for _s in [VectorTransform]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
+    __setattr__ = lambda self, name, value: _swig_setattr(self, NormalizationTransform, name, value)
+    __swig_getmethods__ = {}
+    for _s in [VectorTransform]: __swig_getmethods__.update(getattr(_s,'__swig_getmethods__',{}))
+    __getattr__ = lambda self, name: _swig_getattr(self, NormalizationTransform, name)
+    __repr__ = _swig_repr
+    __swig_setmethods__["norm"] = _swigfaiss.NormalizationTransform_norm_set
+    __swig_getmethods__["norm"] = _swigfaiss.NormalizationTransform_norm_get
+    if _newclass:norm = _swig_property(_swigfaiss.NormalizationTransform_norm_get, _swigfaiss.NormalizationTransform_norm_set)
+    def __init__(self, *args): 
+        this = _swigfaiss.new_NormalizationTransform(*args)
+        try: self.this.append(this)
+        except: self.this = this
+    def apply_noalloc(self, *args): return _swigfaiss.NormalizationTransform_apply_noalloc(self, *args)
+    __swig_destroy__ = _swigfaiss.delete_NormalizationTransform
+    __del__ = lambda self : None;
+NormalizationTransform_swigregister = _swigfaiss.NormalizationTransform_swigregister
+NormalizationTransform_swigregister(NormalizationTransform)
+
 class IndexPreTransform(Index):
    __swig_setmethods__ = {}
    for _s in [Index]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
@ -1635,7 +1656,7 @@ class IndexIVF(Index):
    __swig_destroy__ = _swigfaiss.delete_IndexIVF
    __del__ = lambda self : None;
    def get_list_size(self, *args): return _swigfaiss.IndexIVF_get_list_size(self, *args)
-    def make_direct_map(self): return _swigfaiss.IndexIVF_make_direct_map(self)
+    def make_direct_map(self, new_maintain_direct_map=True): return _swigfaiss.IndexIVF_make_direct_map(self, new_maintain_direct_map)
    def imbalance_factor(self): return _swigfaiss.IndexIVF_imbalance_factor(self)
    def print_stats(self): return _swigfaiss.IndexIVF_print_stats(self)
 IndexIVF_swigregister = _swigfaiss.IndexIVF_swigregister
@ -1690,6 +1711,7 @@ class IndexIVFFlat(IndexIVF):
    def remove_ids(self, *args): return _swigfaiss.IndexIVFFlat_remove_ids(self, *args)
    def search_knn_inner_product(self, *args): return _swigfaiss.IndexIVFFlat_search_knn_inner_product(self, *args)
    def search_knn_L2sqr(self, *args): return _swigfaiss.IndexIVFFlat_search_knn_L2sqr(self, *args)
+    def update_vectors(self, *args): return _swigfaiss.IndexIVFFlat_update_vectors(self, *args)
    def reconstruct(self, *args): return _swigfaiss.IndexIVFFlat_reconstruct(self, *args)
    def merge_from_residuals(self, *args): return _swigfaiss.IndexIVFFlat_merge_from_residuals(self, *args)
    def __init__(self, *args): 
@ -1770,6 +1792,38 @@ class ScalarQuantizer(_object):
 ScalarQuantizer_swigregister = _swigfaiss.ScalarQuantizer_swigregister
 ScalarQuantizer_swigregister(ScalarQuantizer)

+class IndexScalarQuantizer(Index):
+    __swig_setmethods__ = {}
+    for _s in [Index]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
+    __setattr__ = lambda self, name, value: _swig_setattr(self, IndexScalarQuantizer, name, value)
+    __swig_getmethods__ = {}
+    for _s in [Index]: __swig_getmethods__.update(getattr(_s,'__swig_getmethods__',{}))
+    __getattr__ = lambda self, name: _swig_getattr(self, IndexScalarQuantizer, name)
+    __repr__ = _swig_repr
+    __swig_setmethods__["sq"] = _swigfaiss.IndexScalarQuantizer_sq_set
+    __swig_getmethods__["sq"] = _swigfaiss.IndexScalarQuantizer_sq_get
+    if _newclass:sq = _swig_property(_swigfaiss.IndexScalarQuantizer_sq_get, _swigfaiss.IndexScalarQuantizer_sq_set)
+    __swig_setmethods__["codes"] = _swigfaiss.IndexScalarQuantizer_codes_set
+    __swig_getmethods__["codes"] = _swigfaiss.IndexScalarQuantizer_codes_get
+    if _newclass:codes = _swig_property(_swigfaiss.IndexScalarQuantizer_codes_get, _swigfaiss.IndexScalarQuantizer_codes_set)
+    __swig_setmethods__["code_size"] = _swigfaiss.IndexScalarQuantizer_code_size_set
+    __swig_getmethods__["code_size"] = _swigfaiss.IndexScalarQuantizer_code_size_get
+    if _newclass:code_size = _swig_property(_swigfaiss.IndexScalarQuantizer_code_size_get, _swigfaiss.IndexScalarQuantizer_code_size_set)
+    def __init__(self, *args): 
+        this = _swigfaiss.new_IndexScalarQuantizer(*args)
+        try: self.this.append(this)
+        except: self.this = this
+    def train(self, *args): return _swigfaiss.IndexScalarQuantizer_train(self, *args)
+    def add(self, *args): return _swigfaiss.IndexScalarQuantizer_add(self, *args)
+    def search(self, *args): return _swigfaiss.IndexScalarQuantizer_search(self, *args)
+    def reset(self): return _swigfaiss.IndexScalarQuantizer_reset(self)
+    def reconstruct_n(self, *args): return _swigfaiss.IndexScalarQuantizer_reconstruct_n(self, *args)
+    def reconstruct(self, *args): return _swigfaiss.IndexScalarQuantizer_reconstruct(self, *args)
+    __swig_destroy__ = _swigfaiss.delete_IndexScalarQuantizer
+    __del__ = lambda self : None;
+IndexScalarQuantizer_swigregister = _swigfaiss.IndexScalarQuantizer_swigregister
+IndexScalarQuantizer_swigregister(IndexScalarQuantizer)
+
 class IndexIVFScalarQuantizer(IndexIVF):
    __swig_setmethods__ = {}
    for _s in [IndexIVF]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
@ -2024,6 +2078,30 @@ class IndexIDMap(Index):
 IndexIDMap_swigregister = _swigfaiss.IndexIDMap_swigregister
 IndexIDMap_swigregister(IndexIDMap)

+# SWIG-generated proxy class for IndexIDMap2, declared as a subclass of the
+# IndexIDMap proxy. Every method below forwards to the matching
+# IndexIDMap2_* wrapper in the _swigfaiss extension module.
+class IndexIDMap2(IndexIDMap):
+    # attribute setter table, seeded with the entries of the base proxy class
+    __swig_setmethods__ = {}
+    for _s in [IndexIDMap]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
+    __setattr__ = lambda self, name, value: _swig_setattr(self, IndexIDMap2, name, value)
+    # attribute getter table, seeded the same way
+    __swig_getmethods__ = {}
+    for _s in [IndexIDMap]: __swig_getmethods__.update(getattr(_s,'__swig_getmethods__',{}))
+    __getattr__ = lambda self, name: _swig_getattr(self, IndexIDMap2, name)
+    __repr__ = _swig_repr
+    # "rev_map" attribute: read/written through the generated rev_map_get /
+    # rev_map_set wrappers (also exposed as a property when _newclass is set)
+    __swig_setmethods__["rev_map"] = _swigfaiss.IndexIDMap2_rev_map_set
+    __swig_getmethods__["rev_map"] = _swigfaiss.IndexIDMap2_rev_map_get
+    if _newclass:rev_map = _swig_property(_swigfaiss.IndexIDMap2_rev_map_get, _swigfaiss.IndexIDMap2_rev_map_set)
+    def construct_rev_map(self): return _swigfaiss.IndexIDMap2_construct_rev_map(self)
+    def add_with_ids(self, *args): return _swigfaiss.IndexIDMap2_add_with_ids(self, *args)
+    def remove_ids(self, *args): return _swigfaiss.IndexIDMap2_remove_ids(self, *args)
+    def reconstruct(self, *args): return _swigfaiss.IndexIDMap2_reconstruct(self, *args)
+    # the delete wrapper is registered as __swig_destroy__; __del__ itself is a no-op
+    __swig_destroy__ = _swigfaiss.delete_IndexIDMap2
+    __del__ = lambda self : None;
+    def __init__(self, *args): 
+        this = _swigfaiss.new_IndexIDMap2(*args)
+        # append to an already existing self.this, otherwise store the new handle
+        try: self.this.append(this)
+        except: self.this = this
+IndexIDMap2_swigregister = _swigfaiss.IndexIDMap2_swigregister
+IndexIDMap2_swigregister(IndexIDMap2)
+
 class IndexShards(Index):
    __swig_setmethods__ = {}
    for _s in [Index]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
--- a/python/swigfaiss_gpu.py
+++ b/python/swigfaiss_gpu.py
@ -1163,6 +1163,27 @@ class RemapDimensionsTransform(VectorTransform):
 RemapDimensionsTransform_swigregister = _swigfaiss_gpu.RemapDimensionsTransform_swigregister
 RemapDimensionsTransform_swigregister(RemapDimensionsTransform)

+# SWIG-generated proxy class for NormalizationTransform, declared as a
+# subclass of the VectorTransform proxy. Calls are forwarded to the
+# NormalizationTransform_* wrappers in the _swigfaiss_gpu extension module.
+class NormalizationTransform(VectorTransform):
+    # attribute setter table, seeded with the entries of the base proxy class
+    __swig_setmethods__ = {}
+    for _s in [VectorTransform]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
+    __setattr__ = lambda self, name, value: _swig_setattr(self, NormalizationTransform, name, value)
+    # attribute getter table, seeded the same way
+    __swig_getmethods__ = {}
+    for _s in [VectorTransform]: __swig_getmethods__.update(getattr(_s,'__swig_getmethods__',{}))
+    __getattr__ = lambda self, name: _swig_getattr(self, NormalizationTransform, name)
+    __repr__ = _swig_repr
+    # "norm" attribute: read/written through the generated norm_get / norm_set
+    # wrappers (also exposed as a property when _newclass is set)
+    __swig_setmethods__["norm"] = _swigfaiss_gpu.NormalizationTransform_norm_set
+    __swig_getmethods__["norm"] = _swigfaiss_gpu.NormalizationTransform_norm_get
+    if _newclass:norm = _swig_property(_swigfaiss_gpu.NormalizationTransform_norm_get, _swigfaiss_gpu.NormalizationTransform_norm_set)
+    def __init__(self, *args): 
+        this = _swigfaiss_gpu.new_NormalizationTransform(*args)
+        # append to an already existing self.this, otherwise store the new handle
+        try: self.this.append(this)
+        except: self.this = this
+    def apply_noalloc(self, *args): return _swigfaiss_gpu.NormalizationTransform_apply_noalloc(self, *args)
+    # the delete wrapper is registered as __swig_destroy__; __del__ itself is a no-op
+    __swig_destroy__ = _swigfaiss_gpu.delete_NormalizationTransform
+    __del__ = lambda self : None;
+NormalizationTransform_swigregister = _swigfaiss_gpu.NormalizationTransform_swigregister
+NormalizationTransform_swigregister(NormalizationTransform)
+
 class IndexPreTransform(Index):
    __swig_setmethods__ = {}
    for _s in [Index]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
@ -1704,7 +1725,7 @@ class IndexIVF(Index):
    __swig_destroy__ = _swigfaiss_gpu.delete_IndexIVF
    __del__ = lambda self : None;
    def get_list_size(self, *args): return _swigfaiss_gpu.IndexIVF_get_list_size(self, *args)
-    def make_direct_map(self): return _swigfaiss_gpu.IndexIVF_make_direct_map(self)
+    def make_direct_map(self, new_maintain_direct_map=True): return _swigfaiss_gpu.IndexIVF_make_direct_map(self, new_maintain_direct_map)
    def imbalance_factor(self): return _swigfaiss_gpu.IndexIVF_imbalance_factor(self)
    def print_stats(self): return _swigfaiss_gpu.IndexIVF_print_stats(self)
 IndexIVF_swigregister = _swigfaiss_gpu.IndexIVF_swigregister
@ -1759,6 +1780,7 @@ class IndexIVFFlat(IndexIVF):
    def remove_ids(self, *args): return _swigfaiss_gpu.IndexIVFFlat_remove_ids(self, *args)
    def search_knn_inner_product(self, *args): return _swigfaiss_gpu.IndexIVFFlat_search_knn_inner_product(self, *args)
    def search_knn_L2sqr(self, *args): return _swigfaiss_gpu.IndexIVFFlat_search_knn_L2sqr(self, *args)
+    def update_vectors(self, *args): return _swigfaiss_gpu.IndexIVFFlat_update_vectors(self, *args)
    def reconstruct(self, *args): return _swigfaiss_gpu.IndexIVFFlat_reconstruct(self, *args)
    def merge_from_residuals(self, *args): return _swigfaiss_gpu.IndexIVFFlat_merge_from_residuals(self, *args)
    def __init__(self, *args): 
@ -1839,6 +1861,38 @@ class ScalarQuantizer(_object):
 ScalarQuantizer_swigregister = _swigfaiss_gpu.ScalarQuantizer_swigregister
 ScalarQuantizer_swigregister(ScalarQuantizer)

+# SWIG-generated proxy class for IndexScalarQuantizer, declared as a subclass
+# of the Index proxy. Calls are forwarded to the IndexScalarQuantizer_*
+# wrappers in the _swigfaiss_gpu extension module.
+class IndexScalarQuantizer(Index):
+    # attribute setter table, seeded with the entries of the base proxy class
+    __swig_setmethods__ = {}
+    for _s in [Index]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
+    __setattr__ = lambda self, name, value: _swig_setattr(self, IndexScalarQuantizer, name, value)
+    # attribute getter table, seeded the same way
+    __swig_getmethods__ = {}
+    for _s in [Index]: __swig_getmethods__.update(getattr(_s,'__swig_getmethods__',{}))
+    __getattr__ = lambda self, name: _swig_getattr(self, IndexScalarQuantizer, name)
+    __repr__ = _swig_repr
+    # exposed attributes: "sq", "codes" and "code_size", each backed by a
+    # generated *_get / *_set wrapper pair (and a property when _newclass is set)
+    __swig_setmethods__["sq"] = _swigfaiss_gpu.IndexScalarQuantizer_sq_set
+    __swig_getmethods__["sq"] = _swigfaiss_gpu.IndexScalarQuantizer_sq_get
+    if _newclass:sq = _swig_property(_swigfaiss_gpu.IndexScalarQuantizer_sq_get, _swigfaiss_gpu.IndexScalarQuantizer_sq_set)
+    __swig_setmethods__["codes"] = _swigfaiss_gpu.IndexScalarQuantizer_codes_set
+    __swig_getmethods__["codes"] = _swigfaiss_gpu.IndexScalarQuantizer_codes_get
+    if _newclass:codes = _swig_property(_swigfaiss_gpu.IndexScalarQuantizer_codes_get, _swigfaiss_gpu.IndexScalarQuantizer_codes_set)
+    __swig_setmethods__["code_size"] = _swigfaiss_gpu.IndexScalarQuantizer_code_size_set
+    __swig_getmethods__["code_size"] = _swigfaiss_gpu.IndexScalarQuantizer_code_size_get
+    if _newclass:code_size = _swig_property(_swigfaiss_gpu.IndexScalarQuantizer_code_size_get, _swigfaiss_gpu.IndexScalarQuantizer_code_size_set)
+    def __init__(self, *args): 
+        this = _swigfaiss_gpu.new_IndexScalarQuantizer(*args)
+        # append to an already existing self.this, otherwise store the new handle
+        try: self.this.append(this)
+        except: self.this = this
+    # thin forwarding methods, one per wrapped member function
+    def train(self, *args): return _swigfaiss_gpu.IndexScalarQuantizer_train(self, *args)
+    def add(self, *args): return _swigfaiss_gpu.IndexScalarQuantizer_add(self, *args)
+    def search(self, *args): return _swigfaiss_gpu.IndexScalarQuantizer_search(self, *args)
+    def reset(self): return _swigfaiss_gpu.IndexScalarQuantizer_reset(self)
+    def reconstruct_n(self, *args): return _swigfaiss_gpu.IndexScalarQuantizer_reconstruct_n(self, *args)
+    def reconstruct(self, *args): return _swigfaiss_gpu.IndexScalarQuantizer_reconstruct(self, *args)
+    # the delete wrapper is registered as __swig_destroy__; __del__ itself is a no-op
+    __swig_destroy__ = _swigfaiss_gpu.delete_IndexScalarQuantizer
+    __del__ = lambda self : None;
+IndexScalarQuantizer_swigregister = _swigfaiss_gpu.IndexScalarQuantizer_swigregister
+IndexScalarQuantizer_swigregister(IndexScalarQuantizer)
+
 class IndexIVFScalarQuantizer(IndexIVF):
    __swig_setmethods__ = {}
    for _s in [IndexIVF]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
@ -2093,6 +2147,30 @@ class IndexIDMap(Index):
 IndexIDMap_swigregister = _swigfaiss_gpu.IndexIDMap_swigregister
 IndexIDMap_swigregister(IndexIDMap)

+# SWIG-generated proxy class for IndexIDMap2, declared as a subclass of the
+# IndexIDMap proxy. Every method below forwards to the matching
+# IndexIDMap2_* wrapper in the _swigfaiss_gpu extension module.
+class IndexIDMap2(IndexIDMap):
+    # attribute setter table, seeded with the entries of the base proxy class
+    __swig_setmethods__ = {}
+    for _s in [IndexIDMap]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
+    __setattr__ = lambda self, name, value: _swig_setattr(self, IndexIDMap2, name, value)
+    # attribute getter table, seeded the same way
+    __swig_getmethods__ = {}
+    for _s in [IndexIDMap]: __swig_getmethods__.update(getattr(_s,'__swig_getmethods__',{}))
+    __getattr__ = lambda self, name: _swig_getattr(self, IndexIDMap2, name)
+    __repr__ = _swig_repr
+    # "rev_map" attribute: read/written through the generated rev_map_get /
+    # rev_map_set wrappers (also exposed as a property when _newclass is set)
+    __swig_setmethods__["rev_map"] = _swigfaiss_gpu.IndexIDMap2_rev_map_set
+    __swig_getmethods__["rev_map"] = _swigfaiss_gpu.IndexIDMap2_rev_map_get
+    if _newclass:rev_map = _swig_property(_swigfaiss_gpu.IndexIDMap2_rev_map_get, _swigfaiss_gpu.IndexIDMap2_rev_map_set)
+    def construct_rev_map(self): return _swigfaiss_gpu.IndexIDMap2_construct_rev_map(self)
+    def add_with_ids(self, *args): return _swigfaiss_gpu.IndexIDMap2_add_with_ids(self, *args)
+    def remove_ids(self, *args): return _swigfaiss_gpu.IndexIDMap2_remove_ids(self, *args)
+    def reconstruct(self, *args): return _swigfaiss_gpu.IndexIDMap2_reconstruct(self, *args)
+    # the delete wrapper is registered as __swig_destroy__; __del__ itself is a no-op
+    __swig_destroy__ = _swigfaiss_gpu.delete_IndexIDMap2
+    __del__ = lambda self : None;
+    def __init__(self, *args): 
+        this = _swigfaiss_gpu.new_IndexIDMap2(*args)
+        # append to an already existing self.this, otherwise store the new handle
+        try: self.this.append(this)
+        except: self.this = this
+IndexIDMap2_swigregister = _swigfaiss_gpu.IndexIDMap2_swigregister
+IndexIDMap2_swigregister(IndexIDMap2)
+
 class IndexShards(Index):
    __swig_setmethods__ = {}
    for _s in [Index]: __swig_setmethods__.update(getattr(_s,'__swig_setmethods__',{}))
--- a/python/swigfaiss_gpu_wrap.cxx
+++ b/python/swigfaiss_gpu_wrap.cxx
--- a/python/swigfaiss_wrap.cxx
+++ b/python/swigfaiss_wrap.cxx
--- a/swigfaiss.swig
+++ b/swigfaiss.swig
@ -74,7 +74,7 @@ extern "C" {
 #include "IndexPQ.h"
 #include "IndexIVF.h"
 #include "IndexIVFPQ.h"
-#include "IndexIVFScalarQuantizer.h"
+#include "IndexScalarQuantizer.h"

 #include "MetaIndexes.h"
 #include "FaissAssert.h"
@ -240,7 +240,7 @@ int get_num_gpus()
 %include "PolysemousTraining.h"
 %include "IndexPQ.h"
 %include "IndexIVF.h"
-%include "IndexIVFScalarQuantizer.h"
+%include "IndexScalarQuantizer.h"

 %ignore faiss::IndexIVFPQ::alloc_type;
 %include "IndexIVFPQ.h"
@ -426,6 +426,7 @@ struct AsyncIndexSearchC {
    DOWNCAST ( IndexIVF )
    DOWNCAST ( IndexFlat )
    DOWNCAST ( IndexPQ )
+    DOWNCAST ( IndexScalarQuantizer )
    DOWNCAST ( IndexLSH )
    DOWNCAST ( IndexPreTransform )
    DOWNCAST ( MultiIndexQuantizer )
@ -457,6 +458,7 @@ struct AsyncIndexSearchC {
    DOWNCAST (PCAMatrix)
    DOWNCAST (RandomRotationMatrix)
    DOWNCAST (LinearTransform)
+    DOWNCAST (NormalizationTransform)
    DOWNCAST (VectorTransform)
    {
        assert(false);
--- a/tests/test_build_blocks.py
+++ b/tests/test_build_blocks.py
@ -11,6 +11,7 @@ import numpy as np
 import faiss
 import unittest

+
 class TestClustering(unittest.TestCase):

    def test_clustering(self):
@ -34,6 +35,17 @@ class TestClustering(unittest.TestCase):
        # check that 64 centroids give a lower quantization error than 32
        self.assertGreater(err32, err64)

+    # Degenerate k-means input: more centroids requested than there are
+    # distinct points. The test has no assertions; it passes as long as
+    # km.train(x) returns without raising.
+    def test_nasty_clustering(self):
+        d = 2
+        np.random.seed(123)
+        x = np.zeros((100, d), dtype='float32')
+        # fill the 100 rows in 5 groups of 20; every row of a group receives
+        # the same random d-dimensional vector
+        for i in range(5):
+            x[i * 20:i * 20 + 20] = np.random.random(size=d)
+
+        # we have 5 distinct points but ask for 10 centroids...
+        km = faiss.Kmeans(d, 10, niter=10, verbose=True)
+        km.train(x)
+

 class TestPCA(unittest.TestCase):

--- a/tests/test_index.py
+++ b/tests/test_index.py
@ -6,10 +6,8 @@

 #! /usr/bin/env python2

-"""this is a basic test script that works with fbmake to check if
-some simple indices work"""
+"""this is a basic test script that checks that simple indices work"""

-import sys
 import numpy as np
 import unittest
 import faiss
@ -75,9 +73,9 @@ class TestMultiIndexQuantizer(unittest.TestCase):
        self.assertEqual(np.abs(D1[:, :1] - D5[:, :1]).max(), 0)


-class TestIVFScalarQuantizer(unittest.TestCase):
+class TestScalarQuantizer(unittest.TestCase):

-    def test_4variants(self):
+    def test_4variants_ivf(self):
        d = 32
        nt = 1500
        nq = 200
@ -127,19 +125,39 @@ class TestIVFScalarQuantizer(unittest.TestCase):
        self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
        self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform'])

+    # Compares the four scalar quantizer types on a plain (non-IVF)
+    # IndexScalarQuantizer against exact IndexFlatL2 search results, on
+    # seeded random data.
+    def test_4variants(self):
+        d = 32
+        nt = 1500
+        nq = 200
+        nb = 10000

-class TestRemove(unittest.TestCase):
+        np.random.seed(123)

-    def test_remove(self):
-        # only tests the python interface
+        # xt: training vectors, xq: queries, xb: vectors added to the indexes
+        xt = np.random.random(size=(nt, d)).astype('float32')
+        xq = np.random.random(size=(nq, d)).astype('float32')
+        xb = np.random.random(size=(nb, d)).astype('float32')
+
+        # reference results from the flat L2 index
+        index_gt = faiss.IndexFlatL2(d)
+        index_gt.add(xb)
+        D, I_ref = index_gt.search(xq, 10)
+
+        # nok[qname]: number of queries whose first result matches the reference
+        nok = {}
+
+        for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform".split():
+            qtype = getattr(faiss.ScalarQuantizer, qname)
+            index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2)
+            index.train(xt)
+            index.add(xb)
+            D, I = index.search(xq, 10)
+
+            nok[qname] = (I[:, 0] == I_ref[:, 0]).sum()
+
+        print(nok)
+
+        # expected ordering: 8 bit is at least as good as 4 bit, and each
+        # non-uniform variant is at least as good as its uniform counterpart
+        self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit'])
+        self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
+        self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform'])

-        index = faiss.IndexFlat(5)
-        xb = np.zeros((10, 5), dtype='float32')
-        xb[:, 0] = np.arange(10) + 1000
-        index.add(xb)
-        index.remove_ids(np.arange(5) * 2)
-        xb2 = faiss.vector_float_to_array(index.xb).reshape(5, 5)
-        assert np.all(xb2[:, 0] == xb[np.arange(5) * 2 + 1, 0])


 if __name__ == '__main__':
--- a/utils.cpp
+++ b/utils.cpp
@ -1418,7 +1418,7 @@ int km_update_centroids (const float * x,
    for (size_t ci = 0; ci < k; ci++) {
        if (hassign[ci] == 0) { /* need to redefine a centroid */
            size_t cj;
-            for (cj = 0; 1; cj = (cj+1) % k) {
+            for (cj = 0; 1; cj = (cj + 1) % k) {
                /* probability to pick this cluster for split */
                float p = (hassign[cj] - 1.0) / (float) (n - k);
                float r = rng.rand_float ();
@ -1429,15 +1429,15 @@ int km_update_centroids (const float * x,
            memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d);

            /* small symmetric perturbation of the two centroids */
-            for (size_t j = 0; j < d; j++)
+            for (size_t j = 0; j < d; j++) {
                if (j % 2 == 0) {
                    centroids[ci * d + j] *= 1 + EPS;
                    centroids[cj * d + j] *= 1 - EPS;
+                } else {
+                    centroids[ci * d + j] *= 1 - EPS;
+                    centroids[cj * d + j] *= 1 + EPS;
                }
-                else {
-                    centroids[ci * d + j] *= 1 + EPS;
-                    centroids[cj * d + j] *= 1 - EPS;
-                }
+            }

            /* assume even split of the cluster */
            hassign[ci] = hassign[cj] / 2;