diff --git a/Clustering.h b/Clustering.h
index 4215ed336..46410af79 100644
--- a/Clustering.h
+++ b/Clustering.h
@@ -34,7 +34,7 @@ struct ClusteringParameters {
 
     int seed; ///< seed for the random number generator
 
-    size_t decode_block_size;  /// < how many vectors at a time to decode
+    size_t decode_block_size;  ///< how many vectors at a time to decode
 
     /// sets reasonable defaults
     ClusteringParameters ();
@@ -42,11 +42,11 @@ struct ClusteringParameters {
 
 
 struct ClusteringIterationStats {
-    float obj;    /// objective values (sum of distances reported by index)
-    double time;  /// seconds for iteration
-    double time_search; /// seconds for just search
-    double imbalance_factor; /// imbalance factor of iteration
-    int nsplit;   /// number of cluster splits
+    float obj;               ///< objective values (sum of distances reported by index)
+    double time;             ///< seconds for iteration
+    double time_search;      ///< seconds for just search
+    double imbalance_factor; ///< imbalance factor of iteration
+    int nsplit;              ///< number of cluster splits
 };
 
 
diff --git a/Index.h b/Index.h
index efd5fecee..6d413adff 100644
--- a/Index.h
+++ b/Index.h
@@ -18,7 +18,7 @@
 
 #define FAISS_VERSION_MAJOR 1
 #define FAISS_VERSION_MINOR 6
-#define FAISS_VERSION_PATCH 2
+#define FAISS_VERSION_PATCH 3
 
 /**
  * @namespace faiss
@@ -44,12 +44,10 @@ struct IDSelector;
 struct RangeSearchResult;
 struct DistanceComputer;
 
-/** Abstract structure for an index
+/** Abstract structure for an index, supports adding vectors and searching them.
  *
- * Supports adding vertices and searching them.
- *
- * Currently only asymmetric queries are supported:
- * database-to-database queries are not implemented.
+ * All vectors provided at add or search time are 32-bit float arrays,
+ * although the internal representation may vary.
  */
 struct Index {
     using idx_t = int64_t;  ///< all indices are this type
diff --git a/IndexBinary.h b/IndexBinary.h
index 88042002e..e1a16f79a 100644
--- a/IndexBinary.h
+++ b/IndexBinary.h
@@ -99,9 +99,13 @@ struct IndexBinary {
 
   /** Query n vectors of dimension d to the index.
    *
-   * return all vectors with distance < radius. Note that many
-   * indexes do not implement the range_search (only the k-NN search
-   * is mandatory).
+   * return all vectors with distance < radius. Note that many indexes
+   * do not implement the range_search (only the k-NN search is
+   * mandatory). The distances are converted to float to reuse the
+   * RangeSearchResult structure, but they are integer. By convention,
+   * only distances < radius (strict comparison) are returned,
+   * ie. radius = 0 does not return any result and 1 returns only
+   * exact same vectors.
    *
    * @param x           input vectors to search, size n * d / 8
    * @param radius      search radius
diff --git a/IndexBinaryFlat.cpp b/IndexBinaryFlat.cpp
index a3de92d44..d7d8b4dd8 100644
--- a/IndexBinaryFlat.cpp
+++ b/IndexBinaryFlat.cpp
@@ -79,5 +79,10 @@ void IndexBinaryFlat::reconstruct(idx_t key, uint8_t *recons) const {
   memcpy(recons, &(xb[code_size * key]), sizeof(*recons) * code_size);
 }
 
+void IndexBinaryFlat::range_search(idx_t n, const uint8_t *x, int radius,
+                   RangeSearchResult *result) const
+{
+    hamming_range_search (x, xb.data(), n, ntotal, radius, code_size, result);
+}
 
 }  // namespace faiss
diff --git a/IndexBinaryFlat.h b/IndexBinaryFlat.h
index 6f24aac5b..17d016498 100644
--- a/IndexBinaryFlat.h
+++ b/IndexBinaryFlat.h
@@ -38,6 +38,9 @@ struct IndexBinaryFlat : IndexBinary {
   void search(idx_t n, const uint8_t *x, idx_t k,
               int32_t *distances, idx_t *labels) const override;
 
+  void range_search(idx_t n, const uint8_t *x, int radius,
+                   RangeSearchResult *result) const override;
+
   void reconstruct(idx_t key, uint8_t *recons) const override;
 
   /** Remove some ids. Note that because of the indexing structure,
diff --git a/IndexBinaryHash.cpp b/IndexBinaryHash.cpp
new file mode 100644
index 000000000..4f9ccdbae
--- /dev/null
+++ b/IndexBinaryHash.cpp
@@ -0,0 +1,492 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Copyright 2004-present Facebook. All Rights Reserved
+// -*- c++ -*-
+
+#include <faiss/IndexBinaryHash.h>
+
+#include <cstdio>
+#include <memory>
+
+#include <faiss/utils/hamming.h>
+#include <faiss/utils/utils.h>
+
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissAssert.h>
+
+
+namespace faiss {
+
+void IndexBinaryHash::InvertedList::add (
+        idx_t id, size_t code_size, const uint8_t *code)
+{
+    ids.push_back(id);
+    vecs.insert(vecs.end(), code, code + code_size);
+}
+
+IndexBinaryHash::IndexBinaryHash(int d, int b):
+    IndexBinary(d), b(b), nflip(0)
+{
+    is_trained = true;
+}
+
+IndexBinaryHash::IndexBinaryHash(): b(0), nflip(0)
+{
+    is_trained = true;
+}
+
+void IndexBinaryHash::reset()
+{
+    invlists.clear();
+    ntotal = 0;
+}
+
+
+void IndexBinaryHash::add(idx_t n, const uint8_t *x)
+{
+    add_with_ids(n, x, nullptr);
+}
+
+void IndexBinaryHash::add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids)
+{
+    uint64_t mask = ((uint64_t)1 << b) - 1;
+    // simplistic add function. Cannot really be parallelized.
+
+    for (idx_t i = 0; i < n; i++) {
+        idx_t id = xids ? xids[i] : ntotal + i;
+        const uint8_t * xi = x + i * code_size;
+        idx_t hash = *((uint64_t*)xi) & mask;
+        invlists[hash].add(id, code_size, xi);
+    }
+    ntotal += n;
+}
+
+namespace {
+
+
+/** Enumerate all bit vectors of size nbit with up to maxflip 1s
+ * test in P127257851 P127258235
+ */
+struct FlipEnumerator {
+    int nbit, nflip, maxflip;
+    uint64_t mask, x;
+
+    FlipEnumerator (int nbit, int maxflip): nbit(nbit), maxflip(maxflip) {
+        nflip = 0;
+        mask = 0;
+        x = 0;
+    }
+
+    bool next() {
+        if (x == mask) {
+            if (nflip == maxflip) {
+                return false;
+            }
+            // increase Hamming radius
+            nflip++;
+            mask = (((uint64_t)1 << nflip) - 1);
+            x = mask << (nbit - nflip);
+            return true;
+        }
+
+        int i = __builtin_ctzll(x);
+
+        if (i > 0) {
+            x ^= (uint64_t)3 << (i - 1);
+        } else {
+            // nb of LSB 1s
+            int n1 = __builtin_ctzll(~x);
+            // clear them
+            x &= ((uint64_t)(-1) << n1);
+            int n2 = __builtin_ctzll(x);
+            x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1);
+        }
+        return true;
+    }
+
+};
+
+using idx_t = Index::idx_t;
+
+
+struct RangeSearchResults {
+    int radius;
+    RangeQueryResult &qres;
+
+    inline void add (float dis, idx_t id) {
+        if (dis < radius) {
+            qres.add (dis, id);
+        }
+    }
+
+};
+
+struct KnnSearchResults {
+    // heap params
+    idx_t k;
+    int32_t * heap_sim;
+    idx_t * heap_ids;
+
+    using C = CMax<int, idx_t>;
+
+    inline void add (float dis, idx_t id) {
+        if (dis < heap_sim[0]) {
+            heap_pop<C> (k, heap_sim, heap_ids);
+            heap_push<C> (k, heap_sim, heap_ids, dis, id);
+        }
+    }
+
+};
+
+template<class HammingComputer, class SearchResults>
+void
+search_single_query_template(const IndexBinaryHash & index, const uint8_t *q,
+                    SearchResults &res,
+                    size_t &n0, size_t &nlist, size_t &ndis)
+{
+    size_t code_size = index.code_size;
+    uint64_t mask = ((uint64_t)1 << index.b) - 1;
+    uint64_t qhash = *((uint64_t*)q) & mask;
+    HammingComputer hc (q, code_size);
+    FlipEnumerator fe(index.b, index.nflip);
+
+    // loop over neighbors that are at most at nflip bits
+    do {
+        uint64_t hash = qhash ^ fe.x;
+        auto it = index.invlists.find (hash);
+
+        if (it == index.invlists.end()) {
+            continue;
+        }
+
+        const IndexBinaryHash::InvertedList &il = it->second;
+
+        size_t nv = il.ids.size();
+
+        if (nv == 0) {
+            n0++;
+        } else {
+            const uint8_t *codes = il.vecs.data();
+            for (size_t i = 0; i < nv; i++) {
+                int dis = hc.hamming (codes);
+                res.add(dis, il.ids[i]);
+                codes += code_size;
+            }
+            ndis += nv;
+            nlist++;
+        }
+    } while(fe.next());
+}
+
+template<class SearchResults>
+void
+search_single_query(const IndexBinaryHash & index, const uint8_t *q,
+                    SearchResults &res,
+                    size_t &n0, size_t &nlist, size_t &ndis)
+{
+#define HC(name) search_single_query_template<name>(index, q, res, n0, nlist, ndis);
+    switch(index.code_size) {
+    case 4: HC(HammingComputer4); break;
+    case 8: HC(HammingComputer8); break;
+    case 16: HC(HammingComputer16); break;
+    case 20: HC(HammingComputer20); break;
+    case 32: HC(HammingComputer32); break;
+    default:
+        if (index.code_size % 8 == 0) {
+            HC(HammingComputerM8);
+        } else {
+            HC(HammingComputerDefault);
+        }
+    }
+#undef HC
+}
+
+
+} // anonymous namespace
+
+
+
+void IndexBinaryHash::range_search(idx_t n, const uint8_t *x, int radius,
+                                   RangeSearchResult *result) const
+{
+
+    size_t nlist = 0, ndis = 0, n0 = 0;
+
+#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist)
+    {
+        RangeSearchPartialResult pres (result);
+
+#pragma omp for
+        for (size_t i = 0; i < n; i++) { // loop queries
+            RangeQueryResult & qres = pres.new_result (i);
+            RangeSearchResults res = {radius, qres};
+            const uint8_t *q = x + i * code_size;
+
+            search_single_query (*this, q, res, n0, nlist, ndis);
+
+        }
+        pres.finalize ();
+    }
+    indexBinaryHash_stats.nq += n;
+    indexBinaryHash_stats.n0 += n0;
+    indexBinaryHash_stats.nlist += nlist;
+    indexBinaryHash_stats.ndis += ndis;
+}
+
+void IndexBinaryHash::search(idx_t n, const uint8_t *x, idx_t k,
+                             int32_t *distances, idx_t *labels) const
+{
+
+    using HeapForL2 = CMax<int32_t, idx_t>;
+    size_t nlist = 0, ndis = 0, n0 = 0;
+
+#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0)
+    for (size_t i = 0; i < n; i++) {
+        int32_t * simi = distances + k * i;
+        idx_t * idxi = labels + k * i;
+
+        heap_heapify<HeapForL2> (k, simi, idxi);
+        KnnSearchResults res = {k, simi, idxi};
+        const uint8_t *q = x + i * code_size;
+
+        search_single_query (*this, q, res, n0, nlist, ndis);
+
+    }
+    indexBinaryHash_stats.nq += n;
+    indexBinaryHash_stats.n0 += n0;
+    indexBinaryHash_stats.nlist += nlist;
+    indexBinaryHash_stats.ndis += ndis;
+}
+
+size_t IndexBinaryHash::hashtable_size() const
+{
+    return invlists.size();
+}
+
+
+void IndexBinaryHash::display() const
+{
+    for (auto it = invlists.begin(); it != invlists.end(); ++it) {
+        printf("%ld: [", it->first);
+        const std::vector<idx_t> & v = it->second.ids;
+        for (auto x: v) {
+            printf("%ld ", 0 + x);
+        }
+        printf("]\n");
+
+    }
+}
+
+
+void IndexBinaryHashStats::reset()
+{
+    memset ((void*)this, 0, sizeof (*this));
+}
+
+IndexBinaryHashStats indexBinaryHash_stats;
+
+/*******************************************************
+ * IndexBinaryMultiHash implementation
+ ******************************************************/
+
+
+IndexBinaryMultiHash::IndexBinaryMultiHash(int d, int nhash, int b):
+    IndexBinary(d),
+    storage(new IndexBinaryFlat(d)), own_fields(true),
+    maps(nhash), nhash(nhash), b(b), nflip(0)
+{
+    FAISS_THROW_IF_NOT(nhash * b <= d);
+}
+
+IndexBinaryMultiHash::IndexBinaryMultiHash():
+    storage(nullptr), own_fields(true),
+    nhash(0), b(0), nflip(0)
+{}
+
+IndexBinaryMultiHash::~IndexBinaryMultiHash()
+{
+    if (own_fields) {
+        delete storage;
+    }
+}
+
+
+void IndexBinaryMultiHash::reset()
+{
+    storage->reset();
+    ntotal = 0;
+    for(auto map: maps) {
+        map.clear();
+    }
+}
+
+void IndexBinaryMultiHash::add(idx_t n, const uint8_t *x)
+{
+    storage->add(n, x);
+    // populate maps
+    uint64_t mask = ((uint64_t)1 << b) - 1;
+
+    for(idx_t i = 0; i < n; i++) {
+        const uint8_t *xi = x + i * code_size;
+        int ho = 0;
+        for(int h = 0; h < nhash; h++) {
+            uint64_t hash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7);
+            hash &= mask;
+            maps[h][hash].push_back(i + ntotal);
+            ho += b;
+        }
+    }
+    ntotal += n;
+}
+
+
+namespace {
+
+template <class HammingComputer, class SearchResults>
+static
+void verify_shortlist(
+        const IndexBinaryFlat & index,
+        const uint8_t * q,
+        const std::unordered_set<Index::idx_t> & shortlist,
+        SearchResults &res)
+{
+    size_t code_size = index.code_size;
+    size_t nlist = 0, ndis = 0, n0 = 0;
+
+    HammingComputer hc (q, code_size);
+    const uint8_t *codes = index.xb.data();
+
+    for (auto i: shortlist) {
+        int dis = hc.hamming (codes + i * code_size);
+        res.add(dis, i);
+    }
+}
+
+template<class SearchResults>
+void
+search_1_query_multihash(const IndexBinaryMultiHash & index, const uint8_t *xi,
+                         SearchResults &res,
+                         size_t &n0, size_t &nlist, size_t &ndis)
+{
+
+    std::unordered_set<idx_t> shortlist;
+    int b = index.b;
+    uint64_t mask = ((uint64_t)1 << b) - 1;
+
+    int ho = 0;
+    for(int h = 0; h < index.nhash; h++) {
+        uint64_t qhash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7);
+        qhash &= mask;
+        const IndexBinaryMultiHash::Map & map = index.maps[h];
+
+        FlipEnumerator fe(index.b, index.nflip);
+        // loop over neighbors that are at most at nflip bits
+        do {
+            uint64_t hash = qhash ^ fe.x;
+            auto it = map.find (hash);
+
+            if (it != map.end()) {
+                const std::vector<idx_t> & v = it->second;
+                for (auto i: v) {
+                    shortlist.insert(i);
+                }
+                nlist++;
+            } else {
+                n0++;
+            }
+        } while(fe.next());
+
+        ho += b;
+    }
+    ndis += shortlist.size();
+
+    // verify shortlist
+
+#define HC(name) verify_shortlist<name> (*index.storage, xi, shortlist, res)
+    switch(index.code_size) {
+    case 4: HC(HammingComputer4); break;
+    case 8: HC(HammingComputer8); break;
+    case 16: HC(HammingComputer16); break;
+    case 20: HC(HammingComputer20); break;
+    case 32: HC(HammingComputer32); break;
+    default:
+        if (index.code_size % 8 == 0) {
+            HC(HammingComputerM8);
+        } else {
+            HC(HammingComputerDefault);
+        }
+    }
+#undef HC
+}
+
+} // anonymous namespace
+
+void IndexBinaryMultiHash::range_search(idx_t n, const uint8_t *x, int radius,
+                                   RangeSearchResult *result) const
+{
+
+    size_t nlist = 0, ndis = 0, n0 = 0;
+
+#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist)
+    {
+        RangeSearchPartialResult pres (result);
+
+#pragma omp for
+        for (size_t i = 0; i < n; i++) { // loop queries
+            RangeQueryResult & qres = pres.new_result (i);
+            RangeSearchResults res = {radius, qres};
+            const uint8_t *q = x + i * code_size;
+
+            search_1_query_multihash (*this, q, res, n0, nlist, ndis);
+
+        }
+        pres.finalize ();
+    }
+    indexBinaryHash_stats.nq += n;
+    indexBinaryHash_stats.n0 += n0;
+    indexBinaryHash_stats.nlist += nlist;
+    indexBinaryHash_stats.ndis += ndis;
+}
+
+void IndexBinaryMultiHash::search(idx_t n, const uint8_t *x, idx_t k,
+                             int32_t *distances, idx_t *labels) const
+{
+
+    using HeapForL2 = CMax<int32_t, idx_t>;
+    size_t nlist = 0, ndis = 0, n0 = 0;
+
+#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0)
+    for (size_t i = 0; i < n; i++) {
+        int32_t * simi = distances + k * i;
+        idx_t * idxi = labels + k * i;
+
+        heap_heapify<HeapForL2> (k, simi, idxi);
+        KnnSearchResults res = {k, simi, idxi};
+        const uint8_t *q = x + i * code_size;
+
+        search_1_query_multihash (*this, q, res, n0, nlist, ndis);
+
+    }
+    indexBinaryHash_stats.nq += n;
+    indexBinaryHash_stats.n0 += n0;
+    indexBinaryHash_stats.nlist += nlist;
+    indexBinaryHash_stats.ndis += ndis;
+}
+
+size_t IndexBinaryMultiHash::hashtable_size() const
+{
+    size_t tot = 0;
+    for (auto map: maps) {
+        tot += map.size();
+    }
+
+    return tot;
+}
+
+
+}
diff --git a/IndexBinaryHash.h b/IndexBinaryHash.h
new file mode 100644
index 000000000..6e8e82467
--- /dev/null
+++ b/IndexBinaryHash.h
@@ -0,0 +1,116 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#ifndef FAISS_BINARY_HASH_H
+#define FAISS_BINARY_HASH_H
+
+
+
+#include <vector>
+#include <unordered_map>
+
+#include <faiss/IndexBinary.h>
+#include <faiss/IndexBinaryFlat.h>
+#include <faiss/utils/Heap.h>
+
+
+namespace faiss {
+
+struct RangeSearchResult;
+
+
+/** just uses the b first bits as a hash value */
+struct IndexBinaryHash : IndexBinary {
+
+    struct InvertedList {
+        std::vector<idx_t> ids;
+        std::vector<uint8_t> vecs;
+
+        void add (idx_t id, size_t code_size, const uint8_t *code);
+    };
+
+    using InvertedListMap = std::unordered_map<idx_t, InvertedList>;
+    InvertedListMap invlists;
+
+    int b, nflip;
+
+    IndexBinaryHash(int d, int b);
+
+    IndexBinaryHash();
+
+    void reset() override;
+
+    void add(idx_t n, const uint8_t *x) override;
+
+    void add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) override;
+
+    void range_search(idx_t n, const uint8_t *x, int radius,
+                      RangeSearchResult *result) const override;
+
+    void search(idx_t n, const uint8_t *x, idx_t k,
+                int32_t *distances, idx_t *labels) const override;
+
+    void display() const;
+    size_t hashtable_size() const;
+
+};
+
+struct IndexBinaryHashStats {
+    size_t nq;       // nb of queries run
+    size_t n0;       // nb of empty lists
+    size_t nlist;    // nb of non-empty inverted lists scanned
+    size_t ndis;     // nb of distancs computed
+
+    IndexBinaryHashStats () {reset (); }
+    void reset ();
+};
+
+extern IndexBinaryHashStats indexBinaryHash_stats;
+
+
+/** just uses the b first bits as a hash value */
+struct IndexBinaryMultiHash: IndexBinary {
+
+    // where the vectors are actually stored
+    IndexBinaryFlat *storage;
+    bool own_fields;
+
+    // maps hash values to the ids that hash to them
+    using Map = std::unordered_map<idx_t, std::vector<idx_t> >;
+
+    // the different hashes, size nhash
+    std::vector<Map> maps;
+
+    int nhash; ///< nb of hash maps
+    int b; ///< nb bits per hash map
+    int nflip; ///< nb bit flips to use at search time
+
+    IndexBinaryMultiHash(int d, int nhash, int b);
+
+    IndexBinaryMultiHash();
+
+    ~IndexBinaryMultiHash();
+
+    void reset() override;
+
+    void add(idx_t n, const uint8_t *x) override;
+
+    void range_search(idx_t n, const uint8_t *x, int radius,
+                      RangeSearchResult *result) const override;
+
+     void search(idx_t n, const uint8_t *x, idx_t k,
+                int32_t *distances, idx_t *labels) const override;
+
+    size_t hashtable_size() const;
+
+};
+
+}
+
+#endif
diff --git a/IndexBinaryIVF.cpp b/IndexBinaryIVF.cpp
index 735598393..524fc60f6 100644
--- a/IndexBinaryIVF.cpp
+++ b/IndexBinaryIVF.cpp
@@ -11,11 +11,13 @@
 #include <faiss/IndexBinaryIVF.h>
 
 #include <cstdio>
+#include <omp.h>
+
 #include <memory>
 
+
 #include <faiss/utils/hamming.h>
 #include <faiss/utils/utils.h>
-
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/IndexFlat.h>
@@ -281,13 +283,15 @@ namespace {
 using idx_t = Index::idx_t;
 
 
-template<class HammingComputer, bool store_pairs>
+template<class HammingComputer>
 struct IVFBinaryScannerL2: BinaryInvertedListScanner {
 
     HammingComputer hc;
     size_t code_size;
+    bool store_pairs;
 
-    IVFBinaryScannerL2 (size_t code_size): code_size (code_size)
+    IVFBinaryScannerL2 (size_t code_size, bool store_pairs):
+        code_size (code_size), store_pairs(store_pairs)
     {}
 
     void set_query (const uint8_t *query_vector) override {
@@ -316,7 +320,7 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
             uint32_t dis = hc.hamming (codes);
             if (dis < simi[0]) {
                 heap_pop<C> (k, simi, idxi);
-                idx_t id = store_pairs ? (list_no << 32 | j) : ids[j];
+                idx_t id = store_pairs ? lo_build(list_no, j) : ids[j];
                 heap_push<C> (k, simi, idxi, dis, id);
                 nup++;
             }
@@ -325,6 +329,24 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
         return nup;
     }
 
+    void scan_codes_range (size_t n,
+                           const uint8_t *codes,
+                           const idx_t *ids,
+                           int radius,
+                           RangeQueryResult &result) const
+    {
+        size_t nup = 0;
+        for (size_t j = 0; j < n; j++) {
+            uint32_t dis = hc.hamming (codes);
+            if (dis < radius) {
+                int64_t id = store_pairs ? lo_build (list_no, j) : ids[j];
+                result.add (dis, id);
+            }
+            codes += code_size;
+        }
+
+    }
+
 
 };
 
@@ -332,29 +354,6 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
 template <bool store_pairs>
 BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) {
 
-    switch (code_size) {
-#define HANDLE_CS(cs)                                                  \
-    case cs:                                                            \
-        return new IVFBinaryScannerL2<HammingComputer ## cs, store_pairs> (cs);
-      HANDLE_CS(4);
-      HANDLE_CS(8);
-      HANDLE_CS(16);
-      HANDLE_CS(20);
-      HANDLE_CS(32);
-      HANDLE_CS(64);
-#undef HANDLE_CS
-    default:
-        if (code_size % 8 == 0) {
-            return new IVFBinaryScannerL2<HammingComputerM8,
-                store_pairs> (code_size);
-        } else if (code_size % 4 == 0) {
-            return new IVFBinaryScannerL2<HammingComputerM4,
-                store_pairs> (code_size);
-        } else {
-            return new IVFBinaryScannerL2<HammingComputerDefault,
-                store_pairs> (code_size);
-        }
-    }
 }
 
 
@@ -425,8 +424,10 @@ void search_knn_hamming_heap(const IndexBinaryIVF& ivf,
                     ids = sids->get();
                 }
 
-                nheap += scanner->scan_codes (list_size, scodes.get(),
-                                              ids, simi, idxi, k);
+                nheap += scanner->scan_codes (
+                        list_size, scodes.get(),
+                        ids, simi, idxi, k
+                );
 
                 nscan += list_size;
                 if (max_codes && nscan >= max_codes)
@@ -586,11 +587,26 @@ void search_knn_hamming_count_1 (
 BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner
       (bool store_pairs) const
 {
-    if (store_pairs) {
-        return select_IVFBinaryScannerL2<true> (code_size);
-    } else {
-        return select_IVFBinaryScannerL2<false> (code_size);
+
+#define HC(name) return new IVFBinaryScannerL2<name> (code_size, store_pairs)
+    switch (code_size) {
+    case 4: HC(HammingComputer4);
+    case 8: HC(HammingComputer8);
+    case 16: HC(HammingComputer16);
+    case 20: HC(HammingComputer20);
+    case 32: HC(HammingComputer32);
+    case 64: HC(HammingComputer64);
+    default:
+        if (code_size % 8 == 0) {
+            HC(HammingComputerM8);
+        } else if (code_size % 4 == 0) {
+            HC(HammingComputerM4);
+        } else {
+            HC(HammingComputerDefault);
+        }
     }
+#undef HC
+
 }
 
 void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
@@ -616,6 +632,84 @@ void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
     }
 }
 
+
+void IndexBinaryIVF::range_search(
+        idx_t n, const uint8_t *x, int radius,
+        RangeSearchResult *res) const
+{
+
+    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
+    std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
+
+    double t0 = getmillisecs();
+    quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
+    indexIVF_stats.quantization_time += getmillisecs() - t0;
+
+    t0 = getmillisecs();
+    invlists->prefetch_lists(idx.get(), n * nprobe);
+
+    bool store_pairs = false;
+    size_t nlistv = 0, ndis = 0;
+
+    std::vector<RangeSearchPartialResult *> all_pres (omp_get_max_threads());
+
+#pragma omp parallel reduction(+: nlistv, ndis)
+    {
+        RangeSearchPartialResult pres(res);
+        std::unique_ptr<BinaryInvertedListScanner> scanner
+            (get_InvertedListScanner(store_pairs));
+        FAISS_THROW_IF_NOT (scanner.get ());
+
+        all_pres[omp_get_thread_num()] = &pres;
+
+        auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult &qres)
+        {
+
+            idx_t key = idx[i * nprobe + ik];  /* select the list  */
+            if (key < 0) return;
+            FAISS_THROW_IF_NOT_FMT (
+                    key < (idx_t) nlist,
+                    "Invalid key=%ld  at ik=%ld nlist=%ld\n",
+                    key, ik, nlist);
+            const size_t list_size = invlists->list_size(key);
+
+            if (list_size == 0) return;
+
+            InvertedLists::ScopedCodes scodes (invlists, key);
+            InvertedLists::ScopedIds ids (invlists, key);
+
+            scanner->set_list (key, coarse_dis[i * nprobe + ik]);
+            nlistv++;
+            ndis += list_size;
+            scanner->scan_codes_range (list_size, scodes.get(),
+                                       ids.get(), radius, qres);
+        };
+
+#pragma omp for
+        for (size_t i = 0; i < n; i++) {
+            scanner->set_query (x + i * code_size);
+
+            RangeQueryResult & qres = pres.new_result (i);
+
+            for (size_t ik = 0; ik < nprobe; ik++) {
+                scan_list_func (i, ik, qres);
+            }
+
+        }
+
+        pres.finalize();
+
+    }
+    indexIVF_stats.nq += n;
+    indexIVF_stats.nlist += nlistv;
+    indexIVF_stats.ndis += ndis;
+    indexIVF_stats.search_time += getmillisecs() - t0;
+
+}
+
+
+
+
 IndexBinaryIVF::~IndexBinaryIVF() {
   if (own_invlists) {
     delete invlists;
diff --git a/IndexBinaryIVF.h b/IndexBinaryIVF.h
index 40635ddbf..18918f61f 100644
--- a/IndexBinaryIVF.h
+++ b/IndexBinaryIVF.h
@@ -109,8 +109,11 @@ struct IndexBinaryIVF : IndexBinary {
                                          bool store_pairs=false) const;
 
     /** assign the vectors, then call search_preassign */
-    virtual void search(idx_t n, const uint8_t *x, idx_t k,
-                        int32_t *distances, idx_t *labels) const override;
+    void search(idx_t n, const uint8_t *x, idx_t k,
+                int32_t *distances, idx_t *labels) const override;
+
+    void range_search(idx_t n, const uint8_t *x, int radius,
+                      RangeSearchResult *result) const override;
 
     void reconstruct(idx_t key, uint8_t *recons) const override;
 
@@ -202,6 +205,12 @@ struct BinaryInvertedListScanner {
                                int32_t *distances, idx_t *labels,
                                size_t k) const = 0;
 
+    virtual void scan_codes_range (size_t n,
+                                   const uint8_t *codes,
+                                   const idx_t *ids,
+                                   int radius,
+                                   RangeQueryResult &result) const = 0;
+
     virtual ~BinaryInvertedListScanner () {}
 
 };
diff --git a/IndexFlat.h b/IndexFlat.h
index 7b1345121..ef928de09 100644
--- a/IndexFlat.h
+++ b/IndexFlat.h
@@ -19,6 +19,7 @@ namespace faiss {
 
 /** Index that stores the full vectors and performs exhaustive search */
 struct IndexFlat: Index {
+
     /// database vectors, size ntotal * d
     std::vector<float> xb;
 
@@ -144,7 +145,7 @@ struct IndexRefineFlat: Index {
 };
 
 
-/// optimized version for 1D "vectors"
+/// optimized version for 1D "vectors".
 struct IndexFlat1D:IndexFlatL2 {
     bool continuous_update; ///< is the permutation updated continuously?
 
diff --git a/IndexIVF.cpp b/IndexIVF.cpp
index 02c79dd3a..21cfff314 100644
--- a/IndexIVF.cpp
+++ b/IndexIVF.cpp
@@ -612,7 +612,6 @@ InvertedListScanner *IndexIVF::get_InvertedListScanner (
 void IndexIVF::reconstruct (idx_t key, float* recons) const
 {
     idx_t lo = direct_map.get (key);
-
     reconstruct_from_offset (lo_listno(lo), lo_offset(lo), recons);
 }
 
diff --git a/IndexIVFPQ.h b/IndexIVFPQ.h
index 0a6adcb3d..4ca04e9ef 100644
--- a/IndexIVFPQ.h
+++ b/IndexIVFPQ.h
@@ -42,14 +42,14 @@ struct IndexIVFPQ: IndexIVF {
     int polysemous_ht;             ///< Hamming thresh for polysemous filtering
 
     /** Precompute table that speed up query preprocessing at some
-     * memory cost
+     * memory cost (used only for by_residual with L2 metric)
      * =-1: force disable
      * =0: decide heuristically (default: use tables only if they are
      *     < precomputed_tables_max_bytes)
      * =1: tables that work for all quantizers (size 256 * nlist * M)
      * =2: specific version for MultiIndexQuantizer (much more compact)
      */
-    int use_precomputed_table;     ///< if by_residual, build precompute tables
+    int use_precomputed_table;
     static size_t precomputed_table_max_bytes;
 
     /// if use_precompute_table
@@ -93,9 +93,9 @@ struct IndexIVFPQ: IndexIVF {
      * the duplicates are returned in pre-allocated arrays (see the
      * max sizes).
      *
-     * @params lims   limits between groups of duplicates
+     * @param lims   limits between groups of duplicates
      *                (max size ntotal / 2 + 1)
-     * @params ids    ids[lims[i]] : ids[lims[i+1]-1] is a group of
+     * @param ids    ids[lims[i]] : ids[lims[i+1]-1] is a group of
      *                duplicates (max size ntotal)
      * @return n      number of groups found
      */
@@ -135,15 +135,14 @@ struct IndexIVFPQ: IndexIVF {
 /// statistics are robust to internal threading, but not if
 /// IndexIVFPQ::search_preassigned is called by multiple threads
 struct IndexIVFPQStats {
-    size_t nrefine;  // nb of refines (IVFPQR)
+    size_t nrefine;  ///< nb of refines (IVFPQR)
 
     size_t n_hamming_pass;
-    // nb of passed Hamming distance tests (for polysemous)
+    ///< nb of passed Hamming distance tests (for polysemous)
 
-    // timings measured with the CPU RTC
-    // on all threads
+    // timings measured with the CPU RTC on all threads
     size_t search_cycles;
-    size_t refine_cycles; // only for IVFPQR
+    size_t refine_cycles; ///< only for IVFPQR
 
     IndexIVFPQStats () {reset (); }
     void reset ();
diff --git a/demos/demo_weighted_kmeans.cpp b/demos/demo_weighted_kmeans.cpp
new file mode 100644
index 000000000..eee188e4b
--- /dev/null
+++ b/demos/demo_weighted_kmeans.cpp
@@ -0,0 +1,185 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+
+#include <faiss/Clustering.h>
+#include <faiss/utils/random.h>
+#include <faiss/utils/distances.h>
+#include <faiss/IndexFlat.h>
+#include <faiss/IndexHNSW.h>
+
+
+namespace {
+
+
+enum WeightedKMeansType {
+    WKMT_FlatL2,
+    WKMT_FlatIP,
+    WKMT_FlatIP_spherical,
+    WKMT_HNSW,
+};
+
+
+float weighted_kmeans_clustering (size_t d, size_t n, size_t k,
+                                  const float *input,
+                                  const float *weights,
+                                  float *centroids,
+                                  WeightedKMeansType index_num)
+{
+    using namespace faiss;
+    Clustering clus (d, k);
+    clus.verbose = true;
+
+    std::unique_ptr<Index> index;
+
+    switch (index_num) {
+    case WKMT_FlatL2:
+        index.reset(new IndexFlatL2 (d));
+        break;
+    case WKMT_FlatIP:
+        index.reset(new IndexFlatIP (d));
+        break;
+    case WKMT_FlatIP_spherical:
+        index.reset(new IndexFlatIP (d));
+        clus.spherical = true;
+        break;
+    case WKMT_HNSW:
+        IndexHNSWFlat *ihnsw = new IndexHNSWFlat (d, 32);
+        ihnsw->hnsw.efSearch = 128;
+        index.reset(ihnsw);
+        break;
+    }
+
+    clus.train(n, input, *index.get(), weights);
+    // on output the index contains the centroids.
+    memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
+    return clus.iteration_stats.back().obj;
+}
+
+
+int d = 32;
+float sigma = 0.1;
+
+#define BIGTEST
+
+#ifdef BIGTEST
+// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
+int nc = 200000;
+int n_big = 4;
+int n_small = 2;
+#else
+int nc = 5;
+int n_big = 100;
+int n_small = 10;
+#endif
+
+int n; // number of training points
+
+void generate_trainset (std::vector<float> & ccent,
+                        std::vector<float> & x,
+                        std::vector<float> & weights)
+{
+    // same sampling as test_build_blocks.py test_weighted
+
+    ccent.resize (d * 2 * nc);
+    faiss::float_randn (ccent.data(), d * 2 * nc, 123);
+    faiss::fvec_renorm_L2 (d, 2 * nc, ccent.data());
+    n = nc * n_big + nc * n_small;
+    x.resize(d * n);
+    weights.resize(n);
+    faiss::float_randn (x.data(), x.size(), 1234);
+
+    float *xi = x.data();
+    float *w = weights.data();
+    for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
+        int np = ci < nc ? n_big : n_small; // nb of points around this centroid
+        for (int i = 0; i < np; i++) {
+            for (int j = 0; j < d; j++) {
+                xi[j] = xi[j] * sigma + ccent[ci * d + j];
+            }
+            *w++ = ci < nc ? 0.1 : 10;
+            xi += d;
+        }
+    }
+}
+
+}
+
+
+int main(int argc, char **argv) {
+    std::vector<float> ccent;
+    std::vector<float> x;
+    std::vector<float> weights;
+
+    printf("generate training set\n");
+    generate_trainset(ccent, x, weights);
+
+    std::vector<float> centroids;
+    centroids.resize(nc * d);
+
+    int the_index_num = -1;
+    int the_with_weights = -1;
+
+    if (argc == 3) {
+        the_index_num = atoi(argv[1]);
+        the_with_weights = atoi(argv[2]);
+    }
+
+
+    for (int index_num = WKMT_FlatL2;
+         index_num <= WKMT_HNSW;
+         index_num++) {
+
+        if (the_index_num >= 0 && index_num != the_index_num) {
+            continue;
+        }
+
+        for (int with_weights = 0; with_weights <= 1; with_weights++) {
+            if (the_with_weights >= 0 && with_weights != the_with_weights) {
+                continue;
+            }
+
+            printf("=================== index_num=%d Run %s weights\n",
+                   index_num, with_weights ? "with" : "without");
+
+            weighted_kmeans_clustering (
+                 d, n, nc, x.data(),
+                 with_weights ? weights.data() : nullptr,
+                 centroids.data(), (WeightedKMeansType)index_num
+            );
+
+            { // compute distance of points to centroids
+                faiss::IndexFlatL2 cent_index(d);
+                cent_index.add(nc, centroids.data());
+                std::vector<float> dis (n);
+                std::vector<faiss::Index::idx_t> idx (n);
+
+                cent_index.search (nc * 2, ccent.data(), 1,
+                                   dis.data(), idx.data());
+
+                float dis1 = 0, dis2 = 0;
+                for (int i = 0; i < nc ; i++) {
+                    dis1 += dis[i];
+                }
+                printf("average distance of points from big clusters: %g\n",
+                       dis1 / nc);
+
+                for (int i = 0; i < nc ; i++) {
+                    dis2 += dis[i + nc];
+                }
+
+                printf("average distance of points from small clusters: %g\n",
+                       dis2 / nc);
+
+            }
+
+        }
+    }
+    return 0;
+}
diff --git a/gpu/GpuIndexFlat.cu b/gpu/GpuIndexFlat.cu
index 6ad556060..bbfabb275 100644
--- a/gpu/GpuIndexFlat.cu
+++ b/gpu/GpuIndexFlat.cu
@@ -29,8 +29,6 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
              config),
     config_(std::move(config)),
     data_(nullptr) {
-  verifySettings_();
-
   // Flat index doesn't need training
   this->is_trained = true;
 
@@ -44,8 +42,6 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
     GpuIndex(resources, dims, metric, 0, config),
     config_(std::move(config)),
     data_(nullptr) {
-  verifySettings_();
-
   // Flat index doesn't need training
   this->is_trained = true;
 
@@ -298,21 +294,6 @@ GpuIndexFlat::compute_residual_n(faiss::Index::idx_t n,
   fromDevice<float, 2>(residualDevice, residuals, stream);
 }
 
-void
-GpuIndexFlat::verifySettings_() const {
-  // If we want Hgemm, ensure that it is supported on this device
-  if (config_.useFloat16Accumulator) {
-    FAISS_THROW_IF_NOT_MSG(config_.useFloat16,
-                       "useFloat16Accumulator can only be enabled "
-                       "with useFloat16");
-
-    FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device),
-                       "Device %d does not support Hgemm "
-                       "(useFloat16Accumulator)",
-                       config_.device);
-  }
-}
-
 //
 // GpuIndexFlatL2
 //
diff --git a/gpu/GpuIndexFlat.h b/gpu/GpuIndexFlat.h
index 89491eb1d..782792674 100644
--- a/gpu/GpuIndexFlat.h
+++ b/gpu/GpuIndexFlat.h
@@ -25,17 +25,12 @@ struct FlatIndex;
 struct GpuIndexFlatConfig : public GpuIndexConfig {
   inline GpuIndexFlatConfig()
       : useFloat16(false),
-        useFloat16Accumulator(false),
         storeTransposed(false) {
   }
 
   /// Whether or not data is stored as float16
   bool useFloat16;
 
-  /// This option is now deprecated and doesn't do anything. All accumulation of
-  /// float16 or float32 data is now done in float32.
-  bool useFloat16Accumulator;
-
   /// Whether or not data is stored (transparently) in a transposed
   /// layout, enabling use of the NN GEMM call, which is ~10% faster.
   /// This will improve the speed of the flat index, but will
@@ -123,10 +118,6 @@ class GpuIndexFlat : public GpuIndex {
                    float* distances,
                    faiss::Index::idx_t* labels) const override;
 
- private:
-  /// Checks user settings for consistency
-  void verifySettings_() const;
-
  protected:
   /// Our config object
   const GpuIndexFlatConfig config_;
diff --git a/gpu/impl/FlatIndex.cu b/gpu/impl/FlatIndex.cu
index eb58505eb..1e300795f 100644
--- a/gpu/impl/FlatIndex.cu
+++ b/gpu/impl/FlatIndex.cu
@@ -62,6 +62,22 @@ FlatIndex::reserve(size_t numVecs, cudaStream_t stream) {
   }
 }
 
+template <>
+Tensor<float, 2, true>&
+FlatIndex::getVectorsRef<float>() {
+  // Should not call this unless we are in float32 mode
+  FAISS_ASSERT(!useFloat16_);
+  return getVectorsFloat32Ref();
+}
+
+template <>
+Tensor<half, 2, true>&
+FlatIndex::getVectorsRef<half>() {
+  // Should not call this unless we are in float16 mode
+  FAISS_ASSERT(useFloat16_);
+  return getVectorsFloat16Ref();
+}
+
 Tensor<float, 2, true>&
 FlatIndex::getVectorsFloat32Ref() {
   // Should not call this unless we are in float32 mode
diff --git a/gpu/impl/FlatIndex.cuh b/gpu/impl/FlatIndex.cuh
index e48391210..3b8181868 100644
--- a/gpu/impl/FlatIndex.cuh
+++ b/gpu/impl/FlatIndex.cuh
@@ -26,16 +26,23 @@ class FlatIndex {
             bool storeTransposed,
             MemorySpace space);
 
+  /// Whether or not this flat index primarily stores data in float16
   bool getUseFloat16() const;
 
   /// Returns the number of vectors we contain
   int getSize() const;
 
+  /// Returns the dimensionality of the vectors
   int getDim() const;
 
   /// Reserve storage that can contain at least this many vectors
   void reserve(size_t numVecs, cudaStream_t stream);
 
+  /// Returns the vectors based on the type desired; the FlatIndex must be of
+  /// the same type (float16 or float32) to not assert
+  template <typename T>
+  Tensor<T, 2, true>& getVectorsRef();
+
   /// Returns a reference to our vectors currently in use
   Tensor<float, 2, true>& getVectorsFloat32Ref();
 
diff --git a/gpu/impl/IVFPQ.cu b/gpu/impl/IVFPQ.cu
index 402bda0d9..ac9b47aec 100644
--- a/gpu/impl/IVFPQ.cu
+++ b/gpu/impl/IVFPQ.cu
@@ -123,8 +123,6 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
   FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
   FAISS_ASSERT(vecs.getSize(1) == dim_);
 
-  FAISS_ASSERT(!quantizer_->getUseFloat16());
-  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
   auto& mem = resources_->getMemoryManagerCurrentDevice();
   auto stream = resources_->getDefaultStreamCurrentDevice();
 
@@ -155,7 +153,13 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
   DeviceTensor<float, 2, true> residuals(
     mem, {vecs.getSize(0), vecs.getSize(1)}, stream);
 
-  runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
+  if (quantizer_->getUseFloat16()) {
+    auto& coarseCentroids = quantizer_->getVectorsFloat16Ref();
+    runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
+  } else {
+    auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
+    runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
+  }
 
   // Residuals are in the form
   // (vec x numSubQuantizer x dimPerSubQuantizer)
@@ -437,8 +441,9 @@ IVFPQ::setPQCentroids_(float* data) {
   pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
 }
 
+template <typename CentroidT>
 void
-IVFPQ::precomputeCodes_() {
+IVFPQ::precomputeCodesT_() {
   FAISS_ASSERT(metric_ == MetricType::METRIC_L2);
 
   //
@@ -449,8 +454,6 @@ IVFPQ::precomputeCodes_() {
 
   // Terms 1 and 3 are available only at query time. We compute term 2
   // here.
-  FAISS_ASSERT(!quantizer_->getUseFloat16());
-  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
 
   // Compute ||y_R||^2 by treating
   // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
@@ -473,9 +476,10 @@ IVFPQ::precomputeCodes_() {
   //      (centroid id)(sub q)(dim)
   // Transpose (centroid id)(sub q)(sub dim) to
   //           (sub q)(centroid id)(sub dim)
-  auto centroidView = coarseCentroids.view<3>(
+  auto& coarseCentroids = quantizer_->template getVectorsRef<CentroidT>();
+  auto centroidView = coarseCentroids.template view<3>(
     {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
-  DeviceTensor<float, 3, true> centroidsTransposed(
+  DeviceTensor<CentroidT, 3, true> centroidsTransposed(
     {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});
 
   runTransposeAny(centroidView, 0, 1, centroidsTransposed,
@@ -521,6 +525,15 @@ IVFPQ::precomputeCodes_() {
   }
 }
 
+void
+IVFPQ::precomputeCodes_() {
+  if (quantizer_->getUseFloat16()) {
+    precomputeCodesT_<half>();
+  } else {
+    precomputeCodesT_<float>();
+  }
+}
+
 void
 IVFPQ::query(Tensor<float, 2, true>& queries,
              int nprobe,
@@ -688,16 +701,16 @@ IVFPQ::runPQPrecomputedCodes_(
                                 resources_);
 }
 
+template <typename CentroidT>
 void
-IVFPQ::runPQNoPrecomputedCodes_(
+IVFPQ::runPQNoPrecomputedCodesT_(
   Tensor<float, 2, true>& queries,
   DeviceTensor<float, 2, true>& coarseDistances,
   DeviceTensor<int, 2, true>& coarseIndices,
   int k,
   Tensor<float, 2, true>& outDistances,
   Tensor<long, 2, true>& outIndices) {
-  FAISS_ASSERT(!quantizer_->getUseFloat16());
-  auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
+  auto& coarseCentroids = quantizer_->template getVectorsRef<CentroidT>();
 
   runPQScanMultiPassNoPrecomputed(queries,
                                   coarseCentroids,
@@ -719,4 +732,29 @@ IVFPQ::runPQNoPrecomputedCodes_(
                                   resources_);
 }
 
+void
+IVFPQ::runPQNoPrecomputedCodes_(
+  Tensor<float, 2, true>& queries,
+  DeviceTensor<float, 2, true>& coarseDistances,
+  DeviceTensor<int, 2, true>& coarseIndices,
+  int k,
+  Tensor<float, 2, true>& outDistances,
+  Tensor<long, 2, true>& outIndices) {
+  if (quantizer_->getUseFloat16()) {
+    runPQNoPrecomputedCodesT_<half>(queries,
+                                    coarseDistances,
+                                    coarseIndices,
+                                    k,
+                                    outDistances,
+                                    outIndices);
+  } else {
+    runPQNoPrecomputedCodesT_<float>(queries,
+                                     coarseDistances,
+                                     coarseIndices,
+                                     k,
+                                     outDistances,
+                                     outIndices);
+  }
+}
+
 } } // namespace
diff --git a/gpu/impl/IVFPQ.cuh b/gpu/impl/IVFPQ.cuh
index d071705c8..ce6c7acb1 100644
--- a/gpu/impl/IVFPQ.cuh
+++ b/gpu/impl/IVFPQ.cuh
@@ -83,6 +83,11 @@ class IVFPQ : public IVFBase {
   /// Calculate precomputed residual distance information
   void precomputeCodes_();
 
+  /// Calculate precomputed residual distance information (for different coarse
+  /// centroid type)
+  template <typename CentroidT>
+  void precomputeCodesT_();
+
   /// Runs kernels for scanning inverted lists with precomputed codes
   void runPQPrecomputedCodes_(Tensor<float, 2, true>& queries,
                               DeviceTensor<float, 2, true>& coarseDistances,
@@ -99,6 +104,16 @@ class IVFPQ : public IVFBase {
                                 Tensor<float, 2, true>& outDistances,
                                 Tensor<long, 2, true>& outIndices);
 
+  /// Runs kernels for scanning inverted lists without precomputed codes (for
+  /// different coarse centroid type)
+  template <typename CentroidT>
+  void runPQNoPrecomputedCodesT_(Tensor<float, 2, true>& queries,
+                                 DeviceTensor<float, 2, true>& coarseDistances,
+                                 DeviceTensor<int, 2, true>& coarseIndices,
+                                 int k,
+                                 Tensor<float, 2, true>& outDistances,
+                                 Tensor<long, 2, true>& outIndices);
+
  private:
   /// Number of sub-quantizers per vector
   const int numSubQuantizers_;
diff --git a/gpu/impl/PQCodeDistances-inl.cuh b/gpu/impl/PQCodeDistances-inl.cuh
new file mode 100644
index 000000000..c3ef87f2e
--- /dev/null
+++ b/gpu/impl/PQCodeDistances-inl.cuh
@@ -0,0 +1,561 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+
+#include <faiss/gpu/impl/BroadcastSum.cuh>
+#include <faiss/gpu/impl/Distance.cuh>
+#include <faiss/gpu/impl/L2Norm.cuh>
+#include <faiss/gpu/utils/ConversionOperators.cuh>
+#include <faiss/gpu/utils/DeviceDefs.cuh>
+#include <faiss/gpu/utils/DeviceUtils.h>
+#include <faiss/gpu/utils/Float16.cuh>
+#include <faiss/gpu/utils/MatrixMult.cuh>
+#include <faiss/gpu/utils/PtxUtils.cuh>
+#include <faiss/gpu/utils/StaticUtils.h>
+#include <faiss/gpu/utils/Transpose.cuh>
+
+namespace faiss { namespace gpu {
+
+// Kernel responsible for calculating distance from residual vector to
+// each product quantizer code centroid
+template <typename OutCodeT,
+          typename CentroidT,
+          int DimsPerSubQuantizer,
+          bool L2Distance>
+__global__ void
+__launch_bounds__(288, 4)
+pqCodeDistances(Tensor<float, 2, true> queries,
+                int queriesPerBlock,
+                Tensor<CentroidT, 2, true> coarseCentroids,
+                Tensor<float, 3, true> pqCentroids,
+                Tensor<int, 2, true> topQueryToCentroid,
+                // (query id)(coarse)(subquantizer)(code) -> dist
+                Tensor<OutCodeT, 4, true> outCodeDistances) {
+  const auto numSubQuantizers = pqCentroids.getSize(0);
+  const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
+  assert(DimsPerSubQuantizer == dimsPerSubQuantizer);
+  const auto codesPerSubQuantizer = pqCentroids.getSize(2);
+
+  bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer;
+  int loadingThreadId = threadIdx.x - codesPerSubQuantizer;
+
+  extern __shared__ float smem[];
+
+  // Each thread calculates a single code
+  float subQuantizerData[DimsPerSubQuantizer];
+
+  auto code = threadIdx.x;
+  auto subQuantizer = blockIdx.y;
+
+  // Each thread will load the pq centroid data for the code that it
+  // is processing
+#pragma unroll
+  for (int i = 0; i < DimsPerSubQuantizer; ++i) {
+    subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg();
+  }
+
+  // Where we store our query vector
+  float* smemQuery = smem;
+
+  // Where we store our residual vector; this is double buffered so we
+  // can be loading the next one while processing the current one
+  float* smemResidual1 = &smemQuery[DimsPerSubQuantizer];
+  float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer];
+
+  // Where we pre-load the coarse centroid IDs
+  int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer];
+
+  // Each thread is calculating the distance for a single code,
+  // performing the reductions locally
+
+  // Handle multiple queries per block
+  auto startQueryId = blockIdx.x * queriesPerBlock;
+  auto numQueries = queries.getSize(0) - startQueryId;
+  if (numQueries > queriesPerBlock) {
+    numQueries = queriesPerBlock;
+  }
+
+  for (int query = 0; query < numQueries; ++query) {
+    auto queryId = startQueryId + query;
+
+    auto querySubQuantizer =
+      queries[queryId][subQuantizer * DimsPerSubQuantizer].data();
+
+    // Load current query vector
+    for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) {
+      smemQuery[i] = querySubQuantizer[i];
+    }
+
+    // Load list of coarse centroids found
+    for (int i = threadIdx.x;
+         i < topQueryToCentroid.getSize(1); i += blockDim.x) {
+      coarseIds[i] = topQueryToCentroid[queryId][i];
+    }
+
+    // We need coarseIds below
+    // FIXME: investigate loading separately, so we don't need this
+    __syncthreads();
+
+    // Preload first buffer of residual data
+    if (isLoadingThread) {
+      for (int i = loadingThreadId;
+           i < DimsPerSubQuantizer;
+           i += blockDim.x - codesPerSubQuantizer) {
+        auto coarseId = coarseIds[0];
+        // In case NaNs were in the original query data
+        coarseId = coarseId == -1 ? 0 : coarseId;
+        auto coarseCentroidSubQuantizer =
+          coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data();
+
+        if (L2Distance) {
+          smemResidual1[i] = smemQuery[i] -
+            ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
+        } else {
+          smemResidual1[i] =
+            ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
+        }
+      }
+    }
+
+    // The block walks the list for a single query
+    for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) {
+      // Wait for smemResidual1 to be loaded
+      __syncthreads();
+
+      if (isLoadingThread) {
+        // Preload second buffer of residual data
+        for (int i = loadingThreadId;
+             i < DimsPerSubQuantizer;
+             i += blockDim.x - codesPerSubQuantizer) {
+          // FIXME: try always making this centroid id 0 so we can
+          // terminate
+          if (coarse != (topQueryToCentroid.getSize(1) - 1)) {
+            auto coarseId = coarseIds[coarse + 1];
+            // In case NaNs were in the original query data
+            coarseId = coarseId == -1 ? 0 : coarseId;
+
+            auto coarseCentroidSubQuantizer =
+              coarseCentroids[coarseId]
+              [subQuantizer * dimsPerSubQuantizer].data();
+
+            if (L2Distance) {
+              smemResidual2[i] = smemQuery[i] -
+                ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
+            } else {
+              smemResidual2[i] =
+                ConvertTo<float>::to(coarseCentroidSubQuantizer[i]);
+            }
+          }
+        }
+      } else {
+        // These are the processing threads
+        float dist = 0.0f;
+
+        constexpr int kUnroll = 4;
+        constexpr int kRemainder = DimsPerSubQuantizer % kUnroll;
+        constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder;
+        float vals[kUnroll];
+
+        // Calculate residual - pqCentroid for each dim that we're
+        // processing
+
+        // Unrolled loop
+        if (L2Distance) {
+#pragma unroll
+          for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
+#pragma unroll
+            for (int j = 0; j < kUnroll; ++j) {
+              vals[j] = smemResidual1[i * kUnroll + j];
+            }
+
+#pragma unroll
+            for (int j = 0; j < kUnroll; ++j) {
+              vals[j] -= subQuantizerData[i * kUnroll + j];
+            }
+
+#pragma unroll
+            for (int j = 0; j < kUnroll; ++j) {
+              vals[j] *= vals[j];
+            }
+
+#pragma unroll
+            for (int j = 0; j < kUnroll; ++j) {
+              dist += vals[j];
+            }
+          }
+        } else {
+          // Inner product: query slice against the reconstructed sub-quantizer
+          // for this coarse cell (query o (centroid + subQCentroid))
+#pragma unroll
+          for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) {
+#pragma unroll
+            for (int j = 0; j < kUnroll; ++j) {
+              vals[j] = smemResidual1[i * kUnroll + j];
+            }
+
+#pragma unroll
+            for (int j = 0; j < kUnroll; ++j) {
+              vals[j] += subQuantizerData[i * kUnroll + j];
+            }
+
+#pragma unroll
+            for (int j = 0; j < kUnroll; ++j) {
+              vals[j] *= smemQuery[i * kUnroll + j];
+            }
+
+#pragma unroll
+            for (int j = 0; j < kUnroll; ++j) {
+              dist += vals[j];
+            }
+          }
+        }
+
+        // Remainder loop
+        if (L2Distance) {
+#pragma unroll
+          for (int j = 0; j < kRemainder; ++j) {
+            vals[j] = smemResidual1[kRemainderBase + j];
+          }
+
+#pragma unroll
+          for (int j = 0; j < kRemainder; ++j) {
+            vals[j] -= subQuantizerData[kRemainderBase + j];
+          }
+
+#pragma unroll
+          for (int j = 0; j < kRemainder; ++j) {
+            vals[j] *= vals[j];
+          }
+        } else {
+          // Inner product
+          // Inner product: query slice against the reconstructed sub-quantizer
+          // for this coarse cell (query o (centroid + subQCentroid))
+#pragma unroll
+          for (int j = 0; j < kRemainder; ++j) {
+            vals[j] = smemResidual1[kRemainderBase + j];
+          }
+
+#pragma unroll
+          for (int j = 0; j < kRemainder; ++j) {
+            vals[j] += subQuantizerData[kRemainderBase + j];
+          }
+
+#pragma unroll
+          for (int j = 0; j < kRemainder; ++j) {
+            vals[j] *= smemQuery[kRemainderBase + j];
+          }
+        }
+
+#pragma unroll
+        for (int j = 0; j < kRemainder; ++j) {
+          dist += vals[j];
+        }
+
+        // We have the distance for our code; write it out
+        outCodeDistances[queryId][coarse][subQuantizer][code] =
+          ConvertTo<OutCodeT>::to(dist);
+      } // !isLoadingThread
+
+      // Swap residual buffers
+      float* tmp = smemResidual1;
+      smemResidual1 = smemResidual2;
+      smemResidual2 = tmp;
+    }
+  }
+}
+
+template <typename CentroidT>
+__global__ void
+residualVector(Tensor<float, 2, true> queries,
+               Tensor<CentroidT, 2, true> coarseCentroids,
+               Tensor<int, 2, true> topQueryToCentroid,
+               int numSubDim,
+               // output is transposed:
+               // (sub q)(query id)(centroid id)(sub dim)
+               Tensor<float, 4, true> residual) {
+  // block x is query id
+  // block y is centroid id
+  // thread x is dim
+  auto queryId = blockIdx.x;
+  auto centroidId = blockIdx.y;
+
+  int realCentroidId = topQueryToCentroid[queryId][centroidId];
+
+  for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) {
+    float q = queries[queryId][dim];
+    float c = ConvertTo<float>::to(coarseCentroids[realCentroidId][dim]);
+
+    residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] = q - c;
+  }
+}
+
+template <typename CentroidT>
+void
+runResidualVector(Tensor<float, 3, true>& pqCentroids,
+                  Tensor<float, 2, true>& queries,
+                  Tensor<CentroidT, 2, true>& coarseCentroids,
+                  Tensor<int, 2, true>& topQueryToCentroid,
+                  Tensor<float, 4, true>& residual,
+                  cudaStream_t stream) {
+  auto grid =
+    dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1));
+  auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice()));
+
+  residualVector<<<grid, block, 0, stream>>>(
+    queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1),
+    residual);
+
+  CUDA_TEST_ERROR();
+}
+
+template <typename CentroidT>
+void
+runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
+                     Tensor<float, 2, true>& queries,
+                     Tensor<CentroidT, 2, true>& coarseCentroids,
+                     Tensor<int, 2, true>& topQueryToCentroid,
+                     NoTypeTensor<4, true>& outCodeDistances,
+                     bool useFloat16Lookup,
+                     DeviceMemory& mem,
+                     cublasHandle_t handle,
+                     cudaStream_t stream) {
+  // Calculate (q - c) residual vector
+  // (sub q)(query id)(centroid id)(sub dim)
+  DeviceTensor<float, 4, true> residual(
+    mem,
+    {pqCentroids.getSize(0),
+        topQueryToCentroid.getSize(0),
+        topQueryToCentroid.getSize(1),
+        pqCentroids.getSize(1)},
+    stream);
+
+  runResidualVector(pqCentroids, queries,
+                    coarseCentroids, topQueryToCentroid,
+                    residual, stream);
+
+  // Calculate ||q - c||^2
+  DeviceTensor<float, 1, true> residualNorms(
+    mem,
+    {pqCentroids.getSize(0) *
+        topQueryToCentroid.getSize(0) *
+        topQueryToCentroid.getSize(1)},
+    stream);
+
+  auto residualView2 = residual.view<2>(
+    {pqCentroids.getSize(0) *
+        topQueryToCentroid.getSize(0) *
+        topQueryToCentroid.getSize(1),
+        pqCentroids.getSize(1)});
+
+  runL2Norm(residualView2, true, residualNorms, true, stream);
+
+  // Perform a batch MM:
+  // (sub q) x {(q * c)(sub dim) x (sub dim)(code)} =>
+  // (sub q) x {(q * c)(code)}
+  auto residualView3 = residual.view<3>(
+    {pqCentroids.getSize(0),
+        topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
+        pqCentroids.getSize(1)});
+
+  DeviceTensor<float, 3, true> residualDistance(
+    mem,
+    {pqCentroids.getSize(0),
+        topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
+        pqCentroids.getSize(2)},
+    stream);
+
+  runIteratedMatrixMult(residualDistance, false,
+                        residualView3, false,
+                        pqCentroids, false,
+                        -2.0f, 0.0f,
+                        handle,
+                        stream);
+
+  // Sum ||q - c||^2 along rows
+  auto residualDistanceView2 = residualDistance.view<2>(
+    {pqCentroids.getSize(0) *
+        topQueryToCentroid.getSize(0) *
+        topQueryToCentroid.getSize(1),
+        pqCentroids.getSize(2)});
+
+  runSumAlongRows(residualNorms, residualDistanceView2, false, stream);
+
+  Tensor<float, 4, true> outCodeDistancesF;
+  DeviceTensor<float, 4, true> outCodeDistancesFloatMem;
+
+  if (useFloat16Lookup) {
+    outCodeDistancesFloatMem = DeviceTensor<float, 4, true>(
+      mem, {outCodeDistances.getSize(0),
+          outCodeDistances.getSize(1),
+          outCodeDistances.getSize(2),
+          outCodeDistances.getSize(3)},
+      stream);
+
+    outCodeDistancesF = outCodeDistancesFloatMem;
+  } else {
+    outCodeDistancesF = outCodeDistances.toTensor<float>();
+  }
+
+  // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which
+  // is where we build our output distances)
+  auto outCodeDistancesView = outCodeDistancesF.view<3>(
+    {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
+        outCodeDistances.getSize(2),
+        outCodeDistances.getSize(3)});
+
+  runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream);
+
+  // Calculate code norms per each sub-dim
+  // (sub q)(sub dim)(code) is pqCentroids
+  // transpose to (sub q)(code)(sub dim)
+  DeviceTensor<float, 3, true> pqCentroidsTranspose(
+    mem,
+    {pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)},
+    stream);
+
+  runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream);
+
+  auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>(
+    {pqCentroids.getSize(0) * pqCentroids.getSize(2),
+        pqCentroids.getSize(1)});
+
+  DeviceTensor<float, 1, true> pqCentroidsNorm(
+    mem,
+    {pqCentroids.getSize(0) * pqCentroids.getSize(2)},
+    stream);
+
+  runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream);
+
+  // View output as (q * c)(sub q * code), and add centroid norm to
+  // each row
+  auto outDistancesCodeViewCols = outCodeDistancesView.view<2>(
+    {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1),
+        outCodeDistances.getSize(2) * outCodeDistances.getSize(3)});
+
+  runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
+
+  if (useFloat16Lookup) {
+    // Need to convert back
+    auto outCodeDistancesH = outCodeDistances.toTensor<half>();
+    convertTensor<float, half, 4>(stream,
+                                  outCodeDistancesF,
+                                  outCodeDistancesH);
+  }
+}
+
+template <typename CentroidT>
+void
+runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
+                   Tensor<float, 2, true>& queries,
+                   Tensor<CentroidT, 2, true>& coarseCentroids,
+                   Tensor<int, 2, true>& topQueryToCentroid,
+                   NoTypeTensor<4, true>& outCodeDistances,
+                   bool l2Distance,
+                   bool useFloat16Lookup,
+                   cudaStream_t stream) {
+  const auto numSubQuantizers = pqCentroids.getSize(0);
+  const auto dimsPerSubQuantizer = pqCentroids.getSize(1);
+  const auto codesPerSubQuantizer = pqCentroids.getSize(2);
+
+  // FIXME: tune
+  // Reuse of pq centroid data is based on both # of queries * nprobe,
+  // and we should really be tiling in both dimensions
+  constexpr int kQueriesPerBlock = 8;
+
+  auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock),
+                   numSubQuantizers);
+
+  // Reserve one block of threads for double buffering
+  // FIXME: probably impractical for large # of dims?
+  auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize);
+  auto block = dim3(codesPerSubQuantizer + loadingThreads);
+
+  auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
+    + topQueryToCentroid.getSize(1) * sizeof(int);
+
+#define RUN_CODE(DIMS, L2)                                              \
+  do {                                                                  \
+    if (useFloat16Lookup) {                                             \
+      auto outCodeDistancesT = outCodeDistances.toTensor<half>();       \
+                                                                        \
+      pqCodeDistances<half, CentroidT, DIMS, L2><<<grid, block, smem, stream>>>( \
+        queries, kQueriesPerBlock,                                      \
+        coarseCentroids, pqCentroids,                                   \
+        topQueryToCentroid, outCodeDistancesT);                         \
+    } else {                                                            \
+      auto outCodeDistancesT = outCodeDistances.toTensor<float>();      \
+                                                                        \
+      pqCodeDistances<float, CentroidT, DIMS, L2><<<grid, block, smem, stream>>>( \
+        queries, kQueriesPerBlock,                                      \
+        coarseCentroids, pqCentroids,                                   \
+        topQueryToCentroid, outCodeDistancesT);                         \
+    }                                                                   \
+  } while (0)
+
+#define CODE_L2(DIMS)                           \
+  do {                                          \
+    if (l2Distance) {                           \
+      RUN_CODE(DIMS, true);                     \
+    } else {                                    \
+      RUN_CODE(DIMS, false);                    \
+    }                                           \
+  } while (0)
+
+  switch (dimsPerSubQuantizer) {
+    case 1:
+      CODE_L2(1);
+      break;
+    case 2:
+      CODE_L2(2);
+      break;
+    case 3:
+      CODE_L2(3);
+      break;
+    case 4:
+      CODE_L2(4);
+      break;
+    case 6:
+      CODE_L2(6);
+      break;
+    case 8:
+      CODE_L2(8);
+      break;
+    case 10:
+      CODE_L2(10);
+      break;
+    case 12:
+      CODE_L2(12);
+      break;
+    case 16:
+      CODE_L2(16);
+      break;
+    case 20:
+      CODE_L2(20);
+      break;
+    case 24:
+      CODE_L2(24);
+      break;
+    case 28:
+      CODE_L2(28);
+      break;
+    case 32:
+      CODE_L2(32);
+      break;
+      // FIXME: larger sizes require too many registers - we need the
+      // MM implementation working
+    default:
+      FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer "
+                      "not currently supported");
+  }
+
+#undef RUN_CODE
+#undef CODE_L2
+
+  CUDA_TEST_ERROR();
+}
+
+} } // namespace
diff --git a/gpu/impl/PQCodeDistances.cuh b/gpu/impl/PQCodeDistances.cuh
index b68694715..0add947f2 100644
--- a/gpu/impl/PQCodeDistances.cuh
+++ b/gpu/impl/PQCodeDistances.cuh
@@ -20,18 +20,20 @@ class DeviceMemory;
 /// Calculates the distance from the (query - centroid) residual to
 /// each sub-code vector, for the given list of query results in
 /// topQueryToCentroid
+template <typename CentroidT>
 void runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
                         Tensor<float, 2, true>& queries,
-                        Tensor<float, 2, true>& coarseCentroids,
+                        Tensor<CentroidT, 2, true>& coarseCentroids,
                         Tensor<int, 2, true>& topQueryToCentroid,
                         NoTypeTensor<4, true>& outCodeDistances,
                         bool l2Distance,
                         bool useFloat16Lookup,
                         cudaStream_t stream);
 
+template <typename CentroidT>
 void runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
                           Tensor<float, 2, true>& queries,
-                          Tensor<float, 2, true>& coarseCentroids,
+                          Tensor<CentroidT, 2, true>& coarseCentroids,
                           Tensor<int, 2, true>& topQueryToCentroid,
                           NoTypeTensor<4, true>& outCodeDistances,
                           bool useFloat16Lookup,
@@ -40,3 +42,5 @@ void runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
                           cudaStream_t stream);
 
 } } // namespace
+
+#include <faiss/gpu/impl/PQCodeDistances-inl.cuh>
diff --git a/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh b/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
new file mode 100644
index 000000000..95a4b586d
--- /dev/null
+++ b/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
@@ -0,0 +1,599 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/impl/PQCodeDistances.cuh>
+#include <faiss/gpu/impl/PQCodeLoad.cuh>
+#include <faiss/gpu/impl/IVFUtils.cuh>
+#include <faiss/gpu/utils/ConversionOperators.cuh>
+#include <faiss/gpu/utils/DeviceTensor.cuh>
+#include <faiss/gpu/utils/DeviceUtils.h>
+#include <faiss/gpu/utils/Float16.cuh>
+#include <faiss/gpu/utils/LoadStoreOperators.cuh>
+#include <faiss/gpu/utils/NoTypeTensor.cuh>
+#include <faiss/gpu/utils/StaticUtils.h>
+
+#include <faiss/gpu/utils/HostTensor.cuh>
+
+namespace faiss { namespace gpu {
+
+// This must be kept in sync with PQCodeDistances.cu
+inline bool isSupportedNoPrecomputedSubDimSize(int dims) {
+  switch (dims) {
+    case 1:
+    case 2:
+    case 3:
+    case 4:
+    case 6:
+    case 8:
+    case 10:
+    case 12:
+    case 16:
+    case 20:
+    case 24:
+    case 28:
+    case 32:
+      return true;
+    default:
+      // FIXME: larger sizes require too many registers - we need the
+      // MM implementation working
+      return false;
+  }
+}
+
+template <typename LookupT, typename LookupVecT>
+struct LoadCodeDistances {
+  static inline __device__ void load(LookupT* smem,
+                                     LookupT* codes,
+                                     int numCodes) {
+    constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT);
+
+    // We can only use the vector type if the data is guaranteed to be
+    // aligned. The codes are innermost, so if it is evenly divisible,
+    // then any slice will be aligned.
+    if (numCodes % kWordSize == 0) {
+      // Load the data by float4 for efficiency, and then handle any remainder
+      // limitVec is the number of whole vec words we can load, in terms
+      // of whole blocks performing the load
+      constexpr int kUnroll = 2;
+      int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x);
+      limitVec *= kUnroll * blockDim.x;
+
+      LookupVecT* smemV = (LookupVecT*) smem;
+      LookupVecT* codesV = (LookupVecT*) codes;
+
+      for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) {
+        LookupVecT vals[kUnroll];
+
+#pragma unroll
+        for (int j = 0; j < kUnroll; ++j) {
+          vals[j] =
+            LoadStore<LookupVecT>::load(&codesV[i + j * blockDim.x]);
+        }
+
+#pragma unroll
+        for (int j = 0; j < kUnroll; ++j) {
+          LoadStore<LookupVecT>::store(&smemV[i + j * blockDim.x], vals[j]);
+        }
+      }
+
+      // This is where we start loading the remainder that does not evenly
+      // fit into kUnroll x blockDim.x
+      int remainder = limitVec * kWordSize;
+
+      for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) {
+        smem[i] = codes[i];
+      }
+    } else {
+      // Potential unaligned load
+      constexpr int kUnroll = 4;
+
+      int limit = utils::roundDown(numCodes, kUnroll * blockDim.x);
+
+      int i = threadIdx.x;
+      for (; i < limit; i += kUnroll * blockDim.x) {
+        LookupT vals[kUnroll];
+
+#pragma unroll
+        for (int j = 0; j < kUnroll; ++j) {
+          vals[j] = codes[i + j * blockDim.x];
+        }
+
+#pragma unroll
+        for (int j = 0; j < kUnroll; ++j) {
+          smem[i + j * blockDim.x] = vals[j];
+        }
+      }
+
+      for (; i < numCodes; i += blockDim.x) {
+        smem[i] = codes[i];
+      }
+    }
+  }
+};
+
+template <int NumSubQuantizers, typename LookupT, typename LookupVecT>
+__global__ void
+pqScanNoPrecomputedMultiPass(Tensor<float, 2, true> queries,
+                             Tensor<float, 3, true> pqCentroids,
+                             Tensor<int, 2, true> topQueryToCentroid,
+                             Tensor<LookupT, 4, true> codeDistances,
+                             void** listCodes,
+                             int* listLengths,
+                             Tensor<int, 2, true> prefixSumOffsets,
+                             Tensor<float, 1, true> distance) {
+  const auto codesPerSubQuantizer = pqCentroids.getSize(2);
+
+  // Where the pq code -> residual distance is stored
+  extern __shared__ char smemCodeDistances[];
+  LookupT* codeDist = (LookupT*) smemCodeDistances;
+
+  // Each block handles a single query
+  auto queryId = blockIdx.y;
+  auto probeId = blockIdx.x;
+
+  // This is where we start writing out data
+  // We ensure that before the array (at offset -1), there is a 0 value
+  int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1);
+  float* distanceOut = distance[outBase].data();
+
+  auto listId = topQueryToCentroid[queryId][probeId];
+  // Safety guard in case NaNs in input cause no list ID to be generated
+  if (listId == -1) {
+    return;
+  }
+
+  unsigned char* codeList = (unsigned char*) listCodes[listId];
+  int limit = listLengths[listId];
+
+  constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 :
+    (NumSubQuantizers / 4);
+  unsigned int code32[kNumCode32];
+  unsigned int nextCode32[kNumCode32];
+
+  // We double-buffer the code loading, which improves memory utilization
+  if (threadIdx.x < limit) {
+    LoadCode32<NumSubQuantizers>::load(code32, codeList, threadIdx.x);
+  }
+
+  LoadCodeDistances<LookupT, LookupVecT>::load(
+    codeDist,
+    codeDistances[queryId][probeId].data(),
+    codeDistances.getSize(2) * codeDistances.getSize(3));
+
+  // Prevent WAR dependencies
+  __syncthreads();
+
+  // Each thread handles one code element in the list, with a
+  // block-wide stride
+  for (int codeIndex = threadIdx.x;
+       codeIndex < limit;
+       codeIndex += blockDim.x) {
+    // Prefetch next codes
+    if (codeIndex + blockDim.x < limit) {
+      LoadCode32<NumSubQuantizers>::load(
+        nextCode32, codeList, codeIndex + blockDim.x);
+    }
+
+    float dist = 0.0f;
+
+#pragma unroll
+    for (int word = 0; word < kNumCode32; ++word) {
+      constexpr int kBytesPerCode32 =
+        NumSubQuantizers < 4 ? NumSubQuantizers : 4;
+
+      if (kBytesPerCode32 == 1) {
+        auto code = code32[0];
+        dist = ConvertTo<float>::to(codeDist[code]);
+
+      } else {
+#pragma unroll
+        for (int byte = 0; byte < kBytesPerCode32; ++byte) {
+          auto code = getByte(code32[word], byte * 8, 8);
+
+          auto offset =
+            codesPerSubQuantizer * (word * kBytesPerCode32 + byte);
+
+          dist += ConvertTo<float>::to(codeDist[offset + code]);
+        }
+      }
+    }
+
+    // Write out intermediate distance result
+    // We do not maintain indices here, in order to reduce global
+    // memory traffic. Those are recovered in the final selection step.
+    distanceOut[codeIndex] = dist;
+
+    // Rotate buffers
+#pragma unroll
+    for (int word = 0; word < kNumCode32; ++word) {
+      code32[word] = nextCode32[word];
+    }
+  }
+}
+
+template <typename CentroidT>
+void
+runMultiPassTile(Tensor<float, 2, true>& queries,
+                 Tensor<CentroidT, 2, true>& centroids,
+                 Tensor<float, 3, true>& pqCentroidsInnermostCode,
+                 NoTypeTensor<4, true>& codeDistances,
+                 Tensor<int, 2, true>& topQueryToCentroid,
+                 bool useFloat16Lookup,
+                 int bytesPerCode,
+                 int numSubQuantizers,
+                 int numSubQuantizerCodes,
+                 thrust::device_vector<void*>& listCodes,
+                 thrust::device_vector<void*>& listIndices,
+                 IndicesOptions indicesOptions,
+                 thrust::device_vector<int>& listLengths,
+                 Tensor<char, 1, true>& thrustMem,
+                 Tensor<int, 2, true>& prefixSumOffsets,
+                 Tensor<float, 1, true>& allDistances,
+                 Tensor<float, 3, true>& heapDistances,
+                 Tensor<int, 3, true>& heapIndices,
+                 int k,
+                 faiss::MetricType metric,
+                 Tensor<float, 2, true>& outDistances,
+                 Tensor<long, 2, true>& outIndices,
+                 cudaStream_t stream) {
+  // We only support two metrics at the moment
+  FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT ||
+               metric == MetricType::METRIC_L2);
+
+  bool l2Distance = metric == MetricType::METRIC_L2;
+
+  // Calculate offset lengths, so we know where to write out
+  // intermediate results
+  runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
+                     thrustMem, stream);
+
+  // Calculate residual code distances, since this is without
+  // precomputed codes
+  runPQCodeDistances(pqCentroidsInnermostCode,
+                     queries,
+                     centroids,
+                     topQueryToCentroid,
+                     codeDistances,
+                     l2Distance,
+                     useFloat16Lookup,
+                     stream);
+
+  // Convert all codes to a distance, and write out (distance,
+  // index) values for all intermediate results
+  {
+    auto kThreadsPerBlock = 256;
+
+    auto grid = dim3(topQueryToCentroid.getSize(1),
+                     topQueryToCentroid.getSize(0));
+    auto block = dim3(kThreadsPerBlock);
+
+    // pq centroid distances
+    auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
+
+    smem *= numSubQuantizers * numSubQuantizerCodes;
+    FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
+
+#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T)                   \
+    do {                                                                \
+      auto codeDistancesT = codeDistances.toTensor<LOOKUP_T>();         \
+                                                                        \
+      pqScanNoPrecomputedMultiPass<NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T>   \
+        <<<grid, block, smem, stream>>>(                                \
+          queries,                                                      \
+          pqCentroidsInnermostCode,                                     \
+          topQueryToCentroid,                                           \
+          codeDistancesT,                                               \
+          listCodes.data().get(),                                       \
+          listLengths.data().get(),                                     \
+          prefixSumOffsets,                                             \
+          allDistances);                                                \
+    } while (0)
+
+#define RUN_PQ(NUM_SUB_Q)                       \
+    do {                                        \
+      if (useFloat16Lookup) {                   \
+        RUN_PQ_OPT(NUM_SUB_Q, half, Half8);     \
+      } else {                                  \
+        RUN_PQ_OPT(NUM_SUB_Q, float, float4);   \
+      }                                         \
+    } while (0)
+
+    switch (bytesPerCode) {
+      case 1:
+        RUN_PQ(1);
+        break;
+      case 2:
+        RUN_PQ(2);
+        break;
+      case 3:
+        RUN_PQ(3);
+        break;
+      case 4:
+        RUN_PQ(4);
+        break;
+      case 8:
+        RUN_PQ(8);
+        break;
+      case 12:
+        RUN_PQ(12);
+        break;
+      case 16:
+        RUN_PQ(16);
+        break;
+      case 20:
+        RUN_PQ(20);
+        break;
+      case 24:
+        RUN_PQ(24);
+        break;
+      case 28:
+        RUN_PQ(28);
+        break;
+      case 32:
+        RUN_PQ(32);
+        break;
+      case 40:
+        RUN_PQ(40);
+        break;
+      case 48:
+        RUN_PQ(48);
+        break;
+      case 56:
+        RUN_PQ(56);
+        break;
+      case 64:
+        RUN_PQ(64);
+        break;
+      case 96:
+        RUN_PQ(96);
+        break;
+      default:
+        FAISS_ASSERT(false);
+        break;
+    }
+
+#undef RUN_PQ
+#undef RUN_PQ_OPT
+  }
+
+  CUDA_TEST_ERROR();
+
+  // k-select the output in chunks, to increase parallelism
+  runPass1SelectLists(prefixSumOffsets,
+                      allDistances,
+                      topQueryToCentroid.getSize(1),
+                      k,
+                      !l2Distance, // L2 distance chooses smallest
+                      heapDistances,
+                      heapIndices,
+                      stream);
+
+  // k-select final output
+  auto flatHeapDistances = heapDistances.downcastInner<2>();
+  auto flatHeapIndices = heapIndices.downcastInner<2>();
+
+  runPass2SelectLists(flatHeapDistances,
+                      flatHeapIndices,
+                      listIndices,
+                      indicesOptions,
+                      prefixSumOffsets,
+                      topQueryToCentroid,
+                      k,
+                      !l2Distance, // L2 distance chooses smallest
+                      outDistances,
+                      outIndices,
+                      stream);
+}
+
+template <typename CentroidT>
+void
+runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
+                                Tensor<CentroidT, 2, true>& centroids,
+                                Tensor<float, 3, true>& pqCentroidsInnermostCode,
+                                Tensor<int, 2, true>& topQueryToCentroid,
+                                bool useFloat16Lookup,
+                                int bytesPerCode,
+                                int numSubQuantizers,
+                                int numSubQuantizerCodes,
+                                thrust::device_vector<void*>& listCodes,
+                                thrust::device_vector<void*>& listIndices,
+                                IndicesOptions indicesOptions,
+                                thrust::device_vector<int>& listLengths,
+                                int maxListLength,
+                                int k,
+                                faiss::MetricType metric,
+                                // output
+                                Tensor<float, 2, true>& outDistances,
+                                // output
+                                Tensor<long, 2, true>& outIndices,
+                                GpuResources* res) {
+  constexpr int kMinQueryTileSize = 8;
+  constexpr int kMaxQueryTileSize = 128;
+  constexpr int kThrustMemSize = 16384;
+
+  int nprobe = topQueryToCentroid.getSize(1);
+
+  auto& mem = res->getMemoryManagerCurrentDevice();
+  auto stream = res->getDefaultStreamCurrentDevice();
+
+  // Make a reservation for Thrust to do its dirty work (global memory
+  // cross-block reduction space); hopefully this is large enough.
+  DeviceTensor<char, 1, true> thrustMem1(
+    mem, {kThrustMemSize}, stream);
+  DeviceTensor<char, 1, true> thrustMem2(
+    mem, {kThrustMemSize}, stream);
+  DeviceTensor<char, 1, true>* thrustMem[2] =
+    {&thrustMem1, &thrustMem2};
+
+  // How much temporary storage is available?
+  // If possible, we'd like to fit within the space available.
+  size_t sizeAvailable = mem.getSizeAvailable();
+
+  // We run two passes of heap selection
+  // This is the size of the first-level heap passes
+  constexpr int kNProbeSplit = 8;
+  int pass2Chunks = std::min(nprobe, kNProbeSplit);
+
+  size_t sizeForFirstSelectPass =
+    pass2Chunks * k * (sizeof(float) + sizeof(int));
+
+  // How much temporary storage we need per each query
+  size_t sizePerQuery =
+    2 * // streams
+    ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets
+     nprobe * maxListLength * sizeof(float) + // allDistances
+     // residual distances
+     nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
+     sizeForFirstSelectPass);
+
+  int queryTileSize = (int) (sizeAvailable / sizePerQuery);
+
+  if (queryTileSize < kMinQueryTileSize) {
+    queryTileSize = kMinQueryTileSize;
+  } else if (queryTileSize > kMaxQueryTileSize) {
+    queryTileSize = kMaxQueryTileSize;
+  }
+
+  // FIXME: we should adjust queryTileSize to deal with this, since
+  // indexing is in int32
+  FAISS_ASSERT(queryTileSize * nprobe * maxListLength <
+         std::numeric_limits<int>::max());
+
+  // Temporary memory buffers
+  // Make sure there is space prior to the start which will be 0, and
+  // will handle the boundary condition without branches
+  DeviceTensor<int, 1, true> prefixSumOffsetSpace1(
+    mem, {queryTileSize * nprobe + 1}, stream);
+  DeviceTensor<int, 1, true> prefixSumOffsetSpace2(
+    mem, {queryTileSize * nprobe + 1}, stream);
+
+  DeviceTensor<int, 2, true> prefixSumOffsets1(
+    prefixSumOffsetSpace1[1].data(),
+    {queryTileSize, nprobe});
+  DeviceTensor<int, 2, true> prefixSumOffsets2(
+    prefixSumOffsetSpace2[1].data(),
+    {queryTileSize, nprobe});
+  DeviceTensor<int, 2, true>* prefixSumOffsets[2] =
+    {&prefixSumOffsets1, &prefixSumOffsets2};
+
+  // Make sure the element before prefixSumOffsets is 0, since we
+  // depend upon simple, boundary-less indexing to get proper results
+  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(),
+                              0,
+                              sizeof(int),
+                              stream));
+  CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(),
+                              0,
+                              sizeof(int),
+                              stream));
+
+  int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float);
+
+  int totalCodeDistancesSize =
+    queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
+    codeDistanceTypeSize;
+
+  DeviceTensor<char, 1, true> codeDistances1Mem(
+    mem, {totalCodeDistancesSize}, stream);
+  NoTypeTensor<4, true> codeDistances1(
+    codeDistances1Mem.data(),
+    codeDistanceTypeSize,
+    {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
+
+  DeviceTensor<char, 1, true> codeDistances2Mem(
+    mem, {totalCodeDistancesSize}, stream);
+  NoTypeTensor<4, true> codeDistances2(
+    codeDistances2Mem.data(),
+    codeDistanceTypeSize,
+    {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes});
+
+  NoTypeTensor<4, true>* codeDistances[2] =
+    {&codeDistances1, &codeDistances2};
+
+  DeviceTensor<float, 1, true> allDistances1(
+    mem, {queryTileSize * nprobe * maxListLength}, stream);
+  DeviceTensor<float, 1, true> allDistances2(
+    mem, {queryTileSize * nprobe * maxListLength}, stream);
+  DeviceTensor<float, 1, true>* allDistances[2] =
+    {&allDistances1, &allDistances2};
+
+  DeviceTensor<float, 3, true> heapDistances1(
+    mem, {queryTileSize, pass2Chunks, k}, stream);
+  DeviceTensor<float, 3, true> heapDistances2(
+    mem, {queryTileSize, pass2Chunks, k}, stream);
+  DeviceTensor<float, 3, true>* heapDistances[2] =
+    {&heapDistances1, &heapDistances2};
+
+  DeviceTensor<int, 3, true> heapIndices1(
+    mem, {queryTileSize, pass2Chunks, k}, stream);
+  DeviceTensor<int, 3, true> heapIndices2(
+    mem, {queryTileSize, pass2Chunks, k}, stream);
+  DeviceTensor<int, 3, true>* heapIndices[2] =
+    {&heapIndices1, &heapIndices2};
+
+  auto streams = res->getAlternateStreamsCurrentDevice();
+  streamWait(streams, {stream});
+
+  int curStream = 0;
+
+  for (int query = 0; query < queries.getSize(0); query += queryTileSize) {
+    int numQueriesInTile =
+      std::min(queryTileSize, queries.getSize(0) - query);
+
+    auto prefixSumOffsetsView =
+      prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile);
+
+    auto codeDistancesView =
+      codeDistances[curStream]->narrowOutermost(0, numQueriesInTile);
+    auto coarseIndicesView =
+      topQueryToCentroid.narrowOutermost(query, numQueriesInTile);
+    auto queryView =
+      queries.narrowOutermost(query, numQueriesInTile);
+
+    auto heapDistancesView =
+      heapDistances[curStream]->narrowOutermost(0, numQueriesInTile);
+    auto heapIndicesView =
+      heapIndices[curStream]->narrowOutermost(0, numQueriesInTile);
+
+    auto outDistanceView =
+      outDistances.narrowOutermost(query, numQueriesInTile);
+    auto outIndicesView =
+      outIndices.narrowOutermost(query, numQueriesInTile);
+
+    runMultiPassTile(queryView,
+                     centroids,
+                     pqCentroidsInnermostCode,
+                     codeDistancesView,
+                     coarseIndicesView,
+                     useFloat16Lookup,
+                     bytesPerCode,
+                     numSubQuantizers,
+                     numSubQuantizerCodes,
+                     listCodes,
+                     listIndices,
+                     indicesOptions,
+                     listLengths,
+                     *thrustMem[curStream],
+                     prefixSumOffsetsView,
+                     *allDistances[curStream],
+                     heapDistancesView,
+                     heapIndicesView,
+                     k,
+                     metric,
+                     outDistanceView,
+                     outIndicesView,
+                     streams[curStream]);
+
+    curStream = (curStream + 1) % 2;
+  }
+
+  streamWait({stream}, streams);
+}
+
+} } // namespace
diff --git a/gpu/impl/PQScanMultiPassNoPrecomputed.cuh b/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
index eae3869ce..5712eb675 100644
--- a/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
+++ b/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
@@ -21,8 +21,9 @@ class GpuResources;
 /// per subquantizer?
 bool isSupportedNoPrecomputedSubDimSize(int dims);
 
+template <typename CentroidT>
 void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
-                                     Tensor<float, 2, true>& centroids,
+                                     Tensor<CentroidT, 2, true>& centroids,
                                      Tensor<float, 3, true>& pqCentroidsInnermostCode,
                                      Tensor<int, 2, true>& topQueryToCentroid,
                                      bool useFloat16Lookup,
@@ -43,3 +44,5 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                                      GpuResources* res);
 
 } } // namespace
+
+#include <faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh>
diff --git a/gpu/perf/PerfFlat.cu b/gpu/perf/PerfFlat.cu
index 3b0e36ba1..20a16382f 100644
--- a/gpu/perf/PerfFlat.cu
+++ b/gpu/perf/PerfFlat.cu
@@ -76,7 +76,6 @@ int main(int argc, char** argv) {
     GpuIndexFlatConfig config;
     config.device = dev;
     config.useFloat16 = FLAGS_use_float16;
-    config.useFloat16Accumulator = FLAGS_use_float16_math;
     config.storeTransposed = FLAGS_transposed;
     config.memorySpace = FLAGS_use_unified_mem ?
     MemorySpace::Unified : MemorySpace::Device;
diff --git a/gpu/test/TestGpuIndexIVFPQ.cpp b/gpu/test/TestGpuIndexIVFPQ.cpp
index f5834f9a6..1bee6b4bb 100644
--- a/gpu/test/TestGpuIndexIVFPQ.cpp
+++ b/gpu/test/TestGpuIndexIVFPQ.cpp
@@ -187,6 +187,41 @@ TEST(TestGpuIndexIVFPQ, Query_IP) {
   }
 }
 
+TEST(TestGpuIndexIVFPQ, Float16Coarse) {
+  Options opt;
+
+  std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
+  std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+
+  faiss::IndexFlatL2 coarseQuantizer(opt.dim);
+  faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids,
+                             opt.codes, opt.bitsPerCode);
+  cpuIndex.nprobe = opt.nprobe;
+  cpuIndex.train(opt.numTrain, trainVecs.data());
+
+  faiss::gpu::StandardGpuResources res;
+  res.noTempMemory();
+
+  faiss::gpu::GpuIndexIVFPQConfig config;
+  config.device = opt.device;
+  config.flatConfig.useFloat16 = true;
+  config.usePrecomputedTables = opt.usePrecomputed;
+  config.indicesOptions = opt.indicesOpt;
+  config.useFloat16LookupTables = opt.useFloat16;
+
+  faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
+  gpuIndex.setNumProbes(opt.nprobe);
+
+  gpuIndex.add(opt.numAdd, addVecs.data());
+  cpuIndex.add(opt.numAdd, addVecs.data());
+
+  faiss::gpu::compareIndices(cpuIndex, gpuIndex,
+                             opt.numQuery, opt.dim, opt.k, opt.toString(),
+                             opt.getCompareEpsilon(),
+                             opt.getPctMaxDiff1(),
+                             opt.getPctMaxDiffN());
+}
+
 TEST(TestGpuIndexIVFPQ, Add_L2) {
   for (int tries = 0; tries < 2; ++tries) {
     Options opt;
diff --git a/gpu/test/test_gpu_index.py b/gpu/test/test_gpu_index.py
index 02ede949b..8b17b4801 100644
--- a/gpu/test/test_gpu_index.py
+++ b/gpu/test/test_gpu_index.py
@@ -98,7 +98,9 @@ class EvalIVFPQAccuracy(unittest.TestCase):
 
             D, Inew = gpu_index.search(xq, 10)
 
-            self.assertGreaterEqual((Iref == Inew).sum(), Iref.size)
+            # 0.99: allow some tolerance in results otherwise test
+            # fails occasionally (not reproducible)
+            self.assertGreaterEqual((Iref == Inew).sum(), Iref.size * 0.99)
 
     def test_cpu_to_gpu_IVFPQ(self):
         self.do_cpu_to_gpu('IVF128,PQ4')
@@ -267,6 +269,45 @@ class TestGPUKmeans(unittest.TestCase):
         assert np.allclose(obj1, obj2)
 
 
+class TestAlternativeDistances(unittest.TestCase):
+
+    def do_test(self, metric, metric_arg=0):
+        res = faiss.StandardGpuResources()
+        d = 32
+        nb = 1000
+        nq = 100
+
+        rs = np.random.RandomState(123)
+        xb = rs.rand(nb, d).astype('float32')
+        xq = rs.rand(nq, d).astype('float32')
+
+        index_ref = faiss.IndexFlat(d, metric)
+        index_ref.metric_arg = metric_arg
+        index_ref.add(xb)
+        Dref, Iref = index_ref.search(xq, 10)
+
+        # build from other index
+        index = faiss.GpuIndexFlat(res, index_ref)
+        Dnew, Inew = index.search(xq, 10)
+        np.testing.assert_array_equal(Inew, Iref)
+        np.testing.assert_allclose(Dnew, Dref, rtol=1e-6)
+
+        #  build from scratch
+        index = faiss.GpuIndexFlat(res, d, metric)
+        index.metric_arg = metric_arg
+        index.add(xb)
+
+        Dnew, Inew = index.search(xq, 10)
+        np.testing.assert_array_equal(Inew, Iref)
+
+    def test_L1(self):
+        self.do_test(faiss.METRIC_L1)
+
+    def test_Linf(self):
+        self.do_test(faiss.METRIC_Linf)
+
+    def test_Lp(self):
+        self.do_test(faiss.METRIC_Lp, 0.7)
 
 
 if __name__ == '__main__':
diff --git a/gpu/utils/MatrixMult-inl.cuh b/gpu/utils/MatrixMult-inl.cuh
new file mode 100644
index 000000000..ede225e03
--- /dev/null
+++ b/gpu/utils/MatrixMult-inl.cuh
@@ -0,0 +1,160 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+
+#pragma once
+
+#include <cublas_v2.h>
+#include <faiss/gpu/utils/Tensor.cuh>
+#include <faiss/gpu/utils/DeviceTensor.cuh>
+#include <faiss/gpu/utils/HostTensor.cuh>
+#include <faiss/gpu/utils/Float16.cuh>
+
+namespace faiss { namespace gpu {
+
+class DeviceMemory;
+
+template <typename T>
+struct GetCudaType;
+
+template <>
+struct GetCudaType<float> {
+  static constexpr cudaDataType_t Type = CUDA_R_32F;
+};
+
+template <>
+struct GetCudaType<half> {
+  static constexpr cudaDataType_t Type = CUDA_R_16F;
+};
+
+template <typename AT, typename BT>
+cublasStatus_t
+rawGemm(cublasHandle_t handle,
+        cublasOperation_t transa,
+        cublasOperation_t transb,
+        int m,
+        int n,
+        int k,
+        const float fAlpha,
+        const AT *A,
+        int lda,
+        const BT *B,
+        int ldb,
+        const float fBeta,
+        float *C,
+        int ldc) {
+  auto cAT = GetCudaType<AT>::Type;
+  auto cBT = GetCudaType<BT>::Type;
+
+  // Always accumulate in f32
+  return cublasSgemmEx(handle, transa, transb, m, n, k,
+                       &fAlpha, A, cAT, lda,
+                       B, cBT, ldb,
+                       &fBeta,
+                       C, CUDA_R_32F, ldc);
+}
+
+template <typename AT, typename BT>
+void
+runMatrixMult(Tensor<float, 2, true>& c, bool transC,
+              Tensor<AT, 2, true>& a, bool transA,
+              Tensor<BT, 2, true>& b, bool transB,
+              float alpha,
+              float beta,
+              cublasHandle_t handle,
+              cudaStream_t stream) {
+  cublasSetStream(handle, stream);
+
+  // Check that we have (m x k) * (k x n) = (m x n)
+  // using the input row-major layout
+  int aM = transA ? a.getSize(1) : a.getSize(0);
+  int aK = transA ? a.getSize(0) : a.getSize(1);
+
+  int bK = transB ? b.getSize(1) : b.getSize(0);
+  int bN = transB ? b.getSize(0) : b.getSize(1);
+
+  int cM = transC ? c.getSize(1) : c.getSize(0);
+  int cN = transC ? c.getSize(0) : c.getSize(1);
+
+  FAISS_ASSERT(aM == cM);
+  FAISS_ASSERT(aK == bK);
+  FAISS_ASSERT(bN == cN);
+
+  FAISS_ASSERT(a.getStride(1) == 1);
+  FAISS_ASSERT(b.getStride(1) == 1);
+  FAISS_ASSERT(c.getStride(1) == 1);
+
+  // Now, we have to represent the matrix multiplication in
+  // column-major layout
+  float* pC = c.data();
+
+  int m = c.getSize(1); // stride 1 size
+  int n = c.getSize(0); // other size
+  int k = transA ? a.getSize(0) : a.getSize(1);
+
+  int lda = transC ? a.getStride(0) : b.getStride(0);
+  int ldb = transC ? b.getStride(0) : a.getStride(0);
+  int ldc = c.getStride(0);
+
+  auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+  auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+  if (transC) {
+    gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T;
+    gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T;
+  }
+
+  cublasStatus_t err;
+
+  if (transC) {
+    err = rawGemm(handle,
+                  gemmTrA, gemmTrB,
+                  m, n, k, alpha,
+                  a.data(), lda, b.data(), ldb, beta,
+                  pC, ldc);
+  } else {
+    err = rawGemm(handle,
+                  gemmTrA, gemmTrB,
+                  m, n, k, alpha,
+                  b.data(), lda, a.data(), ldb, beta,
+                  pC, ldc);
+  }
+
+  FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS,
+                   "cublas failed (%d): "
+                   "(%d, %d)%s x (%d, %d)%s = (%d, %d)%s",
+                   (int) err,
+                   a.getSize(0), a.getSize(1), transA ? "'" : "",
+                   b.getSize(0), b.getSize(1), transB ? "'" : "",
+                   c.getSize(0), c.getSize(1), transC ? "'" : "");
+  CUDA_TEST_ERROR();
+}
+
+template <typename AT, typename BT>
+void runIteratedMatrixMult(Tensor<float, 3, true>& c, bool transC,
+                           Tensor<AT, 3, true>& a, bool transA,
+                           Tensor<BT, 3, true>& b, bool transB,
+                           float alpha,
+                           float beta,
+                           cublasHandle_t handle,
+                           cudaStream_t stream) {
+  FAISS_ASSERT(c.getSize(0) == a.getSize(0));
+  FAISS_ASSERT(a.getSize(0) == b.getSize(0));
+
+  for (int i = 0; i < a.getSize(0); ++i) {
+    auto cView = c[i].view();
+    auto aView = a[i].view();
+    auto bView = b[i].view();
+
+    runMatrixMult(cView, transC,
+                  aView, transA,
+                  bView, transB,
+                  alpha, beta, handle, stream);
+  }
+}
+
+} } // namespace
diff --git a/gpu/utils/MatrixMult.cu b/gpu/utils/MatrixMult.cu
index 31e6b2c2c..2afb5017b 100644
--- a/gpu/utils/MatrixMult.cu
+++ b/gpu/utils/MatrixMult.cu
@@ -8,176 +8,9 @@
 
 #include <faiss/gpu/utils/MatrixMult.cuh>
 #include <faiss/gpu/utils/DeviceMemory.h>
-#include <faiss/gpu/utils/DeviceUtils.h>
-#include <faiss/gpu/utils/Float16.cuh>
-#include <faiss/gpu/utils/DeviceTensor.cuh>
-#include <faiss/gpu/utils/HostTensor.cuh>
 
 namespace faiss { namespace gpu {
 
-template <typename T>
-struct CublasGemm {
-};
-
-template <>
-struct CublasGemm<float> {
-  static cublasStatus_t gemm(cublasHandle_t handle,
-                             cublasOperation_t transa,
-                             cublasOperation_t transb,
-                             int m,
-                             int n,
-                             int k,
-                             float fAlpha,
-                             const float *A,
-                             int lda,
-                             const float *B,
-                             int ldb,
-                             float fBeta,
-                             float *C,
-                             int ldc) {
-    return cublasSgemm(handle, transa, transb, m, n, k,
-                       &fAlpha, A, lda, B, ldb, &fBeta, C, ldc);
-  }
-};
-
-template <>
-struct CublasGemm<half> {
-  static cublasStatus_t gemm(cublasHandle_t handle,
-                             cublasOperation_t transa,
-                             cublasOperation_t transb,
-                             int m,
-                             int n,
-                             int k,
-                             const float fAlpha,
-                             const half *A,
-                             int lda,
-                             const half *B,
-                             int ldb,
-                             const float fBeta,
-                             float *C,
-                             int ldc) {
-    // Always accumulate in f32
-    return cublasSgemmEx(handle, transa, transb, m, n, k,
-                         &fAlpha, A, CUDA_R_16F, lda,
-                         B, CUDA_R_16F, ldb,
-                         &fBeta,
-                         C, CUDA_R_32F, ldc);
-  }
-};
-
-template <typename T>
-void
-runMatrixMult(Tensor<float, 2, true>& c, bool transC,
-              Tensor<T, 2, true>& a, bool transA,
-              Tensor<T, 2, true>& b, bool transB,
-              float alpha,
-              float beta,
-              cublasHandle_t handle,
-              cudaStream_t stream) {
-  cublasSetStream(handle, stream);
-
-  // Check that we have (m x k) * (k x n) = (m x n)
-  // using the input row-major layout
-  int aM = transA ? a.getSize(1) : a.getSize(0);
-  int aK = transA ? a.getSize(0) : a.getSize(1);
-
-  int bK = transB ? b.getSize(1) : b.getSize(0);
-  int bN = transB ? b.getSize(0) : b.getSize(1);
-
-  int cM = transC ? c.getSize(1) : c.getSize(0);
-  int cN = transC ? c.getSize(0) : c.getSize(1);
-
-  FAISS_ASSERT(aM == cM);
-  FAISS_ASSERT(aK == bK);
-  FAISS_ASSERT(bN == cN);
-
-  FAISS_ASSERT(a.getStride(1) == 1);
-  FAISS_ASSERT(b.getStride(1) == 1);
-  FAISS_ASSERT(c.getStride(1) == 1);
-
-  // Now, we have to represent the matrix multiplication in
-  // column-major layout
-  T* pA = transC ? a.data() : b.data();
-  T* pB = transC ? b.data() : a.data();
-  float* pC = c.data();
-
-  int m = c.getSize(1); // stride 1 size
-  int n = c.getSize(0); // other size
-  int k = transA ? a.getSize(0) : a.getSize(1);
-
-  int lda = transC ? a.getStride(0) : b.getStride(0);
-  int ldb = transC ? b.getStride(0) : a.getStride(0);
-  int ldc = c.getStride(0);
-
-  auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-  auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-  if (transC) {
-    gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T;
-    gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T;
-  }
-
-  auto err = CublasGemm<T>::gemm(handle,
-                                 gemmTrA, gemmTrB,
-                                 m, n, k, alpha,
-                                 pA, lda, pB, ldb, beta,
-                                 pC, ldc);
-
-  FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS,
-                   "cublas failed (%d): "
-                   "(%d, %d)%s x (%d, %d)%s = (%d, %d)%s",
-                   (int) err,
-                   a.getSize(0), a.getSize(1), transA ? "'" : "",
-                   b.getSize(0), b.getSize(1), transB ? "'" : "",
-                   c.getSize(0), c.getSize(1), transC ? "'" : "");
-  CUDA_TEST_ERROR();
-}
-
-void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
-                   Tensor<float, 2, true>& a, bool transA,
-                   Tensor<float, 2, true>& b, bool transB,
-                   float alpha,
-                   float beta,
-                   cublasHandle_t handle,
-                   cudaStream_t stream) {
-  return runMatrixMult<float>(c, transC, a, transA, b, transB,
-                              alpha, beta, handle, stream);
-}
-
-void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
-                   Tensor<half, 2, true>& a, bool transA,
-                   Tensor<half, 2, true>& b, bool transB,
-                   float alpha,
-                   float beta,
-                   cublasHandle_t handle,
-                   cudaStream_t stream) {
-  return runMatrixMult<half>(c, transC, a, transA, b, transB,
-                             alpha, beta, handle, stream);
-}
-
-void
-runIteratedMatrixMult(Tensor<float, 3, true>& c, bool transC,
-                      Tensor<float, 3, true>& a, bool transA,
-                      Tensor<float, 3, true>& b, bool transB,
-                      float alpha,
-                      float beta,
-                      cublasHandle_t handle,
-                      cudaStream_t stream) {
-  FAISS_ASSERT(c.getSize(0) == a.getSize(0));
-  FAISS_ASSERT(a.getSize(0) == b.getSize(0));
-
-  for (int i = 0; i < a.getSize(0); ++i) {
-    auto cView = c[i].view();
-    auto aView = a[i].view();
-    auto bView = b[i].view();
-
-    runMatrixMult(cView, transC,
-                  aView, transA,
-                  bView, transB,
-                  alpha, beta, handle, stream);
-  }
-}
-
 void
 runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
                    Tensor<float, 3, true>& a, bool transA,
diff --git a/gpu/utils/MatrixMult.cuh b/gpu/utils/MatrixMult.cuh
index 548457f2c..eeb11ccc5 100644
--- a/gpu/utils/MatrixMult.cuh
+++ b/gpu/utils/MatrixMult.cuh
@@ -10,6 +10,9 @@
 
 #include <cublas_v2.h>
 #include <faiss/gpu/utils/Tensor.cuh>
+#include <faiss/gpu/utils/DeviceTensor.cuh>
+#include <faiss/gpu/utils/HostTensor.cuh>
+#include <faiss/gpu/utils/Float16.cuh>
 
 namespace faiss { namespace gpu {
 
@@ -17,30 +20,23 @@ class DeviceMemory;
 
 /// C = alpha * A * B + beta * C
 /// Expects row major layout, not fortran/blas column major!
-void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
-                   Tensor<float, 2, true>& a, bool transA,
-                   Tensor<float, 2, true>& b, bool transB,
-                   float alpha,
-                   float beta,
-                   cublasHandle_t handle,
-                   cudaStream_t stream);
-
-/// C = alpha * A * B + beta * C
-/// Expects row major layout, not fortran/blas column major!
-void runMatrixMult(Tensor<float, 2, true>& c, bool transC,
-                   Tensor<half, 2, true>& a, bool transA,
-                   Tensor<half, 2, true>& b, bool transB,
-                   float alpha,
-                   float beta,
-                   cublasHandle_t handle,
-                   cudaStream_t stream);
+template <typename AT, typename BT>
+void
+runMatrixMult(Tensor<float, 2, true>& c, bool transC,
+              Tensor<AT, 2, true>& a, bool transA,
+              Tensor<BT, 2, true>& b, bool transB,
+              float alpha,
+              float beta,
+              cublasHandle_t handle,
+              cudaStream_t stream);
 
 /// C_i = alpha * A_i * B_i + beta * C_i
 /// where `i` is the outermost dimension, via iterated gemm
 /// Expects row major layout, not fortran/blas column major!
+template <typename AT, typename BT>
 void runIteratedMatrixMult(Tensor<float, 3, true>& c, bool transC,
-                           Tensor<float, 3, true>& a, bool transA,
-                           Tensor<float, 3, true>& b, bool transB,
+                           Tensor<AT, 3, true>& a, bool transA,
+                           Tensor<BT, 3, true>& b, bool transB,
                            float alpha,
                            float beta,
                            cublasHandle_t handle,
@@ -59,3 +55,5 @@ void runBatchMatrixMult(Tensor<float, 3, true>& c, bool transC,
                         cudaStream_t stream);
 
 } } // namespace
+
+#include <faiss/gpu/utils/MatrixMult-inl.cuh>
diff --git a/impl/AuxIndexStructures.h b/impl/AuxIndexStructures.h
index 840e191ce..c82b9ed56 100644
--- a/impl/AuxIndexStructures.h
+++ b/impl/AuxIndexStructures.h
@@ -51,9 +51,7 @@ struct RangeSearchResult {
 };
 
 
-/**
-
- Encapsulates a set of ids to remove. */
+/** Encapsulates a set of ids to remove. */
 struct IDSelector {
     typedef Index::idx_t idx_t;
     virtual bool is_member (idx_t id) const = 0;
diff --git a/impl/PolysemousTraining.h b/impl/PolysemousTraining.h
index cf511a74c..c27a48c99 100644
--- a/impl/PolysemousTraining.h
+++ b/impl/PolysemousTraining.h
@@ -123,15 +123,15 @@ struct PolysemousTraining: SimulatedAnnealingParameters {
     enum Optimization_type_t {
         OT_None,
         OT_ReproduceDistances_affine,  ///< default
-        OT_Ranking_weighted_diff  /// same as _2, but use rank of y+ - rank of y-
+        OT_Ranking_weighted_diff       ///< same as _2, but use rank of y+ - rank of y-
     };
     Optimization_type_t optimization_type;
 
-    // use 1/4 of the training points for the optimization, with
-    // max. ntrain_permutation. If ntrain_permutation == 0: train on
-    // centroids
+    /** use 1/4 of the training points for the optimization, with
+     * max. ntrain_permutation. If ntrain_permutation == 0: train on
+     * centroids */
     int ntrain_permutation;
-    double dis_weight_factor; // decay of exp that weights distance loss
+    double dis_weight_factor; ///< decay of exp that weights distance loss
 
     // filename pattern for the logging of iterations
     std::string log_pattern;
diff --git a/impl/index_read.cpp b/impl/index_read.cpp
index 8e582e2df..cab9dc642 100644
--- a/impl/index_read.cpp
+++ b/impl/index_read.cpp
@@ -19,6 +19,7 @@
 
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/impl/io.h>
+#include <faiss/utils/hamming.h>
 
 #include <faiss/IndexFlat.h>
 #include <faiss/VectorTransform.h>
@@ -41,6 +42,7 @@
 #include <faiss/IndexBinaryFromFloat.h>
 #include <faiss/IndexBinaryHNSW.h>
 #include <faiss/IndexBinaryIVF.h>
+#include <faiss/IndexBinaryHash.h>
 
 
 
@@ -752,6 +754,56 @@ static void read_binary_ivf_header (
     read_direct_map (&ivf->direct_map, f);
 }
 
+static void read_binary_hash_invlists (
+        IndexBinaryHash::InvertedListMap &invlists,
+        int b, IOReader *f)
+{
+    size_t sz;
+    READ1 (sz);
+    int il_nbit = 0;
+    READ1 (il_nbit);
+    // buffer for bitstrings
+    std::vector<uint8_t> buf((b + il_nbit) * sz);
+    READVECTOR (buf);
+    BitstringReader rd (buf.data(), buf.size());
+    invlists.reserve (sz);
+    for (size_t i = 0; i < sz; i++) {
+        uint64_t hash = rd.read(b);
+        uint64_t ilsz = rd.read(il_nbit);
+        auto & il = invlists[hash];
+        READVECTOR (il.ids);
+        FAISS_THROW_IF_NOT (il.ids.size() == ilsz);
+        READVECTOR (il.vecs);
+    }
+}
+
+static void read_binary_multi_hash_map(
+        IndexBinaryMultiHash::Map &map,
+        int b, size_t ntotal,
+        IOReader *f)
+{
+    int id_bits;
+    size_t sz;
+    READ1 (id_bits);
+    READ1 (sz);
+    std::vector<uint8_t> buf;
+    READVECTOR (buf);
+    size_t nbit = (b + id_bits) * sz + ntotal * id_bits;
+    FAISS_THROW_IF_NOT (buf.size() == (nbit + 7) / 8);
+    BitstringReader rd (buf.data(), buf.size());
+    map.reserve (sz);
+    for (size_t i = 0; i < sz; i++) {
+        uint64_t hash = rd.read(b);
+        uint64_t ilsz = rd.read(id_bits);
+        auto & il = map[hash];
+        for (size_t j = 0; j < ilsz; j++) {
+            il.push_back (rd.read (id_bits));
+        }
+    }
+}
+
+
+
 IndexBinary *read_index_binary (IOReader *f, int io_flags) {
     IndexBinary * idx = nullptr;
     uint32_t h;
@@ -793,6 +845,28 @@ IndexBinary *read_index_binary (IOReader *f, int io_flags) {
             static_cast<IndexBinaryIDMap2*>(idxmap)->construct_rev_map ();
         }
         idx = idxmap;
+    } else if(h == fourcc("IBHh")) {
+        IndexBinaryHash *idxh = new IndexBinaryHash ();
+        read_index_binary_header (idxh, f);
+        READ1 (idxh->b);
+        READ1 (idxh->nflip);
+        read_binary_hash_invlists(idxh->invlists, idxh->b, f);
+        idx = idxh;
+    } else if(h == fourcc("IBHm")) {
+        IndexBinaryMultiHash* idxmh = new IndexBinaryMultiHash ();
+        read_index_binary_header (idxmh, f);
+        idxmh->storage = dynamic_cast<IndexBinaryFlat*> (read_index_binary (f));
+        FAISS_THROW_IF_NOT(idxmh->storage && idxmh->storage->ntotal == idxmh->ntotal);
+        idxmh->own_fields = true;
+        READ1 (idxmh->b);
+        READ1 (idxmh->nhash);
+        READ1 (idxmh->nflip);
+        idxmh->maps.resize (idxmh->nhash);
+        for (int i = 0; i < idxmh->nhash; i++) {
+            read_binary_multi_hash_map(
+                    idxmh->maps[i], idxmh->b, idxmh->ntotal, f);
+        }
+        idx = idxmh;
     } else {
         FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
         idx = nullptr;
diff --git a/impl/index_write.cpp b/impl/index_write.cpp
index 4818f13c4..46cc2a68c 100644
--- a/impl/index_write.cpp
+++ b/impl/index_write.cpp
@@ -19,6 +19,7 @@
 
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/impl/io.h>
+#include <faiss/utils/hamming.h>
 
 #include <faiss/IndexFlat.h>
 #include <faiss/VectorTransform.h>
@@ -41,6 +42,7 @@
 #include <faiss/IndexBinaryFromFloat.h>
 #include <faiss/IndexBinaryHNSW.h>
 #include <faiss/IndexBinaryIVF.h>
+#include <faiss/IndexBinaryHash.h>
 
 
 
@@ -515,6 +517,67 @@ static void write_binary_ivf_header (const IndexBinaryIVF *ivf, IOWriter *f) {
     write_direct_map (&ivf->direct_map, f);
 }
 
+static void write_binary_hash_invlists (
+        const IndexBinaryHash::InvertedListMap &invlists,
+        int b, IOWriter *f)
+{
+    size_t sz = invlists.size();
+    WRITE1 (sz);
+    size_t maxil = 0;
+    for (auto it = invlists.begin(); it != invlists.end(); ++it) {
+        if(it->second.ids.size() > maxil) {
+            maxil = it->second.ids.size();
+        }
+    }
+    int il_nbit = 0;
+    while(maxil >= ((uint64_t)1 << il_nbit)) {
+        il_nbit++;
+    }
+    WRITE1(il_nbit);
+
+    // first write sizes then data, may be useful if we want to
+    // memmap it at some point
+
+    // buffer for bitstrings
+    std::vector<uint8_t> buf (((b + il_nbit) * sz + 7) / 8);
+    BitstringWriter wr (buf.data(), buf.size());
+    for (auto it = invlists.begin(); it != invlists.end(); ++it) {
+        wr.write (it->first, b);
+        wr.write (it->second.ids.size(), il_nbit);
+    }
+    WRITEVECTOR (buf);
+
+    for (auto it = invlists.begin(); it != invlists.end(); ++it) {
+        WRITEVECTOR (it->second.ids);
+        WRITEVECTOR (it->second.vecs);
+    }
+}
+
+static void write_binary_multi_hash_map(
+        const IndexBinaryMultiHash::Map &map,
+        int b, size_t ntotal,
+        IOWriter *f)
+{
+    int id_bits = 0;
+    while ((ntotal > ((Index::idx_t)1 << id_bits))) {
+        id_bits++;
+    }
+    WRITE1(id_bits);
+    size_t sz = map.size();
+    WRITE1(sz);
+    size_t nbit = (b + id_bits) * sz + ntotal * id_bits;
+    std::vector<uint8_t> buf((nbit + 7) / 8);
+    BitstringWriter wr (buf.data(), buf.size());
+    for (auto it = map.begin(); it != map.end(); ++it) {
+        wr.write(it->first, b);
+        wr.write(it->second.size(), id_bits);
+        for (auto id : it->second) {
+            wr.write(id, id_bits);
+        }
+    }
+    WRITEVECTOR (buf);
+}
+
 void write_index_binary (const IndexBinary *idx, IOWriter *f) {
     if (const IndexBinaryFlat *idxf =
         dynamic_cast<const IndexBinaryFlat *> (idx)) {
@@ -551,6 +614,27 @@ void write_index_binary (const IndexBinary *idx, IOWriter *f) {
         write_index_binary_header (idxmap, f);
         write_index_binary (idxmap->index, f);
         WRITEVECTOR (idxmap->id_map);
+    } else if (const IndexBinaryHash *idxh =
+               dynamic_cast<const IndexBinaryHash *> (idx)) {
+        uint32_t h = fourcc ("IBHh");
+        WRITE1 (h);
+        write_index_binary_header (idxh, f);
+        WRITE1 (idxh->b);
+        WRITE1 (idxh->nflip);
+        write_binary_hash_invlists(idxh->invlists, idxh->b, f);
+    } else if (const IndexBinaryMultiHash *idxmh =
+               dynamic_cast<const IndexBinaryMultiHash *> (idx)) {
+        uint32_t h = fourcc ("IBHm");
+        WRITE1 (h);
+        write_index_binary_header (idxmh, f);
+        write_index_binary (idxmh->storage, f);
+        WRITE1 (idxmh->b);
+        WRITE1 (idxmh->nhash);
+        WRITE1 (idxmh->nflip);
+        for (int i = 0; i < idxmh->nhash; i++) {
+            write_binary_multi_hash_map(
+                    idxmh->maps[i], idxmh->b, idxmh->ntotal, f);
+        }
     } else {
         FAISS_THROW_MSG ("don't know how to serialize this type of index");
     }
diff --git a/impl/io.cpp b/impl/io.cpp
index e8ffca6bc..0954f3c1f 100644
--- a/impl/io.cpp
+++ b/impl/io.cpp
@@ -37,7 +37,6 @@ int IOWriter::fileno ()
  ***********************************************************************/
 
 
-
 size_t VectorIOWriter::operator()(
                 const void *ptr, size_t size, size_t nitems)
 {
@@ -132,6 +131,117 @@ int FileIOWriter::fileno()  {
     return ::fileno (f);
 }
 
+/***********************************************************************
+ * IO buffer
+ ***********************************************************************/
+
+BufferedIOReader::BufferedIOReader(IOReader *reader, size_t bsz, size_t totsz):
+    reader(reader), bsz(bsz), totsz(totsz), ofs(0), b0(0), b1(0), buffer(bsz)
+{
+}
+
+
+size_t BufferedIOReader::operator()(void *ptr, size_t unitsize, size_t nitems)
+{
+    size_t size = unitsize * nitems;
+    if (size == 0) return 0;
+    char * dst = (char*)ptr;
+    size_t nb;
+
+    { // first copy available bytes
+        nb = std::min(b1 - b0, size);
+        memcpy (dst, buffer.data() + b0, nb);
+        b0 += nb;
+        dst += nb;
+        size -= nb;
+    }
+
+    if (size > totsz - ofs) {
+        size = totsz - ofs;
+    }
+    // while we would like to have more data
+    while (size > 0) {
+        assert (b0 == b1); // buffer empty on input
+        // try to read from main reader
+        b0 = 0;
+        b1 = (*reader)(buffer.data(), 1, std::min(bsz, size));
+
+        if (b1 == 0) {
+            // no more bytes available
+            break;
+        }
+        ofs += b1;
+
+        // copy remaining bytes
+        size_t nb2 = std::min(b1, size);
+        memcpy (dst, buffer.data(), nb2);
+        b0 = nb2;
+        nb += nb2;
+        dst += nb2;
+        size -= nb2;
+    }
+    return nb / unitsize;
+}
+
+
+BufferedIOWriter::BufferedIOWriter(IOWriter *writer, size_t bsz):
+    writer(writer), bsz(bsz), b0(0), buffer(bsz)
+{
+}
+
+size_t BufferedIOWriter::operator()(const void *ptr, size_t unitsize, size_t nitems)
+{
+    size_t size = unitsize * nitems;
+    if (size == 0) return 0;
+    const char * src = (const char*)ptr;
+    size_t nb;
+
+    { // copy as many bytes as possible to buffer
+        nb = std::min(bsz - b0, size);
+        memcpy (buffer.data() + b0, src, nb);
+        b0 += nb;
+        src += nb;
+        size -= nb;
+    }
+    while (size > 0) {
+        assert(b0 == bsz);
+        // now we need to flush to add more bytes
+        size_t ofs = 0;
+        do {
+            assert (ofs < 10000000);
+            size_t written = (*writer)(buffer.data() + ofs, 1, bsz - ofs);
+            FAISS_THROW_IF_NOT(written > 0);
+            ofs += written;
+        } while(ofs != bsz);
+
+        // copy src to buffer
+        size_t nb1 = std::min(bsz, size);
+        memcpy (buffer.data(), src, nb1);
+        b0 = nb1;
+        nb += nb1;
+        src += nb1;
+        size -= nb1;
+    }
+
+    return nb / unitsize;
+}
+
+BufferedIOWriter::~BufferedIOWriter()
+{
+    size_t ofs = 0;
+    while(ofs != b0) {
+        printf("Destructor write %ld \n", b0 - ofs);
+        size_t written = (*writer)(buffer.data() + ofs, 1, b0 - ofs);
+        FAISS_THROW_IF_NOT(written > 0);
+        ofs += written;
+    }
+
+}
+
+
+
+
+
 uint32_t fourcc (const char sx[4]) {
     assert(4 == strlen(sx));
     const unsigned char *x = (unsigned char*)sx;
diff --git a/impl/io.h b/impl/io.h
index 173d87da6..a3a565af2 100644
--- a/impl/io.h
+++ b/impl/io.h
@@ -9,6 +9,9 @@
 
 /***********************************************************
  * Abstract I/O objects
+ *
+ * I/O is always sequential, seek does not need to be supported
+ * (indexes could be read or written to a pipe).
  ***********************************************************/
 
 #pragma once
@@ -92,6 +95,41 @@ struct FileIOWriter: IOWriter {
     int fileno() override;
 };
 
+/*******************************************************
+ * Buffered reader + writer
+ *******************************************************/
+
+
+
+/** wraps an ioreader to make buffered reads to avoid too small reads */
+struct BufferedIOReader: IOReader {
+
+    IOReader *reader;
+    size_t bsz, totsz, ofs;
+    size_t b0, b1; ///< range of available bytes in the buffer
+    std::vector<char> buffer;
+
+    BufferedIOReader(IOReader *reader, size_t bsz,
+                     size_t totsz=(size_t)(-1));
+
+    size_t operator()(void *ptr, size_t size, size_t nitems) override;
+};
+
+struct BufferedIOWriter: IOWriter {
+
+    IOWriter *writer;
+    size_t bsz, ofs;
+    size_t b0; ///< amount of data in buffer
+    std::vector<char> buffer;
+
+    BufferedIOWriter(IOWriter *writer, size_t bsz);
+
+    size_t operator()(const void *ptr, size_t size, size_t nitems) override;
+
+    // flushes
+    ~BufferedIOWriter();
+};
+
 /// cast a 4-character string to a uint32_t that can be written and read easily
 uint32_t fourcc (const char sx[4]);
 
diff --git a/python/faiss.py b/python/faiss.py
index 163e2cb96..2d58b7f70 100644
--- a/python/faiss.py
+++ b/python/faiss.py
@@ -283,6 +283,18 @@ def handle_IndexBinary(the_class):
                       swig_ptr(labels))
         return distances, labels
 
+    def replacement_range_search(self, x, thresh):
+        n, d = x.shape
+        assert d * 8 == self.d
+        res = RangeSearchResult(n)
+        self.range_search_c(n, swig_ptr(x), thresh, res)
+        # get pointers and copy them
+        lims = rev_swig_ptr(res.lims, n + 1).copy()
+        nd = int(lims[-1])
+        D = rev_swig_ptr(res.distances, nd).copy()
+        I = rev_swig_ptr(res.labels, nd).copy()
+        return lims, D, I
+
     def replacement_remove_ids(self, x):
         if isinstance(x, IDSelector):
             sel = x
@@ -295,6 +307,7 @@ def handle_IndexBinary(the_class):
     replace_method(the_class, 'add_with_ids', replacement_add_with_ids)
     replace_method(the_class, 'train', replacement_train)
     replace_method(the_class, 'search', replacement_search)
+    replace_method(the_class, 'range_search', replacement_range_search)
     replace_method(the_class, 'reconstruct', replacement_reconstruct)
     replace_method(the_class, 'remove_ids', replacement_remove_ids)
 
@@ -461,6 +474,9 @@ add_ref_in_constructor(IndexBinaryIDMap2, 0)
 add_ref_in_method(IndexReplicas, 'addIndex', 0)
 add_ref_in_method(IndexBinaryReplicas, 'addIndex', 0)
 
+add_ref_in_constructor(BufferedIOWriter, 0)
+add_ref_in_constructor(BufferedIOReader, 0)
+
 # seems really marginal...
 # remove_ref_from_method(IndexReplicas, 'removeIndex', 0)
 
@@ -751,9 +767,24 @@ def deserialize_index(data):
     copy_array_to_vector(data, reader.data)
     return read_index(reader)
 
+def serialize_index_binary(index):
+    """ convert an index to a numpy uint8 array  """
+    writer = VectorIOWriter()
+    write_index_binary(index, writer)
+    return vector_to_array(writer.data)
+
+def deserialize_index_binary(data):
+    reader = VectorIOReader()
+    copy_array_to_vector(data, reader.data)
+    return read_index_binary(reader)
+
+
+###########################################
+# ResultHeap
+###########################################
 
 class ResultHeap:
-    """Combine query results from a sliced dataset. The final result will
+    """Accumulate query results from a sliced dataset. The final result will
     be in self.D, self.I."""
 
     def __init__(self, nq, k):
diff --git a/python/setup.py b/python/setup.py
index a5ebdbc84..89b6d398c 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -32,7 +32,7 @@ are implemented on the GPU. It is developed by Facebook AI Research.
 """
 setup(
     name='faiss',
-    version='1.6.2',
+    version='1.6.3',
     description='A library for efficient similarity search and clustering of dense vectors',
     long_description=long_description,
     url='https://github.com/facebookresearch/faiss',
diff --git a/python/swigfaiss.swig b/python/swigfaiss.swig
index 9d09b3720..b0d8b8173 100644
--- a/python/swigfaiss.swig
+++ b/python/swigfaiss.swig
@@ -93,6 +93,7 @@ extern "C" {
 #include <faiss/IndexBinaryIVF.h>
 #include <faiss/IndexBinaryFromFloat.h>
 #include <faiss/IndexBinaryHNSW.h>
+#include <faiss/IndexBinaryHash.h>
 
 #include <faiss/impl/io.h>
 #include <faiss/index_io.h>
@@ -359,6 +360,7 @@ void gpu_sync_all_devices()
 %include  <faiss/IndexBinaryIVF.h>
 %include  <faiss/IndexBinaryFromFloat.h>
 %include  <faiss/IndexBinaryHNSW.h>
+%include  <faiss/IndexBinaryHash.h>
 
 
 
@@ -979,6 +981,124 @@ struct MapLong2Long {
 
 %}
 
+/*******************************************************************
+ * Support I/O to arbitrary functions
+ *******************************************************************/
+
+
+%inline %{
+
+#ifdef SWIGPYTHON
+
+
+struct PyCallbackIOWriter: faiss::IOWriter {
+
+    PyObject * callback;
+    size_t bs; // maximum write size
+
+    PyCallbackIOWriter(PyObject *callback,
+                       size_t bs = 1024 * 1024):
+        callback(callback), bs(bs) {
+        Py_INCREF(callback);
+        name = "PyCallbackIOWriter";
+    }
+
+    size_t operator()(const void *ptrv, size_t size, size_t nitems) override {
+        size_t ws = size * nitems;
+        const char *ptr = (const char*)ptrv;
+        PyGILState_STATE gstate;
+        gstate = PyGILState_Ensure();
+        while(ws > 0) {
+            size_t wi = ws > bs ? bs : ws;
+            PyObject* bo = PyBytes_FromStringAndSize(ptr, wi);
+            PyObject *arglist = Py_BuildValue("(N)", bo);
+            if(!arglist) {
+                PyGILState_Release(gstate);
+                return 0;
+            }
+            ptr += wi;
+            ws -= wi;
+            PyObject * result = PyObject_CallObject(callback, arglist);
+            Py_DECREF(arglist);
+            if (result == NULL) {
+                PyGILState_Release(gstate);
+                return 0;
+            }
+            Py_DECREF(result);
+        }
+        PyGILState_Release(gstate);
+        return nitems;
+    }
+
+    ~PyCallbackIOWriter() {
+        Py_DECREF(callback);
+    }
+
+};
+
+struct PyCallbackIOReader: faiss::IOReader {
+
+    PyObject * callback;
+    size_t bs; // maximum buffer size
+
+    PyCallbackIOReader(PyObject *callback,
+                       size_t bs = 1024 * 1024):
+        callback(callback), bs(bs) {
+        Py_INCREF(callback);
+        name = "PyCallbackIOReader";
+    }
+
+    size_t operator()(void *ptrv, size_t size, size_t nitems) override {
+        size_t rs = size * nitems;
+        char *ptr = (char*)ptrv;
+        PyGILState_STATE gstate;
+        gstate = PyGILState_Ensure();
+        while(rs > 0) {
+            size_t ri = rs > bs ? bs : rs;
+            PyObject *arglist = Py_BuildValue("(n)", ri);
+            PyObject * result = PyObject_CallObject(callback, arglist);
+            Py_DECREF(arglist);
+            if (result == NULL) {
+                PyGILState_Release(gstate);
+                return 0;
+            }
+            if(!PyBytes_Check(result)) {
+                Py_DECREF(result);
+                PyErr_SetString(PyExc_RuntimeError,
+                                "read callback did not return a bytes object");
+                PyGILState_Release(gstate);
+                throw faiss::FaissException("reader error");
+            }
+            size_t sz = PyBytes_Size(result);
+            if (sz == 0 || sz > rs) {
+                Py_DECREF(result);
+                PyErr_Format(PyExc_RuntimeError,
+                             "read callback returned %ld bytes (asked %ld)",
+                             sz, rs);
+                PyGILState_Release(gstate);
+                throw faiss::FaissException("reader error");
+            }
+            memcpy(ptr, PyBytes_AsString(result), sz);
+            Py_DECREF(result);
+            ptr += sz;
+            rs -= sz;
+        }
+        PyGILState_Release(gstate);
+        return nitems;
+    }
+
+    ~PyCallbackIOReader() {
+        Py_DECREF(callback);
+    }
+
+};
+
+#endif
+
+%}
+
+
+
 %inline %{
     void wait() {
         // in gdb, use return to get out of this function
diff --git a/tests/common.py b/tests/common.py
index 290e317c3..8621dd822 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -97,3 +97,32 @@ def get_dataset_2(d, nt, nb, nq):
     x = np.sin(x)
     x = x.astype('float32')
     return x[:nt], x[nt:nt + nb], x[nt + nb:]
+
+
+def make_binary_dataset(d, nt, nb, nq):
+    assert d % 8 == 0
+    rs = np.random.RandomState(123)
+    x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8')
+    return x[:nt], x[nt:-nq], x[-nq:]
+
+
+def compare_binary_result_lists(D1, I1, D2, I2):
+    """comparing result lists is difficult because there are many
+    ties. Here we sort by (distance, index) pairs and ignore the largest
+    distance of each result. Compatible result lists should pass this."""
+    assert D1.shape == I1.shape == D2.shape == I2.shape
+    n, k = D1.shape
+    ndiff = (D1 != D2).sum()
+    assert ndiff == 0, '%d differences in distance matrix %s' % (
+        ndiff, D1.shape)
+
+    def normalize_DI(D, I):
+        norm = I.max() + 1.0
+        Dr = D.astype('float64') + I / norm
+        # ignore -1s and elements on last column
+        Dr[I1 == -1] = 1e20
+        Dr[D == D[:, -1:]] = 1e20
+        Dr.sort(axis=1)
+        return Dr
+    ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum()
+    assert ndiff == 0, '%d differences in normalized D matrix' % ndiff
diff --git a/tests/test_binary_hashindex.py b/tests/test_binary_hashindex.py
new file mode 100644
index 000000000..1ee5a5f7d
--- /dev/null
+++ b/tests/test_binary_hashindex.py
@@ -0,0 +1,183 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+#!/usr/bin/env python3
+
+import unittest
+import numpy as np
+import faiss
+
+from common import make_binary_dataset
+
+
+def bitvec_shuffle(a, order):
+    n, d = a.shape
+    db, = order.shape
+    b = np.empty((n, db // 8), dtype='uint8')
+    faiss.bitvec_shuffle(
+        n, d * 8, db,
+        faiss.swig_ptr(order),
+        faiss.swig_ptr(a), faiss.swig_ptr(b))
+    return b
+
+
+class TestSmallFuncs(unittest.TestCase):
+
+    def test_shuffle(self):
+        d = 256
+        n = 1000
+        rs = np.random.RandomState(123)
+        o = rs.permutation(d).astype('int32')
+
+        x = rs.randint(256, size=(n, d // 8)).astype('uint8')
+
+        y1 = bitvec_shuffle(x, o[:128])
+        y2 = bitvec_shuffle(x, o[128:])
+        y = np.hstack((y1, y2))
+
+        oinv = np.empty(d, dtype='int32')
+        oinv[o] = np.arange(d)
+        z = bitvec_shuffle(y, oinv)
+
+        np.testing.assert_array_equal(x, z)
+
+
+class TestRange(unittest.TestCase):
+
+    def test_hash(self):
+        d = 128
+        nq = 100
+        nb = 2000
+
+        (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)
+
+        index_ref = faiss.IndexBinaryFlat(d)
+        index_ref.add(xb)
+
+        radius = 55
+
+        Lref, Dref, Iref = index_ref.range_search(xq, radius)
+
+        print("nb res: ", Lref[-1])
+
+        index = faiss.IndexBinaryHash(d, 10)
+        index.add(xb)
+        # index.display()
+        nfound = []
+        ndis = []
+        stats = faiss.cvar.indexBinaryHash_stats
+        for n_bitflips in range(index.b + 1):
+            index.nflip = n_bitflips
+            stats.reset()
+            Lnew, Dnew, Inew = index.range_search(xq, radius)
+            for i in range(nq):
+                ref = Iref[Lref[i]:Lref[i + 1]]
+                new = Inew[Lnew[i]:Lnew[i + 1]]
+                snew = set(new)
+                # no duplicates
+                self.assertTrue(len(new) == len(snew))
+                # subset of real results
+                self.assertTrue(snew <= set(ref))
+            nfound.append(Lnew[-1])
+            ndis.append(stats.ndis)
+        print('nfound=', nfound)
+        print('ndis=', ndis)
+        nfound = np.array(nfound)
+        self.assertTrue(nfound[-1] == Lref[-1])
+        self.assertTrue(np.all(nfound[1:] >= nfound[:-1]))
+
+    def test_multihash(self):
+        d = 128
+        nq = 100
+        nb = 2000
+
+        (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)
+
+        index_ref = faiss.IndexBinaryFlat(d)
+        index_ref.add(xb)
+
+        radius = 55
+
+        Lref, Dref, Iref = index_ref.range_search(xq, radius)
+
+        print("nb res: ", Lref[-1])
+
+        nfound = []
+        ndis = []
+
+        for nh in 1, 3, 5:
+            index = faiss.IndexBinaryMultiHash(d, nh, 10)
+            index.add(xb)
+            # index.display()
+            stats = faiss.cvar.indexBinaryHash_stats
+            index.nflip = 2
+            stats.reset()
+            Lnew, Dnew, Inew = index.range_search(xq, radius)
+            for i in range(nq):
+                ref = Iref[Lref[i]:Lref[i + 1]]
+                new = Inew[Lnew[i]:Lnew[i + 1]]
+                snew = set(new)
+                # no duplicates
+                self.assertTrue(len(new) == len(snew))
+                # subset of real results
+                self.assertTrue(snew <= set(ref))
+            nfound.append(Lnew[-1])
+            ndis.append(stats.ndis)
+        print('nfound=', nfound)
+        print('ndis=', ndis)
+        nfound = np.array(nfound)
+        # self.assertTrue(nfound[-1] == Lref[-1])
+        self.assertTrue(np.all(nfound[1:] >= nfound[:-1]))
+
+
+class TestKnn(unittest.TestCase):
+
+    def test_hash_and_multihash(self):
+        d = 128
+        nq = 100
+        nb = 2000
+
+        (_, xb, xq) = make_binary_dataset(d, 0, nb, nq)
+
+        index_ref = faiss.IndexBinaryFlat(d)
+        index_ref.add(xb)
+        k = 10
+        Dref, Iref = index_ref.search(xq, k)
+
+        nfound = {}
+        for nh in 0, 1, 3, 5:
+
+            for nbit in 4, 7:
+                if nh == 0:
+                    index = faiss.IndexBinaryHash(d, nbit)
+                else:
+                    index = faiss.IndexBinaryMultiHash(d, nh, nbit)
+                index.add(xb)
+                index.nflip = 2
+                Dnew, Inew = index.search(xq, k)
+                nf = 0
+                for i in range(nq):
+                    ref = Iref[i]
+                    new = Inew[i]
+                    snew = set(new)
+                    # no duplicates
+                    self.assertTrue(len(new) == len(snew))
+                    nf += len(set(ref) & snew)
+                print('nfound', nh, nbit, nf)
+                nfound[(nh, nbit)] = nf
+            self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)])
+
+            # test serialization
+            index2 = faiss.deserialize_index_binary(
+                faiss.serialize_index_binary(index))
+
+            D2, I2 = index2.search(xq, k)
+            np.testing.assert_array_equal(Inew, I2)
+            np.testing.assert_array_equal(Dnew, D2)
+
+        print('nfound=', nfound)
+        self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)]))
+        self.assertGreater(nfound[(3, 7)], nfound[(1, 7)])
+        self.assertGreater(nfound[(5, 7)], nfound[(3, 7)])
diff --git a/tests/test_index.py b/tests/test_index.py
index 54494dcbb..c41f7f8c0 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -13,7 +13,7 @@ import faiss
 import tempfile
 import os
 import re
-
+import warnings
 
 from common import get_dataset, get_dataset_2
 
@@ -24,7 +24,6 @@ class TestModuleInterface(unittest.TestCase):
         assert re.match('^\\d+\\.\\d+\\.\\d+$', faiss.__version__)
 
 
-
 class EvalIVFPQAccuracy(unittest.TestCase):
 
     def test_IndexIVFPQ(self):
@@ -506,37 +505,6 @@ class TestHNSW(unittest.TestCase):
         assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0])
 
 
-class TestIOError(unittest.TestCase):
-
-    def test_io_error(self):
-        d, n = 32, 1000
-        x = np.random.uniform(size=(n, d)).astype('float32')
-        index = faiss.IndexFlatL2(d)
-        index.add(x)
-        _, fname = tempfile.mkstemp()
-        try:
-            faiss.write_index(index, fname)
-
-            # should be fine
-            faiss.read_index(fname)
-
-            # now damage file
-            data = open(fname, 'rb').read()
-            data = data[:int(len(data) / 2)]
-            open(fname, 'wb').write(data)
-
-            # should make a nice readable exception that mentions the
-            try:
-                faiss.read_index(fname)
-            except RuntimeError as e:
-                if fname not in str(e):
-                    raise
-            else:
-                raise
-
-        finally:
-            if os.path.exists(fname):
-                os.unlink(fname)
 
 
 class TestDistancesPositive(unittest.TestCase):
diff --git a/tests/test_index_binary.py b/tests/test_index_binary.py
index 064b1cfbe..c61e2fa5d 100644
--- a/tests/test_index_binary.py
+++ b/tests/test_index_binary.py
@@ -10,12 +10,8 @@ import numpy as np
 import unittest
 import faiss
 
+from common import compare_binary_result_lists, make_binary_dataset
 
-def make_binary_dataset(d, nt, nb, nq):
-    assert d % 8 == 0
-    rs = np.random.RandomState(123)
-    x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8')
-    return x[:nt], x[nt:-nq], x[-nq:]
 
 
 def binary_to_float(x):
@@ -124,6 +120,29 @@ class TestBinaryFlat(unittest.TestCase):
             assert(np.all(Iflat == -1))
             assert(np.all(Dflat == 2147483647)) # NOTE(hoss): int32_t max
 
+    def test_range_search(self):
+        d = self.xq.shape[1] * 8
+
+        index = faiss.IndexBinaryFlat(d)
+        index.add(self.xb)
+        D, I = index.search(self.xq, 10)
+        thresh = int(np.median(D[:, -1]))
+
+        lims, D2, I2 = index.range_search(self.xq, thresh)
+        nt1 = nt2 = 0
+        for i in range(len(self.xq)):
+            range_res = I2[lims[i]:lims[i + 1]]
+            if thresh > D[i, -1]:
+                self.assertTrue(set(I[i]) <= set(range_res))
+                nt1 += 1
+            elif thresh < D[i, -1]:
+                self.assertTrue(set(range_res) <= set(I[i]))
+                nt2 += 1
+            # in case of equality we have a problem with ties
+        print('nb tests', nt1, nt2)
+        # nb tests is actually low...
+        self.assertTrue(nt1 > 19 and nt2 > 19)
+
 
 class TestBinaryIVF(unittest.TestCase):
 
@@ -166,6 +185,29 @@ class TestBinaryIVF(unittest.TestCase):
 
         self.assertEqual((self.Dref == Divfflat).sum(), 4122)
 
+    def test_ivf_range(self):
+        d = self.xq.shape[1] * 8
+
+        quantizer = faiss.IndexBinaryFlat(d)
+        index = faiss.IndexBinaryIVF(quantizer, d, 8)
+        index.cp.min_points_per_centroid = 5    # quiet warning
+        index.nprobe = 4
+        index.train(self.xt)
+        index.add(self.xb)
+        D, I = index.search(self.xq, 10)
+
+        radius = int(np.median(D[:, -1]) + 1)
+        Lr, Dr, Ir = index.range_search(self.xq, radius)
+
+        for i in range(len(self.xq)):
+            res = Ir[Lr[i]:Lr[i + 1]]
+            if D[i, -1] < radius:
+                self.assertTrue(set(I[i]) <= set(res))
+            else:
+                subset = I[i, D[i, :] < radius]
+                self.assertTrue(set(subset) == set(res))
+
+
     def test_ivf_flat_empty(self):
         d = self.xq.shape[1] * 8
 
@@ -257,27 +299,6 @@ class TestHNSW(unittest.TestCase):
         self.assertTrue((Dref == Dbin).all())
 
 
-def compare_binary_result_lists(D1, I1, D2, I2):
-    """comparing result lists is difficult because there are many
-    ties. Here we sort by (distance, index) pairs and ignore the largest
-    distance of each result. Compatible result lists should pass this."""
-    assert D1.shape == I1.shape == D2.shape == I2.shape
-    n, k = D1.shape
-    ndiff = (D1 != D2).sum()
-    assert ndiff == 0, '%d differences in distance matrix %s' % (
-        ndiff, D1.shape)
-
-    def normalize_DI(D, I):
-        norm = I.max() + 1.0
-        Dr = D.astype('float64') + I / norm
-        # ignore -1s and elements on last column
-        Dr[I1 == -1] = 1e20
-        Dr[D == D[:, -1:]] = 1e20
-        Dr.sort(axis=1)
-        return Dr
-    ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum()
-    assert ndiff == 0, '%d differences in normalized D matrix' % ndiff
-
 
 class TestReplicasAndShards(unittest.TestCase):
 
diff --git a/tests/test_io.py b/tests/test_io.py
new file mode 100644
index 000000000..7e3d6edf5
--- /dev/null
+++ b/tests/test_io.py
@@ -0,0 +1,220 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+#!/usr/bin/env python3
+
+import numpy as np
+import unittest
+import faiss
+import tempfile
+import os
+import io
+import sys
+import warnings
+from multiprocessing.dummy import Pool as ThreadPool
+
+from common import get_dataset, get_dataset_2
+
+
+class TestIOVariants(unittest.TestCase):
+
+    def test_io_error(self):
+        d, n = 32, 1000
+        x = np.random.uniform(size=(n, d)).astype('float32')
+        index = faiss.IndexFlatL2(d)
+        index.add(x)
+        _, fname = tempfile.mkstemp()
+        try:
+            faiss.write_index(index, fname)
+
+            # should be fine
+            faiss.read_index(fname)
+
+            # now damage file
+            data = open(fname, 'rb').read()
+            data = data[:int(len(data) / 2)]
+            open(fname, 'wb').write(data)
+
+            # should make a nice readable exception that mentions the filename
+            try:
+                faiss.read_index(fname)
+            except RuntimeError as e:
+                if fname not in str(e):
+                    raise
+            else:
+                raise
+
+        finally:
+            if os.path.exists(fname):
+                os.unlink(fname)
+
+
+class TestCallbacks(unittest.TestCase):
+
+    def do_write_callback(self, bsz):
+        d, n = 32, 1000
+        x = np.random.uniform(size=(n, d)).astype('float32')
+        index = faiss.IndexFlatL2(d)
+        index.add(x)
+
+        f = io.BytesIO()
+        # test with small block size
+        writer = faiss.PyCallbackIOWriter(f.write, 1234)
+
+        if bsz > 0:
+            writer = faiss.BufferedIOWriter(writer, bsz)
+
+        faiss.write_index(index, writer)
+        del writer   # make sure all writes committed
+
+        if sys.version_info[0] < 3:
+            buf = f.getvalue()
+        else:
+            buf = f.getbuffer()
+
+        index2 = faiss.deserialize_index(np.frombuffer(buf, dtype='uint8'))
+
+        self.assertEqual(index.d, index2.d)
+        self.assertTrue(np.all(
+            faiss.vector_to_array(index.xb) == faiss.vector_to_array(index2.xb)
+        ))
+
+        # This is not a callable function: shoudl raise an exception
+        writer = faiss.PyCallbackIOWriter("blabla")
+        self.assertRaises(
+            Exception,
+            faiss.write_index, index, writer
+        )
+
+    def test_buf_read(self):
+        x = np.random.uniform(size=20)
+
+        _, fname = tempfile.mkstemp()
+        try:
+            x.tofile(fname)
+
+            f = open(fname, 'rb')
+            reader = faiss.PyCallbackIOReader(f.read, 1234)
+
+            bsz = 123
+            reader = faiss.BufferedIOReader(reader, bsz)
+
+            y = np.zeros_like(x)
+            print('nbytes=', y.nbytes)
+            reader(faiss.swig_ptr(y), y.nbytes, 1)
+
+            np.testing.assert_array_equal(x, y)
+        finally:
+            if os.path.exists(fname):
+                os.unlink(fname)
+
+    def do_read_callback(self, bsz):
+        d, n = 32, 1000
+        x = np.random.uniform(size=(n, d)).astype('float32')
+        index = faiss.IndexFlatL2(d)
+        index.add(x)
+
+        _, fname = tempfile.mkstemp()
+        try:
+            faiss.write_index(index, fname)
+
+            f = open(fname, 'rb')
+
+            reader = faiss.PyCallbackIOReader(f.read, 1234)
+
+            if bsz > 0:
+                reader = faiss.BufferedIOReader(reader, bsz)
+
+            index2 = faiss.read_index(reader)
+
+            self.assertEqual(index.d, index2.d)
+            np.testing.assert_array_equal(
+                faiss.vector_to_array(index.xb),
+                faiss.vector_to_array(index2.xb)
+            )
+
+            # This is not a callable function: should raise an exception
+            reader = faiss.PyCallbackIOReader("blabla")
+            self.assertRaises(
+                Exception,
+                faiss.read_index, reader
+            )
+        finally:
+            if os.path.exists(fname):
+                os.unlink(fname)
+
+    def test_write_callback(self):
+        self.do_write_callback(0)
+
+    def test_write_buffer(self):
+        self.do_write_callback(123)
+        self.do_write_callback(2345)
+
+    def test_read_callback(self):
+        self.do_read_callback(0)
+
+    def test_read_callback_buffered(self):
+        self.do_read_callback(123)
+        self.do_read_callback(12345)
+
+    def test_read_buffer(self):
+        d, n = 32, 1000
+        x = np.random.uniform(size=(n, d)).astype('float32')
+        index = faiss.IndexFlatL2(d)
+        index.add(x)
+
+        _, fname = tempfile.mkstemp()
+        try:
+            faiss.write_index(index, fname)
+
+            reader = faiss.BufferedIOReader(
+                faiss.FileIOReader(fname), 1234)
+
+            index2 = faiss.read_index(reader)
+
+            self.assertEqual(index.d, index2.d)
+            np.testing.assert_array_equal(
+                faiss.vector_to_array(index.xb),
+                faiss.vector_to_array(index2.xb)
+            )
+
+        finally:
+            if os.path.exists(fname):
+                os.unlink(fname)
+
+
+    def test_transfer_pipe(self):
+        """ transfer an index through a Unix pipe """
+
+        d, n = 32, 1000
+        x = np.random.uniform(size=(n, d)).astype('float32')
+        index = faiss.IndexFlatL2(d)
+        index.add(x)
+        Dref, Iref = index.search(x, 10)
+
+        rf, wf = os.pipe()
+
+        # start thread that will decompress the index
+
+        def index_from_pipe():
+            reader = faiss.PyCallbackIOReader(lambda size: os.read(rf, size))
+            return faiss.read_index(reader)
+
+        fut = ThreadPool(1).apply_async(index_from_pipe, ())
+
+        # write to pipe
+        writer = faiss.PyCallbackIOWriter(lambda b: os.write(wf, b))
+        faiss.write_index(index, writer)
+
+        index2 = fut.get()
+
+        # closing is not really useful but it does not hurt
+        os.close(wf)
+        os.close(rf)
+
+        Dnew, Inew = index2.search(x, 10)
+
+        np.testing.assert_array_equal(Iref, Inew)
+        np.testing.assert_array_equal(Dref, Dnew)
diff --git a/utils/hamming.cpp b/utils/hamming.cpp
index 67a441e9e..f80c994a0 100644
--- a/utils/hamming.cpp
+++ b/utils/hamming.cpp
@@ -34,6 +34,7 @@
 #include <faiss/utils/Heap.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/utils/utils.h>
+#include <faiss/impl/AuxIndexStructures.h>
 
 static const size_t BLOCKSIZE_QUERY = 8192;
 
@@ -484,6 +485,30 @@ void bitvec_print (const uint8_t * b, size_t d)
 }
 
 
+void bitvec_shuffle (size_t n, size_t da, size_t db,
+                     const int *order,
+                     const uint8_t *a,
+                     uint8_t *b)
+{
+    for(size_t i = 0; i < db; i++) {
+        FAISS_THROW_IF_NOT (order[i] >= 0 && order[i] < da);
+    }
+    size_t lda = (da + 7) / 8;
+    size_t ldb = (db + 7) / 8;
+
+#pragma omp parallel for if(n > 10000)
+    for (size_t i = 0; i < n; i++) {
+        const uint8_t *ai = a + i * lda;
+        uint8_t *bi = b + i * ldb;
+        memset (bi, 0, ldb);
+        for(size_t i = 0; i < db; i++) {
+            int o = order[i];
+            uint8_t the_bit = (ai[o >> 3] >> (o & 7)) & 1;
+            bi[i >> 3] |= the_bit << (i & 7);
+        }
+    }
+
+}
 
 
 
@@ -527,6 +552,7 @@ void hammings_knn(
 {
     hammings_knn_hc(ha, a, b, nb, ncodes, order);
 }
+
 void hammings_knn_hc (
         int_maxheap_array_t * ha,
         const uint8_t * a,
@@ -610,7 +636,66 @@ void hammings_knn_mc(
         }
     }
 }
+template <class HammingComputer>
+static
+void hamming_range_search_template (
+    const uint8_t * a,
+    const uint8_t * b,
+    size_t na,
+    size_t nb,
+    int radius,
+    size_t code_size,
+    RangeSearchResult *res)
+{
 
+#pragma omp parallel
+    {
+        RangeSearchPartialResult pres (res);
+
+#pragma omp for
+        for (size_t i = 0; i < na; i++) {
+             HammingComputer hc (a + i * code_size, code_size);
+            const uint8_t * yi = b;
+            RangeQueryResult & qres = pres.new_result (i);
+
+            for (size_t j = 0; j < nb; j++) {
+                int dis = hc.hamming (yi);
+                if (dis < radius) {
+                    qres.add(dis, j);
+                }
+                yi += code_size;
+            }
+        }
+        pres.finalize ();
+    }
+}
+
+void hamming_range_search (
+    const uint8_t * a,
+    const uint8_t * b,
+    size_t na,
+    size_t nb,
+    int radius,
+    size_t code_size,
+    RangeSearchResult *result)
+{
+
+#define HC(name) hamming_range_search_template<name> (a, b, na, nb, radius, code_size, result)
+
+    switch(code_size) {
+    case 4: HC(HammingComputer4); break;
+    case 8: HC(HammingComputer8); break;
+    case 16: HC(HammingComputer16); break;
+    case 32: HC(HammingComputer32); break;
+    default:
+        if (code_size % 8 == 0) {
+            HC(HammingComputerM8);
+        } else {
+            HC(HammingComputerDefault);
+        }
+    }
+#undef HC
+}
 
 
 
diff --git a/utils/hamming.h b/utils/hamming.h
index 1ddbd5c01..2d04613ef 100644
--- a/utils/hamming.h
+++ b/utils/hamming.h
@@ -39,6 +39,7 @@ namespace faiss {
  * General bit vector functions
  **************************************************/
 
+struct RangeSearchResult;
 
 void bitvec_print (const uint8_t * b, size_t d);
 
@@ -65,6 +66,14 @@ void bitvecs2fvecs (
 
 void fvec2bitvec (const float * x, uint8_t * b, size_t d);
 
+/** Shuffle the bits from b(i, j) := a(i, order[j])
+ */
+void bitvec_shuffle (size_t n, size_t da, size_t db,
+                     const int *order,
+                     const uint8_t *a,
+                     uint8_t *b);
+
+
 /***********************************************
  * Generic reader/writer for bit strings
  ***********************************************/
@@ -171,6 +180,17 @@ void hammings_knn_mc (
   int32_t *distances,
   int64_t *labels);
 
+/** same as hammings_knn except we are doing a range search with radius */
+void hamming_range_search (
+    const uint8_t * a,
+    const uint8_t * b,
+    size_t na,
+    size_t nb,
+    int radius,
+    size_t ncodes,
+    RangeSearchResult *result);
+
+
 /* Counting the number of matches or of cross-matches (without returning them)
    For use with function that assume pre-allocated memory */
 void hamming_count_thres (