building blocks for hybrid CPU / GPU search (#2638)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/2638

This diff provides a more streamlined way of searching IVF indexes with precomputed cluster assignments.
This will be used for experiments with hybrid CPU / GPU search.
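As an illustration of the intended hybrid pipeline (a sketch, not part of the diff; names and parameters are ours, and it assumes a GPU build of faiss): the coarse quantizer runs on the GPU to produce the cluster assignments, and the inverted lists are then scanned on the CPU through the new search_preassigned wrapper.

import faiss
import numpy as np
from faiss.contrib.ivf_tools import search_preassigned

d, nlist, k = 64, 1024, 10
xb = np.random.rand(100000, d).astype('float32')
xq = np.random.rand(100, d).astype('float32')

index = faiss.index_factory(d, f"IVF{nlist},Flat")
index.train(xb)
index.add(xb)
index.nprobe = 16

# step 1 (GPU): run only the coarse quantizer on the GPU to get, for each
# query, the nprobe nearest centroids and the distances to them
res = faiss.StandardGpuResources()
gpu_quantizer = faiss.index_cpu_to_gpu(res, 0, index.quantizer)
Dq, Iq = gpu_quantizer.search(xq, index.nprobe)

# step 2 (CPU): scan only the preassigned inverted lists on the CPU
D, I = search_preassigned(index, xq, k, Iq, Dq)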

Reviewed By: algoriddle

Differential Revision: D41301032

fbshipit-source-id: a1d645fd0f2bf806454dfd04971edc0a6200d20d
pull/2676/head
Matthijs Douze 2023-01-12 13:34:44 -08:00 committed by Facebook GitHub Bot
parent 1eb4f42639
commit 8fc3775472
21 changed files with 412 additions and 189 deletions

View File

@@ -70,7 +70,7 @@ if args.factory_string == "":
else:
factory_string = args.factory_string
print(f"instanciate {factory_string}")
print(f"instantiate {factory_string}")
index = faiss.index_factory(ds.d, factory_string)
if args.factory_string != "":

View File

@@ -32,7 +32,9 @@ def add_preassigned(index_ivf, x, a, ids=None):
def search_preassigned(index_ivf, xq, k, list_nos, coarse_dis=None):
"""
Perform a search in the IVF index, with predefined lists to search into
Perform a search in the IVF index, with predefined lists to search into.
Supports indexes with pretransforms (unlike
IndexIVF.search_preassigned, which cannot be applied with a pretransform).
"""
n, d = xq.shape
if isinstance(index_ivf, faiss.IndexBinaryIVF):
@@ -51,14 +53,7 @@ def search_preassigned(index_ivf, xq, k, list_nos, coarse_dis=None):
else:
assert coarse_dis.shape == (n, index_ivf.nprobe)
D = np.empty((n, k), dtype=dis_type)
I = np.empty((n, k), dtype='int64')
sp = faiss.swig_ptr
index_ivf.search_preassigned(
n, sp(xq), k,
sp(list_nos), sp(coarse_dis), sp(D), sp(I), False)
return D, I
return index_ivf.search_preassigned(xq, k, list_nos, coarse_dis)
def range_search_preassigned(index_ivf, x, radius, list_nos, coarse_dis=None):
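A small usage sketch for the simplified wrapper (illustrative, not part of the diff): the caller computes the coarse assignment explicitly and may edit it before scanning, e.g. dropping selected probes by setting the list number to -1 (invalid list numbers are skipped, as in the test at the end of this commit that masks sub_assign entries).

import faiss
import numpy as np
from faiss.contrib.ivf_tools import search_preassigned

d, nlist, k = 32, 100, 5
xb = np.random.rand(10000, d).astype('float32')
xq = np.random.rand(20, d).astype('float32')

index = faiss.index_factory(d, f"IVF{nlist},Flat")
index.train(xb)
index.add(xb)
index.nprobe = 8

# compute the coarse assignment explicitly...
Dq, Iq = index.quantizer.search(xq, index.nprobe)
# ...and drop the last probe of every query (-1 = skip this list)
Iq[:, -1] = -1

D, I = search_preassigned(index, xq, k, Iq, Dq)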

View File

@@ -1125,7 +1125,7 @@ void IndexIVF::replace_invlists(InvertedLists* il, bool own) {
void IndexIVF::copy_subset_to(
IndexIVF& other,
int subset_type,
InvertedLists::subset_type_t subset_type,
idx_t a1,
idx_t a2) const {
other.ntotal +=

View File

@@ -326,7 +326,7 @@ struct IndexIVF : Index, Level1Quantizer {
*/
virtual void copy_subset_to(
IndexIVF& other,
int subset_type,
InvertedLists::subset_type_t subset_type,
idx_t a1,
idx_t a2) const;

View File

@@ -33,92 +33,6 @@ void translate_labels(long n, idx_t* labels, long translation) {
}
}
/** Merge result tables from several shards. The per-shard results are assumed
* to be sorted. Note that the C comparator is reversed w.r.t. the usual top-k
* element heap because we want the best (ie. lowest for L2) result to be on
* top, not the worst.
*
* @param all_distances size nshard * n * k
* @param all_labels idem
* @param translations label translations to apply, size nshard
*/
template <class IndexClass, class C>
void merge_tables(
long n,
long k,
long nshard,
typename IndexClass::distance_t* distances,
idx_t* labels,
const std::vector<typename IndexClass::distance_t>& all_distances,
const std::vector<idx_t>& all_labels,
const std::vector<long>& translations) {
if (k == 0) {
return;
}
using distance_t = typename IndexClass::distance_t;
long stride = n * k;
#pragma omp parallel if (n * nshard * k > 100000)
{
std::vector<int> buf(2 * nshard);
// index in each shard's result list
int* pointer = buf.data();
// (shard_ids, heap_vals): heap that indexes
// shard -> current distance for this shard
int* shard_ids = pointer + nshard;
std::vector<distance_t> buf2(nshard);
distance_t* heap_vals = buf2.data();
#pragma omp for
for (long i = 0; i < n; i++) {
// the heap maps values to the shard where they are
// produced.
const distance_t* D_in = all_distances.data() + i * k;
const idx_t* I_in = all_labels.data() + i * k;
int heap_size = 0;
// push the first element of each shard (if not -1)
for (long s = 0; s < nshard; s++) {
pointer[s] = 0;
if (I_in[stride * s] >= 0) {
heap_push<C>(
++heap_size,
heap_vals,
shard_ids,
D_in[stride * s],
s);
}
}
distance_t* D = distances + i * k;
idx_t* I = labels + i * k;
int j;
for (j = 0; j < k && heap_size > 0; j++) {
// pop element from best shard
int s = shard_ids[0]; // top of heap
int& p = pointer[s];
D[j] = heap_vals[0];
I[j] = I_in[stride * s + p] + translations[s];
// pop from shard, advance pointer for this shard
heap_pop<C>(heap_size--, heap_vals, shard_ids);
p++;
if (p < k && I_in[stride * s + p] >= 0) {
heap_push<C>(
++heap_size,
heap_vals,
shard_ids,
D_in[stride * s + p],
s);
}
}
for (; j < k; j++) {
I[j] = -1;
D[j] = C::Crev::neutral();
}
}
}
}
} // anonymous namespace
template <typename IndexT>
@@ -303,27 +217,6 @@ void IndexShardsTemplate<IndexT>::search(
std::vector<distance_t> all_distances(nshard * k * n);
std::vector<idx_t> all_labels(nshard * k * n);
auto fn = [n, k, x, &all_distances, &all_labels](
int no, const IndexT* index) {
if (index->verbose) {
printf("begin query shard %d on %" PRId64 " points\n", no, n);
}
index->search(
n,
x,
k,
all_distances.data() + no * k * n,
all_labels.data() + no * k * n);
if (index->verbose) {
printf("end query shard %d\n", no);
}
};
this->runOnIndex(fn);
std::vector<long> translations(nshard, 0);
// Because we just called runOnIndex above, it is safe to access the
@@ -336,26 +229,47 @@ void IndexShardsTemplate<IndexT>::search(
}
}
auto fn = [n, k, x, &all_distances, &all_labels, &translations](
int no, const IndexT* index) {
if (index->verbose) {
printf("begin query shard %d on %" PRId64 " points\n", no, n);
}
index->search(
n,
x,
k,
all_distances.data() + no * k * n,
all_labels.data() + no * k * n);
translate_labels(
n * k, all_labels.data() + no * k * n, translations[no]);
if (index->verbose) {
printf("end query shard %d\n", no);
}
};
this->runOnIndex(fn);
if (this->metric_type == METRIC_L2) {
merge_tables<IndexT, CMin<distance_t, int>>(
merge_knn_results<idx_t, CMin<distance_t, int>>(
n,
k,
nshard,
all_distances.data(),
all_labels.data(),
distances,
labels,
all_distances,
all_labels,
translations);
labels);
} else {
merge_tables<IndexT, CMax<distance_t, int>>(
merge_knn_results<idx_t, CMax<distance_t, int>>(
n,
k,
nshard,
all_distances.data(),
all_labels.data(),
distances,
labels,
all_distances,
all_labels,
translations);
labels);
}
}

View File

@@ -71,7 +71,7 @@ struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
* Cases (successive_ids, xids):
* - true, non-NULL ERROR: it makes no sense to pass in ids and
* request them to be shifted
* - true, NULL OK, but should be called only once (calls add()
* - true, NULL OK: but should be called only once (calls add()
* on sub-indexes).
* - false, non-NULL OK: will call add_with_ids with passed in xids
* distributed evenly over shards

View File

@@ -248,12 +248,14 @@ void ToGpuClonerMultiple::copy_ivf_shard(
if (verbose)
printf("IndexShards shard %ld indices %ld:%ld\n", i, i0, i1);
index_ivf->copy_subset_to(*idx2, 2, i0, i1);
index_ivf->copy_subset_to(
*idx2, InvertedLists::SUBSET_TYPE_ID_RANGE, i0, i1);
FAISS_ASSERT(idx2->ntotal == i1 - i0);
} else if (shard_type == 1) {
if (verbose)
printf("IndexShards shard %ld select modulo %ld = %ld\n", i, n, i);
index_ivf->copy_subset_to(*idx2, 1, n, i);
index_ivf->copy_subset_to(
*idx2, InvertedLists::SUBSET_TYPE_ID_MOD, n, i);
} else {
FAISS_THROW_FMT("shard_type %d not implemented", shard_type);
}

View File

@@ -636,7 +636,7 @@ void ZnSphereCodecRec::decode(uint64_t code, float* c) const {
}
}
// if not use_rec, instanciate an arbitrary harmless znc_rec
// if not use_rec, instantiate an arbitrary harmless znc_rec
ZnSphereCodecAlt::ZnSphereCodecAlt(int dim, int r2)
: ZnSphereCodec(dim, r2),
use_rec((dim & (dim - 1)) == 0),

View File

@@ -19,7 +19,7 @@
* otherwise register spilling becomes too large.
*
* The implementation of these functions is spread over 3 cpp files to reduce
* parallel compile times. Templates are instanciated explicitly.
* parallel compile times. Templates are instantiated explicitly.
*/
namespace faiss {

View File

@@ -189,7 +189,7 @@ void accumulate(
DISPATCH(3);
DISPATCH(4);
}
FAISS_THROW_FMT("accumulate nq=%d not instanciated", nq);
FAISS_THROW_FMT("accumulate nq=%d not instantiated", nq);
#undef DISPATCH
}
@@ -263,7 +263,7 @@ void pq4_accumulate_loop_qbs(
DISPATCH(4);
#undef DISPATCH
default:
FAISS_THROW_FMT("accumulate nq=%d not instanciated", nq);
FAISS_THROW_FMT("accumulate nq=%d not instantiated", nq);
}
i0 += nq;
LUT += nq * nsq * 16;

View File

@@ -88,13 +88,13 @@ void InvertedLists::merge_from(InvertedLists* oivf, size_t add_id) {
size_t InvertedLists::copy_subset_to(
InvertedLists& oivf,
int subset_type,
subset_type_t subset_type,
idx_t a1,
idx_t a2) const {
FAISS_THROW_IF_NOT(nlist == oivf.nlist);
FAISS_THROW_IF_NOT(code_size == oivf.code_size);
FAISS_THROW_IF_NOT_FMT(
subset_type >= 0 && subset_type <= 3,
subset_type >= 0 && subset_type <= 4,
"subset type %d not implemented",
subset_type);
size_t accu_n = 0;
@@ -111,7 +111,7 @@ size_t InvertedLists::copy_subset_to(
size_t n = list_size(list_no);
ScopedIds ids_in(this, list_no);
if (subset_type == 0) {
if (subset_type == SUBSET_TYPE_ID_RANGE) {
for (idx_t i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (a1 <= id && id < a2) {
@@ -122,7 +122,7 @@ size_t InvertedLists::copy_subset_to(
n_added++;
}
}
} else if (subset_type == 1) {
} else if (subset_type == SUBSET_TYPE_ID_MOD) {
for (idx_t i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (id % a1 == a2) {
@@ -133,7 +133,7 @@ size_t InvertedLists::copy_subset_to(
n_added++;
}
}
} else if (subset_type == 2) {
} else if (subset_type == SUBSET_TYPE_ELEMENT_RANGE) {
// see what is allocated to a1 and to a2
size_t next_accu_n = accu_n + n;
size_t next_accu_a1 = next_accu_n * a1 / ntotal;
@@ -151,7 +151,7 @@ size_t InvertedLists::copy_subset_to(
n_added += i2 - i1;
accu_a1 = next_accu_a1;
accu_a2 = next_accu_a2;
} else if (subset_type == 3) {
} else if (subset_type == SUBSET_TYPE_INVLIST_FRACTION) {
size_t i1 = n * a2 / a1;
size_t i2 = n * (a2 + 1) / a1;
@@ -163,6 +163,15 @@ size_t InvertedLists::copy_subset_to(
}
n_added += i2 - i1;
} else if (subset_type == SUBSET_TYPE_INVLIST) {
if (list_no >= a1 && list_no < a2) {
oivf.add_entries(
list_no,
n,
ScopedIds(this, list_no).get(),
ScopedCodes(this, list_no).get());
n_added += n;
}
}
accu_n += n;
}

View File

@@ -111,20 +111,28 @@ struct InvertedLists {
/// move all entries from oivf (empty on output)
void merge_from(InvertedLists* oivf, size_t add_id);
// how to copy a subset of elements from the inverted lists
// This depends on two integers, a1 and a2.
enum subset_type_t : int {
// depends on IDs
SUBSET_TYPE_ID_RANGE = 0, // copies ids in [a1, a2)
SUBSET_TYPE_ID_MOD = 1, // copies ids if id % a1 == a2
// depends on order within invlists
SUBSET_TYPE_ELEMENT_RANGE =
2, // copies fractions of invlists so that a1 elements are left
// before and a2 after
SUBSET_TYPE_INVLIST_FRACTION =
3, // take fraction a2 out of a1 from each invlist, 0 <= a2 < a1
// copy only inverted lists a1:a2
SUBSET_TYPE_INVLIST = 4
};
/** copy a subset of the entries index to the other index
*
* if subset_type == 0: copies ids in [a1, a2)
* if subset_type == 1: copies ids if id % a1 == a2
* if subset_type == 2: copies inverted lists such that a1
* elements are left before and a2 elements are after
* (insensitive to ids)
* if subset_type == 3: take fraction a2 out of a1 from each invlist
* (does not depend on ids). 0 <= a2 < a1
* @return number of entries copied
*/
size_t copy_subset_to(
InvertedLists& other,
int subset_type,
subset_type_t subset_type,
idx_t a1,
idx_t a2) const;
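To make the subset types concrete, here is a sketch of splitting an IVF index with copy_subset_to (illustrative, not part of the diff; it passes the raw enum values, which stay 0-3 for the existing types plus 4 for the new SUBSET_TYPE_INVLIST, and the empty_trained_clone helper is ours):

import faiss
import numpy as np

d, nlist = 16, 30
xb = np.random.rand(5000, d).astype('float32')

index = faiss.index_factory(d, f"IVF{nlist},Flat")
index.train(xb)
index.add(xb)

def empty_trained_clone(src):
    # helper (ours): same structure and training, no vectors
    dst = faiss.clone_index(src)
    dst.reset()
    return dst

# SUBSET_TYPE_INVLIST (4): copy whole inverted lists [a1, a2) to each shard
subs = [empty_trained_clone(index) for _ in range(2)]
index.copy_subset_to(subs[0], 4, 0, nlist // 2)
index.copy_subset_to(subs[1], 4, nlist // 2, nlist)
assert subs[0].ntotal + subs[1].ntotal == index.ntotal

# SUBSET_TYPE_ID_MOD (1): vector with a given id goes to shard id % 2
mods = [empty_trained_clone(index) for _ in range(2)]
index.copy_subset_to(mods[0], 1, 2, 0)
index.copy_subset_to(mods[1], 1, 2, 1)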

View File

@@ -21,7 +21,8 @@ from faiss.gpu_wrappers import *
from faiss.array_conversions import *
from faiss.extra_wrappers import kmin, kmax, pairwise_distances, rand, randint, \
lrand, randn, rand_smooth_vectors, eval_intersection, normalize_L2, \
ResultHeap, knn, Kmeans, checksum, matrix_bucket_sort_inplace, bucket_sort
ResultHeap, knn, Kmeans, checksum, matrix_bucket_sort_inplace, bucket_sort, \
merge_knn_results
__version__ = "%d.%d.%d" % (FAISS_VERSION_MAJOR,

View File

@@ -554,6 +554,70 @@ def handle_Index(the_class):
I = rev_swig_ptr(res.labels, nd).copy()
return lims, D, I
def replacement_search_preassigned(self, x, k, Iq, Dq, *, params=None, D=None, I=None):
"""Find the k nearest neighbors of the set of vectors x in an IVF index,
with precalculated coarse quantization assignment.
Parameters
----------
x : array_like
Query vectors, shape (n, d) where d is appropriate for the index.
`dtype` must be float32.
k : int
Number of nearest neighbors.
Iq : array_like
Nearest centroids, size (n, nprobe)
Dq : array_like, optional
Distance array to the centroids, size (n, nprobe)
params : SearchParameters, optional
Search parameters of the current search (not supported yet; must be None)
D : array_like, optional
Distance array to store the result.
I : array_like, optional
Labels array to store the results.
Returns
-------
D : array_like
Distances of the nearest neighbors, shape (n, k). When not enough results are found,
the distance is set to +Inf or -Inf.
I : array_like
Labels of the nearest neighbors, shape (n, k).
When not enough results are found, the label is set to -1
"""
n, d = x.shape
x = np.ascontiguousarray(x, dtype='float32')
assert d == self.d
assert k > 0
if D is None:
D = np.empty((n, k), dtype=np.float32)
else:
assert D.shape == (n, k)
if I is None:
I = np.empty((n, k), dtype=np.int64)
else:
assert I.shape == (n, k)
Iq = np.ascontiguousarray(Iq, dtype='int64')
assert params is None, "params not supported"
assert Iq.shape == (n, self.nprobe)
if Dq is not None:
Dq = np.ascontiguousarray(Dq, dtype='float32')
assert Dq.shape == Iq.shape
self.search_preassigned_c(
n, swig_ptr(x),
k,
swig_ptr(Iq), swig_ptr(Dq),
swig_ptr(D), swig_ptr(I),
False
)
return D, I
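A usage sketch for the new method (illustrative, not part of the diff): it mirrors Index.search but takes the coarse assignment as input, so with the quantizer's own top-nprobe assignment it should reproduce a regular search.

import faiss
import numpy as np

d, nlist, k = 24, 64, 4
xb = np.random.rand(8000, d).astype('float32')
xq = np.random.rand(10, d).astype('float32')

index = faiss.index_factory(d, f"IVF{nlist},Flat")
index.train(xb)
index.add(xb)
index.nprobe = 8

# quantize the queries ourselves, then search only the preassigned lists
Dq, Iq = index.quantizer.search(xq, index.nprobe)
D, I = index.search_preassigned(xq, k, Iq, Dq)

# should match a regular search with the same nprobe
Dref, Iref = index.search(xq, k)
np.testing.assert_array_equal(I, Iref)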
def replacement_sa_encode(self, x, codes=None):
n, d = x.shape
assert d == self.d
@@ -605,6 +669,8 @@ def handle_Index(the_class):
ignore_missing=True)
replace_method(the_class, 'search_and_reconstruct',
replacement_search_and_reconstruct, ignore_missing=True)
replace_method(the_class, 'search_preassigned',
replacement_search_preassigned, ignore_missing=True)
replace_method(the_class, 'sa_encode', replacement_sa_encode)
replace_method(the_class, 'sa_decode', replacement_sa_decode)
replace_method(the_class, 'add_sa_codes', replacement_add_sa_codes,
@@ -664,6 +730,31 @@ def handle_IndexBinary(the_class):
swig_ptr(labels))
return distances, labels
def replacement_search_preassigned(self, x, k, Iq, Dq):
n, d = x.shape
x = _check_dtype_uint8(x)
assert d * 8 == self.d
assert k > 0
D = np.empty((n, k), dtype=np.int32)
I = np.empty((n, k), dtype=np.int64)
Iq = np.ascontiguousarray(Iq, dtype='int64')
assert Iq.shape == (n, self.nprobe)
if Dq is not None:
Dq = np.ascontiguousarray(Dq, dtype='int32')
assert Dq.shape == Iq.shape
self.search_preassigned_c(
n, swig_ptr(x),
k,
swig_ptr(Iq), swig_ptr(Dq),
swig_ptr(D), swig_ptr(I),
False
)
return D, I
def replacement_range_search(self, x, thresh):
n, d = x.shape
x = _check_dtype_uint8(x)
@@ -693,6 +784,8 @@ def handle_IndexBinary(the_class):
replace_method(the_class, 'range_search', replacement_range_search)
replace_method(the_class, 'reconstruct', replacement_reconstruct)
replace_method(the_class, 'remove_ids', replacement_remove_ids)
replace_method(the_class, 'search_preassigned',
replacement_search_preassigned, ignore_missing=True)
def handle_VectorTransform(the_class):

View File

@@ -279,6 +279,23 @@ class ResultHeap:
self.heaps.reorder()
def merge_knn_results(Dall, Iall, keep_max=False):
"""
Merge a set of sorted knn results obtained from different shards of a dataset.
Dall and Iall are of size (nshard, nq, k); each Dall[i, j] should be sorted.
Returns D, I of size (nq, k) as the merged result set.
"""
assert Iall.shape == Dall.shape
nshard, n, k = Dall.shape
Dnew = np.empty((n, k), dtype=Dall.dtype)
Inew = np.empty((n, k), dtype=Iall.dtype)
func = merge_knn_results_CMax if keep_max else merge_knn_results_CMin
func(
n, k, nshard,
swig_ptr(Dall), swig_ptr(Iall),
swig_ptr(Dnew), swig_ptr(Inew)
)
return Dnew, Inew
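A quick usage sketch (illustrative, not part of the diff): merge the per-shard top-k tables of three shards. For inner-product results, where larger is better, pass keep_max=True.

import faiss
import numpy as np

nshard, nq, k = 3, 5, 4
rs = np.random.RandomState(0)
Dall = np.sort(rs.rand(nshard, nq, k).astype('float32'), axis=2)  # ascending per row
Iall = rs.randint(10000, size=(nshard, nq, k)).astype('int64')

D, I = faiss.merge_knn_results(Dall, Iall)  # keep_max=False: L2-style merge
assert D.shape == I.shape == (nq, k)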
######################################################
# KNN function

View File

@@ -938,7 +938,6 @@ REV_SWIG_PTR(uint64_t, NPY_UINT64);
%template(float_minheap_array_t) faiss::HeapArray<faiss::CMin<float, int64_t> >;
%template(int_minheap_array_t) faiss::HeapArray<faiss::CMin<int, int64_t> >;
%template(float_maxheap_array_t) faiss::HeapArray<faiss::CMax<float, int64_t> >;
%template(int_maxheap_array_t) faiss::HeapArray<faiss::CMax<int, int64_t> >;
@@ -951,46 +950,55 @@ REV_SWIG_PTR(uint64_t, NPY_UINT64);
%template(AlignedTableUint16) faiss::AlignedTable<uint16_t>;
%template(AlignedTableFloat32) faiss::AlignedTable<float>;
// SWIG seems to have some trouble resolving function template types here, so
// declare explicitly
%define INSTANTIATE_uint16_partition_fuzzy(C, id_t)
%inline %{
// SWIG seems to have has some trouble resolving the template type here, so
// declare explicitly
uint16_t CMax_uint16_partition_fuzzy(
uint16_t *vals, int64_t *ids, size_t n,
uint16_t C ## _uint16_partition_fuzzy(
uint16_t *vals, id_t *ids, size_t n,
size_t q_min, size_t q_max, size_t * q_out)
{
return faiss::partition_fuzzy<faiss::CMax<unsigned short, int64_t> >(
vals, ids, n, q_min, q_max, q_out);
}
uint16_t CMin_uint16_partition_fuzzy(
uint16_t *vals, int64_t *ids, size_t n,
size_t q_min, size_t q_max, size_t * q_out)
{
return faiss::partition_fuzzy<faiss::CMin<unsigned short, int64_t> >(
vals, ids, n, q_min, q_max, q_out);
}
// and overload with the int32 version
uint16_t CMax_uint16_partition_fuzzy(
uint16_t *vals, int *ids, size_t n,
size_t q_min, size_t q_max, size_t * q_out)
{
return faiss::partition_fuzzy<faiss::CMax<unsigned short, int> >(
vals, ids, n, q_min, q_max, q_out);
}
uint16_t CMin_uint16_partition_fuzzy(
uint16_t *vals, int *ids, size_t n,
size_t q_min, size_t q_max, size_t * q_out)
{
return faiss::partition_fuzzy<faiss::CMin<unsigned short, int> >(
return faiss::partition_fuzzy<faiss::C<unsigned short, id_t> >(
vals, ids, n, q_min, q_max, q_out);
}
%}
%enddef
INSTANTIATE_uint16_partition_fuzzy(CMin, int64_t)
INSTANTIATE_uint16_partition_fuzzy(CMax, int64_t)
INSTANTIATE_uint16_partition_fuzzy(CMin, int)
INSTANTIATE_uint16_partition_fuzzy(CMax, int)
// Same for merge_knn_results
// same define as the explicit instantiation in Heap.cpp
%define INSTANTIATE_merge_knn_results(C, distance_t)
%inline %{
void merge_knn_results_ ## C(
size_t n, size_t k, int nshard,
const distance_t *all_distances, const faiss::idx_t *all_labels,
distance_t *distances, faiss::idx_t *labels)
{
faiss::merge_knn_results<faiss::idx_t, faiss::C<distance_t, int>>(
n, k, nshard, all_distances, all_labels, distances, labels);
}
%}
%enddef
INSTANTIATE_merge_knn_results(CMin, float);
INSTANTIATE_merge_knn_results(CMax, float);
INSTANTIATE_merge_knn_results(CMin, int32_t);
INSTANTIATE_merge_knn_results(CMax, int32_t);
/*******************************************************************
* Expose a few basic functions
*******************************************************************/

View File

@@ -139,4 +139,111 @@ template struct HeapArray<CMax<float, int64_t>>;
template struct HeapArray<CMin<int, int64_t>>;
template struct HeapArray<CMax<int, int64_t>>;
/**********************************************************
* merge knn search results
**********************************************************/
/** Merge result tables from several shards. The per-shard results are assumed
* to be sorted. Note that the C comparator is reversed w.r.t. the usual top-k
* element heap because we want the best (ie. lowest for L2) result to be on
* top, not the worst.
*
* @param all_distances size (nshard, n, k)
* @param all_labels size (nshard, n, k)
* @param distances output distances, size (n, k)
* @param labels output labels, size (n, k)
*/
template <class idx_t, class C>
void merge_knn_results(
size_t n,
size_t k,
typename C::TI nshard,
const typename C::T* all_distances,
const idx_t* all_labels,
typename C::T* distances,
idx_t* labels) {
using distance_t = typename C::T;
if (k == 0) {
return;
}
long stride = n * k;
#pragma omp parallel if (n * nshard * k > 100000)
{
std::vector<int> buf(2 * nshard);
// index in each shard's result list
int* pointer = buf.data();
// (shard_ids, heap_vals): heap that indexes
// shard -> current distance for this shard
int* shard_ids = pointer + nshard;
std::vector<distance_t> buf2(nshard);
distance_t* heap_vals = buf2.data();
#pragma omp for
for (long i = 0; i < n; i++) {
// the heap maps values to the shard where they are
// produced.
const distance_t* D_in = all_distances + i * k;
const idx_t* I_in = all_labels + i * k;
int heap_size = 0;
// push the first element of each shard (if not -1)
for (long s = 0; s < nshard; s++) {
pointer[s] = 0;
if (I_in[stride * s] >= 0) {
heap_push<C>(
++heap_size,
heap_vals,
shard_ids,
D_in[stride * s],
s);
}
}
distance_t* D = distances + i * k;
idx_t* I = labels + i * k;
int j;
for (j = 0; j < k && heap_size > 0; j++) {
// pop element from best shard
int s = shard_ids[0]; // top of heap
int& p = pointer[s];
D[j] = heap_vals[0];
I[j] = I_in[stride * s + p];
// pop from shard, advance pointer for this shard
heap_pop<C>(heap_size--, heap_vals, shard_ids);
p++;
if (p < k && I_in[stride * s + p] >= 0) {
heap_push<C>(
++heap_size,
heap_vals,
shard_ids,
D_in[stride * s + p],
s);
}
}
for (; j < k; j++) {
I[j] = -1;
D[j] = C::Crev::neutral();
}
}
}
}
// explicit instantiations
#define INSTANTIATE(C, distance_t) \
template void merge_knn_results<int64_t, C<distance_t, int>>( \
size_t, \
size_t, \
int, \
const distance_t*, \
const int64_t*, \
distance_t*, \
int64_t*);
INSTANTIATE(CMin, float);
INSTANTIATE(CMax, float);
INSTANTIATE(CMin, int32_t);
INSTANTIATE(CMax, int32_t);
} // namespace faiss
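For reference, the kernel above is a k-way merge: a small heap keyed on each shard's current head distance pops the best shard, emits its head, and pushes that shard's next valid entry. A behaviorally equivalent pure-Python sketch of the min (L2) case, using heapq (illustrative only; the real implementation is the templated C++ above, and the names are ours):

import heapq
import numpy as np

def merge_knn_results_ref(Dall, Iall):
    """Dall, Iall: (nshard, n, k); float distances, each Dall[s, i] sorted
    ascending, labels of -1 mark missing results. Returns merged (n, k) tables."""
    nshard, n, k = Dall.shape
    D = np.full((n, k), np.inf, dtype=Dall.dtype)  # padding, like C::Crev::neutral()
    I = np.full((n, k), -1, dtype='int64')
    for i in range(n):
        pointer = [0] * nshard  # position in each shard's result list
        # heap of (current head distance, shard), best shard on top
        heap = [(Dall[s, i, 0], s) for s in range(nshard) if Iall[s, i, 0] >= 0]
        heapq.heapify(heap)
        for j in range(k):
            if not heap:
                break  # remaining slots stay at (-1, inf)
            dis, s = heapq.heappop(heap)
            p = pointer[s]
            D[i, j], I[i, j] = dis, Iall[s, i, p]
            pointer[s] = p = p + 1
            if p < k and Iall[s, i, p] >= 0:
                heapq.heappush(heap, (Dall[s, i, p], s))
    return D, I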

View File

@@ -444,7 +444,7 @@ typedef HeapArray<CMin<int, int64_t>> int_minheap_array_t;
typedef HeapArray<CMax<float, int64_t>> float_maxheap_array_t;
typedef HeapArray<CMax<int, int64_t>> int_maxheap_array_t;
// The heap templates are instanciated explicitly in Heap.cpp
// The heap templates are instantiated explicitly in Heap.cpp
/*********************************************************************
* Indirect heaps: instead of having
@@ -505,6 +505,27 @@ inline void indirect_heap_push(
bh_ids[i] = id;
}
/** Merge result tables from several shards. The per-shard results are assumed
* to be sorted. Note that the C comparator is reversed w.r.t. the usual top-k
* element heap because we want the best (ie. lowest for L2) result to be on
* top, not the worst. Also, the comparator's index type needs to hold a
* shard id (i.e. int32 is usually more than enough).
*
* @param all_distances size (nshard, n, k)
* @param all_labels size (nshard, n, k)
* @param distances output distances, size (n, k)
* @param labels output labels, size (n, k)
*/
template <class idx_t, class C>
void merge_knn_results(
size_t n,
size_t k,
typename C::TI nshard,
const typename C::T* all_distances,
const idx_t* all_labels,
typename C::T* distances,
idx_t* labels);
} // namespace faiss
#endif /* FAISS_Heap_h */

View File

@@ -645,3 +645,45 @@ class TestBucketSort(unittest.TestCase):
def test_bucket_sort_inplace_parallel_fewbucket(self):
self.do_test_bucket_sort_inplace(4, nbucket=5)
class TestMergeKNNResults(unittest.TestCase):
def do_test(self, ismax, dtype):
rs = np.random.RandomState()
n, k, nshard = 10, 5, 3
all_ids = rs.randint(100000, size=(nshard, n, k)).astype('int64')
all_dis = rs.rand(nshard, n, k)
if dtype == 'int32':
all_dis = (all_dis * 1000000).astype("int32")
else:
all_dis = all_dis.astype(dtype)
for i in range(nshard):
for j in range(n):
all_dis[i, j].sort()
if ismax:
all_dis[i, j] = all_dis[i, j][::-1]
Dref = np.zeros((n, k), dtype=dtype)
Iref = np.zeros((n, k), dtype='int64')
for i in range(n):
dis = all_dis[:, i, :].ravel()
ids = all_ids[:, i, :].ravel()
o = dis.argsort()
if ismax:
o = o[::-1]
Dref[i] = dis[o[:k]]
Iref[i] = ids[o[:k]]
Dnew, Inew = faiss.merge_knn_results(all_dis, all_ids, keep_max=ismax)
np.testing.assert_array_equal(Dnew, Dref)
np.testing.assert_array_equal(Inew, Iref)
def test_min_float(self):
self.do_test(ismax=False, dtype='float32')
def test_max_int(self):
self.do_test(ismax=True, dtype='int32')
def test_max_float(self):
self.do_test(ismax=True, dtype='float32')

View File

@@ -687,6 +687,7 @@ class TestSplitMerge(unittest.TestCase):
sub_indexes = [faiss.clone_index(index) for i in range(nsplit)]
index.add(xb)
Dref, Iref = index.search(xq, 10)
nlist = index.nlist
for i in range(nsplit):
if subset_type in (1, 3):
index.copy_subset_to(sub_indexes[i], subset_type, nsplit, i)
@@ -694,6 +695,10 @@ class TestSplitMerge(unittest.TestCase):
j0 = index.ntotal * i // nsplit
j1 = index.ntotal * (i + 1) // nsplit
index.copy_subset_to(sub_indexes[i], subset_type, j0, j1)
elif subset_type == 4:
index.copy_subset_to(
sub_indexes[i], subset_type,
i * nlist // nsplit, (i + 1) * nlist // nsplit)
index_shards = faiss.IndexShards(False, False)
for i in range(nsplit):
@@ -713,3 +718,6 @@ class TestSplitMerge(unittest.TestCase):
def test_Flat_subset_type_3(self):
self.do_test("IVF30,Flat", subset_type=3)
def test_Flat_subset_type_4(self):
self.do_test("IVF30,Flat", subset_type=4)

View File

@@ -47,10 +47,8 @@ def search_single_scan(index, xq, k, bs=128):
sub_assign[skip_rows, skip_cols] = -1
index.search_preassigned(
nq, faiss.swig_ptr(xq), k,
faiss.swig_ptr(sub_assign), faiss.swig_ptr(coarse_dis),
faiss.swig_ptr(rh.D), faiss.swig_ptr(rh.I),
False, None
xq, k, sub_assign, coarse_dis,
D=rh.D, I=rh.I
)
rh.finalize()