faiss/gpu/utils/BlockSelectFloat.cu

/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "blockselect/BlockSelectImpl.cuh"
#include "DeviceDefs.cuh"

namespace faiss { namespace gpu {

// Queue sizing table. The first column ("warp Q") matches the size passed as
// the third argument of every BLOCK_SELECT_DECL / BLOCK_SELECT_CALL below; the
// second column ("thread Q") is the paired per-thread queue size.
// NOTE(review): the thread Q values are not referenced in this file;
// presumably they are used where the kernels are instantiated -- confirm.
// warp Q to thread Q:
// 1, 1
// 32, 2
// 64, 3
// 128, 3
// 256, 4
// 512, 8
// 1024, 8
// 2048, 8

// Declarations for the `true` variant (second macro argument), one per size.
// NOTE(review): BLOCK_SELECT_DECL is defined outside this file; presumably it
// declares the launcher that BLOCK_SELECT_CALL invokes -- confirm in
// blockselect/BlockSelectImpl.cuh.
BLOCK_SELECT_DECL(float, true, 1);
BLOCK_SELECT_DECL(float, true, 32);
BLOCK_SELECT_DECL(float, true, 64);
BLOCK_SELECT_DECL(float, true, 128);
BLOCK_SELECT_DECL(float, true, 256);
BLOCK_SELECT_DECL(float, true, 512);
BLOCK_SELECT_DECL(float, true, 1024);
// The 2048 entry is only declared when the build's GPU_MAX_SELECTION_K
// permits it.
#if GPU_MAX_SELECTION_K >= 2048
BLOCK_SELECT_DECL(float, true, 2048);
#endif

// Declarations for the `false` variant, mirroring the list above.
BLOCK_SELECT_DECL(float, false, 1);
BLOCK_SELECT_DECL(float, false, 32);
BLOCK_SELECT_DECL(float, false, 64);
BLOCK_SELECT_DECL(float, false, 128);
BLOCK_SELECT_DECL(float, false, 256);
BLOCK_SELECT_DECL(float, false, 512);
BLOCK_SELECT_DECL(float, false, 1024);
// Same GPU_MAX_SELECTION_K gate as for the `true` variant.
#if GPU_MAX_SELECTION_K >= 2048
BLOCK_SELECT_DECL(float, false, 2048);
#endif

// Dispatcher for float block selection.
//
// Picks one BLOCK_SELECT_CALL expansion based on `k` and `dir`:
//   - the size argument is 1 when k == 1, otherwise the first of
//     32, 64, 128, 256, 512, 1024 (and 2048 when the build enables it)
//     that is >= k;
//   - the second macro argument is `true` when `dir` is set, `false` otherwise.
//
// k is asserted to be at most GPU_MAX_SELECTION_K. If no bucket matches, the
// function returns without invoking any expansion.
//
// NOTE(review): `in`, `outK`, `outV` and `stream` are never named explicitly
// in this body; presumably BLOCK_SELECT_CALL picks the parameters up by name,
// so they must not be renamed -- confirm against the macro definition.
// NOTE(review): `dir` presumably encodes the selection direction -- confirm.
void runBlockSelect(Tensor<float, 2, true>& in,
                    Tensor<float, 2, true>& outK,
                    Tensor<int, 2, true>& outV,
                    bool dir, int k, cudaStream_t stream) {
  FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);

  // Buckets are probed in ascending order; the first one that fits handles
  // the request and the function returns immediately afterwards.
  if (k == 1) {
    if (dir) {
      BLOCK_SELECT_CALL(float, true, 1);
    } else {
      BLOCK_SELECT_CALL(float, false, 1);
    }
    return;
  }

  if (k <= 32) {
    if (dir) {
      BLOCK_SELECT_CALL(float, true, 32);
    } else {
      BLOCK_SELECT_CALL(float, false, 32);
    }
    return;
  }

  if (k <= 64) {
    if (dir) {
      BLOCK_SELECT_CALL(float, true, 64);
    } else {
      BLOCK_SELECT_CALL(float, false, 64);
    }
    return;
  }

  if (k <= 128) {
    if (dir) {
      BLOCK_SELECT_CALL(float, true, 128);
    } else {
      BLOCK_SELECT_CALL(float, false, 128);
    }
    return;
  }

  if (k <= 256) {
    if (dir) {
      BLOCK_SELECT_CALL(float, true, 256);
    } else {
      BLOCK_SELECT_CALL(float, false, 256);
    }
    return;
  }

  if (k <= 512) {
    if (dir) {
      BLOCK_SELECT_CALL(float, true, 512);
    } else {
      BLOCK_SELECT_CALL(float, false, 512);
    }
    return;
  }

  if (k <= 1024) {
    if (dir) {
      BLOCK_SELECT_CALL(float, true, 1024);
    } else {
      BLOCK_SELECT_CALL(float, false, 1024);
    }
    return;
  }

  // Largest bucket; only present when the build's selection limit allows it.
#if GPU_MAX_SELECTION_K >= 2048
  if (k <= 2048) {
    if (dir) {
      BLOCK_SELECT_CALL(float, true, 2048);
    } else {
      BLOCK_SELECT_CALL(float, false, 2048);
    }
  }
#endif
}

// Dispatcher for float block selection over separate key (`inK`) and value
// (`inV`) input tensors.
//
// Picks one BLOCK_SELECT_PAIR_CALL expansion based on `k` and `dir`:
//   - the size argument is 1 when k == 1, otherwise the first of
//     32, 64, 128, 256, 512, 1024 (and 2048 when the build enables it)
//     that is >= k;
//   - the second macro argument is `true` when `dir` is set, `false` otherwise.
//
// k is asserted to be at most GPU_MAX_SELECTION_K. If no bucket matches, the
// function returns without invoking any expansion.
//
// NOTE(review): `inK`, `inV`, `outK`, `outV` and `stream` are never named
// explicitly in this body; presumably BLOCK_SELECT_PAIR_CALL picks the
// parameters up by name, so they must not be renamed -- confirm against the
// macro definition.
// NOTE(review): `dir` presumably encodes the selection direction -- confirm.
void runBlockSelectPair(Tensor<float, 2, true>& inK,
                        Tensor<int, 2, true>& inV,
                        Tensor<float, 2, true>& outK,
                        Tensor<int, 2, true>& outV,
                        bool dir, int k, cudaStream_t stream) {
  FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);

  // Buckets are probed in ascending order; the first one that fits handles
  // the request and the function returns immediately afterwards.
  if (k == 1) {
    if (dir) {
      BLOCK_SELECT_PAIR_CALL(float, true, 1);
    } else {
      BLOCK_SELECT_PAIR_CALL(float, false, 1);
    }
    return;
  }

  if (k <= 32) {
    if (dir) {
      BLOCK_SELECT_PAIR_CALL(float, true, 32);
    } else {
      BLOCK_SELECT_PAIR_CALL(float, false, 32);
    }
    return;
  }

  if (k <= 64) {
    if (dir) {
      BLOCK_SELECT_PAIR_CALL(float, true, 64);
    } else {
      BLOCK_SELECT_PAIR_CALL(float, false, 64);
    }
    return;
  }

  if (k <= 128) {
    if (dir) {
      BLOCK_SELECT_PAIR_CALL(float, true, 128);
    } else {
      BLOCK_SELECT_PAIR_CALL(float, false, 128);
    }
    return;
  }

  if (k <= 256) {
    if (dir) {
      BLOCK_SELECT_PAIR_CALL(float, true, 256);
    } else {
      BLOCK_SELECT_PAIR_CALL(float, false, 256);
    }
    return;
  }

  if (k <= 512) {
    if (dir) {
      BLOCK_SELECT_PAIR_CALL(float, true, 512);
    } else {
      BLOCK_SELECT_PAIR_CALL(float, false, 512);
    }
    return;
  }

  if (k <= 1024) {
    if (dir) {
      BLOCK_SELECT_PAIR_CALL(float, true, 1024);
    } else {
      BLOCK_SELECT_PAIR_CALL(float, false, 1024);
    }
    return;
  }

  // Largest bucket; only present when the build's selection limit allows it.
#if GPU_MAX_SELECTION_K >= 2048
  if (k <= 2048) {
    if (dir) {
      BLOCK_SELECT_PAIR_CALL(float, true, 2048);
    } else {
      BLOCK_SELECT_PAIR_CALL(float, false, 2048);
    }
  }
#endif
}

} } // namespace
Initial commit 2017-02-23 06:26:44 +08:00			`/**`
Facebook sync (May 2019) + relicense (#838) Changelog: - changed license: BSD+Patents -> MIT - propagates exceptions raised in sub-indexes of IndexShards and IndexReplicas - support for searching several inverted lists in parallel (parallel_mode != 0) - better support for PQ codes where nbit != 8 or 16 - IVFSpectralHash implementation: spectral hash codes inside an IVF - 6-bit per component scalar quantizer (4 and 8 bit were already supported) - combinations of inverted lists: HStackInvertedLists and VStackInvertedLists - configurable number of threads for OnDiskInvertedLists prefetching (including 0=no prefetch) - more test and demo code compatible with Python 3 (print with parentheses) - refactored benchmark code: data loading is now in a single file 2019-05-28 22:17:22 +08:00			`* Copyright (c) Facebook, Inc. and its affiliates.`
Initial commit 2017-02-23 06:26:44 +08:00			`*`
Facebook sync (May 2019) + relicense (#838) Changelog: - changed license: BSD+Patents -> MIT - propagates exceptions raised in sub-indexes of IndexShards and IndexReplicas - support for searching several inverted lists in parallel (parallel_mode != 0) - better support for PQ codes where nbit != 8 or 16 - IVFSpectralHash implementation: spectral hash codes inside an IVF - 6-bit per component scalar quantizer (4 and 8 bit were already supported) - combinations of inverted lists: HStackInvertedLists and VStackInvertedLists - configurable number of threads for OnDiskInvertedLists prefetching (including 0=no prefetch) - more test and demo code compatible with Python 3 (print with parentheses) - refactored benchmark code: data loading is now in a single file 2019-05-28 22:17:22 +08:00			`* This source code is licensed under the MIT license found in the`
Initial commit 2017-02-23 06:26:44 +08:00			`* LICENSE file in the root directory of this source tree.`
			`*/`

			`#include "blockselect/BlockSelectImpl.cuh"`
Facebook sync (Mar 2019) (#756) Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct unit8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python 2019-03-29 23:32:28 +08:00			`#include "DeviceDefs.cuh"`
Initial commit 2017-02-23 06:26:44 +08:00
			`namespace faiss { namespace gpu {`

			`// warp Q to thread Q:`
			`// 1, 1`
			`// 32, 2`
			`// 64, 3`
			`// 128, 3`
			`// 256, 4`
			`// 512, 8`
			`// 1024, 8`
Facebook sync (Mar 2019) (#756) Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct unit8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python 2019-03-29 23:32:28 +08:00			`// 2048, 8`
Initial commit 2017-02-23 06:26:44 +08:00
			`BLOCK_SELECT_DECL(float, true, 1);`
			`BLOCK_SELECT_DECL(float, true, 32);`
			`BLOCK_SELECT_DECL(float, true, 64);`
			`BLOCK_SELECT_DECL(float, true, 128);`
			`BLOCK_SELECT_DECL(float, true, 256);`
			`BLOCK_SELECT_DECL(float, true, 512);`
			`BLOCK_SELECT_DECL(float, true, 1024);`
Facebook sync (Mar 2019) (#756) Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct unit8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python 2019-03-29 23:32:28 +08:00			`#if GPU_MAX_SELECTION_K >= 2048`
			`BLOCK_SELECT_DECL(float, true, 2048);`
			`#endif`
Initial commit 2017-02-23 06:26:44 +08:00
			`BLOCK_SELECT_DECL(float, false, 1);`
			`BLOCK_SELECT_DECL(float, false, 32);`
			`BLOCK_SELECT_DECL(float, false, 64);`
			`BLOCK_SELECT_DECL(float, false, 128);`
			`BLOCK_SELECT_DECL(float, false, 256);`
			`BLOCK_SELECT_DECL(float, false, 512);`
			`BLOCK_SELECT_DECL(float, false, 1024);`
Facebook sync (Mar 2019) (#756) Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct unit8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python 2019-03-29 23:32:28 +08:00			`#if GPU_MAX_SELECTION_K >= 2048`
			`BLOCK_SELECT_DECL(float, false, 2048);`
			`#endif`
Initial commit 2017-02-23 06:26:44 +08:00
			`void runBlockSelect(Tensor<float, 2, true>& in,`
sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 21:11:28 +08:00			`Tensor<float, 2, true>& outK,`
			`Tensor<int, 2, true>& outV,`
			`bool dir, int k, cudaStream_t stream) {`
Facebook sync (Mar 2019) (#756) Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct unit8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python 2019-03-29 23:32:28 +08:00			`FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);`
Initial commit 2017-02-23 06:26:44 +08:00
			`if (dir) {`
			`if (k == 1) {`
			`BLOCK_SELECT_CALL(float, true, 1);`
			`} else if (k <= 32) {`
			`BLOCK_SELECT_CALL(float, true, 32);`
			`} else if (k <= 64) {`
			`BLOCK_SELECT_CALL(float, true, 64);`
			`} else if (k <= 128) {`
			`BLOCK_SELECT_CALL(float, true, 128);`
			`} else if (k <= 256) {`
			`BLOCK_SELECT_CALL(float, true, 256);`
			`} else if (k <= 512) {`
			`BLOCK_SELECT_CALL(float, true, 512);`
			`} else if (k <= 1024) {`
			`BLOCK_SELECT_CALL(float, true, 1024);`
Facebook sync (Mar 2019) (#756) Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct unit8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python 2019-03-29 23:32:28 +08:00			`#if GPU_MAX_SELECTION_K >= 2048`
			`} else if (k <= 2048) {`
			`BLOCK_SELECT_CALL(float, true, 2048);`
			`#endif`
Initial commit 2017-02-23 06:26:44 +08:00			`}`
			`} else {`
			`if (k == 1) {`
			`BLOCK_SELECT_CALL(float, false, 1);`
			`} else if (k <= 32) {`
			`BLOCK_SELECT_CALL(float, false, 32);`
			`} else if (k <= 64) {`
			`BLOCK_SELECT_CALL(float, false, 64);`
			`} else if (k <= 128) {`
			`BLOCK_SELECT_CALL(float, false, 128);`
			`} else if (k <= 256) {`
			`BLOCK_SELECT_CALL(float, false, 256);`
			`} else if (k <= 512) {`
			`BLOCK_SELECT_CALL(float, false, 512);`
			`} else if (k <= 1024) {`
			`BLOCK_SELECT_CALL(float, false, 1024);`
Facebook sync (Mar 2019) (#756) Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct unit8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python 2019-03-29 23:32:28 +08:00			`#if GPU_MAX_SELECTION_K >= 2048`
			`} else if (k <= 2048) {`
			`BLOCK_SELECT_CALL(float, false, 2048);`
			`#endif`
Initial commit 2017-02-23 06:26:44 +08:00			`}`
			`}`
			`}`

sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 21:11:28 +08:00			`void runBlockSelectPair(Tensor<float, 2, true>& inK,`
			`Tensor<int, 2, true>& inV,`
			`Tensor<float, 2, true>& outK,`
			`Tensor<int, 2, true>& outV,`
			`bool dir, int k, cudaStream_t stream) {`
Facebook sync (Mar 2019) (#756) Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct unit8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python 2019-03-29 23:32:28 +08:00			`FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);`
sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 21:11:28 +08:00
			`if (dir) {`
			`if (k == 1) {`
			`BLOCK_SELECT_PAIR_CALL(float, true, 1);`
			`} else if (k <= 32) {`
			`BLOCK_SELECT_PAIR_CALL(float, true, 32);`
			`} else if (k <= 64) {`
			`BLOCK_SELECT_PAIR_CALL(float, true, 64);`
			`} else if (k <= 128) {`
			`BLOCK_SELECT_PAIR_CALL(float, true, 128);`
			`} else if (k <= 256) {`
			`BLOCK_SELECT_PAIR_CALL(float, true, 256);`
			`} else if (k <= 512) {`
			`BLOCK_SELECT_PAIR_CALL(float, true, 512);`
			`} else if (k <= 1024) {`
			`BLOCK_SELECT_PAIR_CALL(float, true, 1024);`
Facebook sync (Mar 2019) (#756) Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct unit8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python 2019-03-29 23:32:28 +08:00			`#if GPU_MAX_SELECTION_K >= 2048`
			`} else if (k <= 2048) {`
			`BLOCK_SELECT_PAIR_CALL(float, true, 2048);`
			`#endif`
sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 21:11:28 +08:00			`}`
			`} else {`
			`if (k == 1) {`
			`BLOCK_SELECT_PAIR_CALL(float, false, 1);`
			`} else if (k <= 32) {`
			`BLOCK_SELECT_PAIR_CALL(float, false, 32);`
			`} else if (k <= 64) {`
			`BLOCK_SELECT_PAIR_CALL(float, false, 64);`
			`} else if (k <= 128) {`
			`BLOCK_SELECT_PAIR_CALL(float, false, 128);`
			`} else if (k <= 256) {`
			`BLOCK_SELECT_PAIR_CALL(float, false, 256);`
			`} else if (k <= 512) {`
			`BLOCK_SELECT_PAIR_CALL(float, false, 512);`
			`} else if (k <= 1024) {`
			`BLOCK_SELECT_PAIR_CALL(float, false, 1024);`
Facebook sync (Mar 2019) (#756) Facebook sync (Mar 2019) - MatrixStats object - option to round coordinates during k-means optimization - alternative option for search in HNSW - moved stats and imbalance_factor of IndexIVF to InvertedLists object - range search for IVFScalarQuantizer - direct unit8 codec in ScalarQuantizer - renamed IndexProxy to IndexReplicas and moved to main Faiss - better support for PQ code assignment with external index - support for IMI2x16 (4B virtual centroids!) - support for k = 2048 search on GPU (instead of 1024) - most CUDA mem alloc failures throw exceptions instead of terminating on an assertion - support for renaming an ondisk invertedlists - interrupt computations with ctrl-C in python 2019-03-29 23:32:28 +08:00			`#if GPU_MAX_SELECTION_K >= 2048`
			`} else if (k <= 2048) {`
			`BLOCK_SELECT_PAIR_CALL(float, false, 2048);`
			`#endif`
sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 21:11:28 +08:00			`}`
			`}`
			`}`

Initial commit 2017-02-23 06:26:44 +08:00			`} } // namespace`