// faiss/gpu/utils/blockselect/BlockSelectImpl.cuh

/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once

#include "../BlockSelectKernel.cuh"
#include "../Limits.cuh"

// Forward-declares the two host-side launchers for one (TYPE, DIR, WARP_Q)
// combination:
//   runBlockSelect_<TYPE>_<DIR>_<WARP_Q>_      (keys in; keys + int values out)
//   runBlockSelectPair_<TYPE>_<DIR>_<WARP_Q>_  (keys + int values in and out)
// The matching definitions are generated by BLOCK_SELECT_IMPL.
// The expansion ends without a trailing semicolon, so the use site must
// supply one.
#define BLOCK_SELECT_DECL(TYPE, DIR, WARP_Q) \
  extern void runBlockSelect_##TYPE##_##DIR##_##WARP_Q##_( \
      Tensor<TYPE, 2, true>& in, \
      Tensor<TYPE, 2, true>& outK, Tensor<int, 2, true>& outV, \
      bool dir, int k, cudaStream_t stream); \
  \
  extern void runBlockSelectPair_##TYPE##_##DIR##_##WARP_Q##_( \
      Tensor<TYPE, 2, true>& inK, Tensor<int, 2, true>& inV, \
      Tensor<TYPE, 2, true>& outK, Tensor<int, 2, true>& outV, \
      bool dir, int k, cudaStream_t stream)

// Defines the two host-side launchers for one (TYPE, DIR, WARP_Q)
// block-select specialization. THREAD_Q is forwarded to the kernels as a
// template argument.
//
// Generated functions:
//   runBlockSelect_<TYPE>_<DIR>_<WARP_Q>_      launches blockSelect
//   runBlockSelectPair_<TYPE>_<DIR>_<WARP_Q>_  launches blockSelectPair
//
// Both launch, on `stream`, one 128-thread block per entry along dimension 0
// of the input. The kernels are seeded with
//   kInit = dir ? Limits<TYPE>::getMin() : Limits<TYPE>::getMax()
//   vInit = -1
//
// Preconditions, enforced with FAISS_ASSERT:
//   - the outputs have as many rows as the input and exactly k columns;
//   - for the pair variant, inK/inV and outK/outV have matching sizes;
//   - k <= WARP_Q;
//   - the runtime `dir` equals the compile-time DIR of this specialization.
//
// An input with zero rows is a no-op: nothing is launched, because a
// zero-sized grid is not a valid launch configuration.
//
// CUDA_TEST_ERROR() is invoked right after each launch.
//
// NOTE: only /* */ comments may be used inside the macro body; a // comment
// would swallow the remaining continuation lines.
#define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q)                  \
  void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _(         \
    Tensor<TYPE, 2, true>& in,                                          \
    Tensor<TYPE, 2, true>& outK,                                        \
    Tensor<int, 2, true>& outV,                                         \
    bool dir,                                                           \
    int k,                                                              \
    cudaStream_t stream) {                                              \
    FAISS_ASSERT(in.getSize(0) == outK.getSize(0));                     \
    FAISS_ASSERT(in.getSize(0) == outV.getSize(0));                     \
    FAISS_ASSERT(outK.getSize(1) == k);                                 \
    FAISS_ASSERT(outV.getSize(1) == k);                                 \
    FAISS_ASSERT(k <= WARP_Q);                                          \
    FAISS_ASSERT(dir == DIR);                                           \
                                                                        \
    /* Nothing to select; a grid of 0 blocks cannot be launched. */     \
    if (in.getSize(0) == 0) {                                           \
      return;                                                           \
    }                                                                   \
                                                                        \
    /* One block per input row. */                                      \
    auto grid = dim3(in.getSize(0));                                    \
                                                                        \
    constexpr int kBlockSelectNumThreads = 128;                         \
    auto block = dim3(kBlockSelectNumThreads);                          \
                                                                        \
    auto kInit = dir ? Limits<TYPE>::getMin() : Limits<TYPE>::getMax(); \
    auto vInit = -1;                                                    \
                                                                        \
    blockSelect<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
      <<<grid, block, 0, stream>>>(in, outK, outV, kInit, vInit, k);    \
    CUDA_TEST_ERROR();                                                  \
  }                                                                     \
                                                                        \
  void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _(     \
    Tensor<TYPE, 2, true>& inK,                                         \
    Tensor<int, 2, true>& inV,                                          \
    Tensor<TYPE, 2, true>& outK,                                        \
    Tensor<int, 2, true>& outV,                                         \
    bool dir,                                                           \
    int k,                                                              \
    cudaStream_t stream) {                                              \
    FAISS_ASSERT(inK.isSameSize(inV));                                  \
    FAISS_ASSERT(outK.isSameSize(outV));                                \
    /* Same output-shape contract as runBlockSelect_*; outV is */       \
    /* covered through the isSameSize check against outK. */            \
    FAISS_ASSERT(inK.getSize(0) == outK.getSize(0));                    \
    FAISS_ASSERT(outK.getSize(1) == k);                                 \
    FAISS_ASSERT(k <= WARP_Q);                                          \
    FAISS_ASSERT(dir == DIR);                                           \
                                                                        \
    /* Nothing to select; a grid of 0 blocks cannot be launched. */     \
    if (inK.getSize(0) == 0) {                                          \
      return;                                                           \
    }                                                                   \
                                                                        \
    /* One block per input row. */                                      \
    auto grid = dim3(inK.getSize(0));                                   \
                                                                        \
    constexpr int kBlockSelectNumThreads = 128;                         \
    auto block = dim3(kBlockSelectNumThreads);                          \
                                                                        \
    auto kInit = dir ? Limits<TYPE>::getMin() : Limits<TYPE>::getMax(); \
    auto vInit = -1;                                                    \
                                                                        \
    blockSelectPair<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
      <<<grid, block, 0, stream>>>(inK, inV, outK, outV, kInit, vInit, k); \
    CUDA_TEST_ERROR();                                                  \
  }


// Expands to a call of runBlockSelect_<TYPE>_<DIR>_<WARP_Q>_. The arguments
// are taken by name from the invoking scope, which must therefore have
// `in`, `outK`, `outV`, `dir`, `k` and `stream` visible.
#define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \
  runBlockSelect_##TYPE##_##DIR##_##WARP_Q##_(in, outK, outV, dir, k, stream)

// Expands to a call of runBlockSelectPair_<TYPE>_<DIR>_<WARP_Q>_. The
// arguments are taken by name from the invoking scope, which must therefore
// have `inK`, `inV`, `outK`, `outV`, `dir`, `k` and `stream` visible.
#define BLOCK_SELECT_PAIR_CALL(TYPE, DIR, WARP_Q) \
  runBlockSelectPair_##TYPE##_##DIR##_##WARP_Q##_( \
      inK, inV, outK, outV, dir, k, stream)
Initial commit 2017-02-22 23:26:44 +01:00			`/**`
			`* Copyright (c) 2015-present, Facebook, Inc.`
			`* All rights reserved.`
			`*`
changed license 2017-07-30 00:18:45 -07:00			`* This source code is licensed under the BSD+Patents license found in the`
Initial commit 2017-02-22 23:26:44 +01:00			`* LICENSE file in the root directory of this source tree.`
			`*/`

			`// Copyright 2004-present Facebook. All Rights Reserved.`
sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 05:11:28 -08:00			`#pragma once`

Initial commit 2017-02-22 23:26:44 +01:00			`#include "../BlockSelectKernel.cuh"`
			`#include "../Limits.cuh"`

			`#define BLOCK_SELECT_DECL(TYPE, DIR, WARP_Q) \`
			`extern void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \`
			`Tensor<TYPE, 2, true>& in, \`
			`Tensor<TYPE, 2, true>& outK, \`
			`Tensor<int, 2, true>& outV, \`
			`bool dir, \`
			`int k, \`
sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 05:11:28 -08:00			`cudaStream_t stream); \`
			`\`
			`extern void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \`
			`Tensor<TYPE, 2, true>& inK, \`
			`Tensor<int, 2, true>& inV, \`
			`Tensor<TYPE, 2, true>& outK, \`
			`Tensor<int, 2, true>& outV, \`
			`bool dir, \`
			`int k, \`
Initial commit 2017-02-22 23:26:44 +01:00			`cudaStream_t stream)`

			`#define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \`
			`void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \`
			`Tensor<TYPE, 2, true>& in, \`
			`Tensor<TYPE, 2, true>& outK, \`
			`Tensor<int, 2, true>& outV, \`
			`bool dir, \`
			`int k, \`
			`cudaStream_t stream) { \`
sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 05:11:28 -08:00			`FAISS_ASSERT(in.getSize(0) == outK.getSize(0)); \`
			`FAISS_ASSERT(in.getSize(0) == outV.getSize(0)); \`
			`FAISS_ASSERT(outK.getSize(1) == k); \`
			`FAISS_ASSERT(outV.getSize(1) == k); \`
			`\`
Initial commit 2017-02-22 23:26:44 +01:00			`auto grid = dim3(in.getSize(0)); \`
			`\`
			`constexpr int kBlockSelectNumThreads = 128; \`
			`auto block = dim3(kBlockSelectNumThreads); \`
			`\`
			`FAISS_ASSERT(k <= WARP_Q); \`
			`FAISS_ASSERT(dir == DIR); \`
			`\`
			`auto kInit = dir ? Limits<TYPE>::getMin() : Limits<TYPE>::getMax(); \`
			`auto vInit = -1; \`
			`\`
			`blockSelect<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \`
			`<<<grid, block, 0, stream>>>(in, outK, outV, kInit, vInit, k); \`
Synchronization with FB version 2017-06-21 * moved most FAISS_ASSERT calls to C++ exceptions, and adjusted memory allocation to avoid mem leaks * added an IndexIVFScalarQuantizer type that offers an intermediate compression between IVFFlat and IVFPQ * support removal of indices in IndexIDMap / IndexFlat combination * various fixes in GPU code 2017-06-21 06:54:28 -07:00			`CUDA_TEST_ERROR(); \`
sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 05:11:28 -08:00			`} \`
			`\`
			`void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \`
			`Tensor<TYPE, 2, true>& inK, \`
			`Tensor<int, 2, true>& inV, \`
			`Tensor<TYPE, 2, true>& outK, \`
			`Tensor<int, 2, true>& outV, \`
			`bool dir, \`
			`int k, \`
			`cudaStream_t stream) { \`
			`FAISS_ASSERT(inK.isSameSize(inV)); \`
			`FAISS_ASSERT(outK.isSameSize(outV)); \`
			`\`
			`auto grid = dim3(inK.getSize(0)); \`
			`\`
			`constexpr int kBlockSelectNumThreads = 128; \`
			`auto block = dim3(kBlockSelectNumThreads); \`
			`\`
			`FAISS_ASSERT(k <= WARP_Q); \`
			`FAISS_ASSERT(dir == DIR); \`
			`\`
			`auto kInit = dir ? Limits<TYPE>::getMin() : Limits<TYPE>::getMax(); \`
			`auto vInit = -1; \`
			`\`
			`blockSelectPair<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \`
			`<<<grid, block, 0, stream>>>(inK, inV, outK, outV, kInit, vInit, k); \`
			`CUDA_TEST_ERROR(); \`
Initial commit 2017-02-22 23:26:44 +01:00			`}`

sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 05:11:28 -08:00
Initial commit 2017-02-22 23:26:44 +01:00			`#define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \`
			`runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \`
			`in, outK, outV, dir, k, stream)`
sync with FB version 2017-11-22 various bugfixes from github issues kmean with some frozen centroids GPU better tiling for large flat datasets default AVX for vector ops 2017-11-22 05:11:28 -08:00
			`#define BLOCK_SELECT_PAIR_CALL(TYPE, DIR, WARP_Q) \`
			`runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \`
			`inK, inV, outK, outV, dir, k, stream)`