13 #include "Float16.cuh"
16 namespace faiss {
namespace gpu {
// Block-wide k-selection kernel: each thread block selects the k
// "best" (ordering chosen by Dir via Comparator<K>) elements of one
// row of `in`, writing the selected keys/indices to outK/outV.
//
// Launch expectations (grid/block layout):
//   - gridDim.x == in.getSize(0): one block per input row
//   - blockDim.x == ThreadsPerBlock, a multiple of kWarpSize
// Shared memory: statically allocated, 2 * kNumWarps * NumWarpQ slots.
// Precondition: k <= NumWarpQ (heap capacity); initK/initV seed the
// per-thread queues with the identity element for the ordering.
template <typename K,
          typename IndexType,
          bool Dir,
          int NumWarpQ,
          int NumThreadQ,
          int ThreadsPerBlock>
__global__ void blockSelect(Tensor<K, 2, true> in,
                            Tensor<K, 2, true> outK,
                            Tensor<IndexType, 2, true> outV,
                            K initK,
                            IndexType initV,
                            int k) {
  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;

  __shared__ K smemK[kNumWarps * NumWarpQ];
  __shared__ IndexType smemV[kNumWarps * NumWarpQ];

  BlockSelect<K, IndexType, Dir, Comparator<K>,
              NumWarpQ, NumThreadQ, ThreadsPerBlock>
      heap(initK, initV, smemK, smemV, k);

  // One block handles exactly one row of the input
  int row = blockIdx.x;
  int i = threadIdx.x;

  // Whole warps must participate in heap.add(), so iterate only over
  // the largest warp-multiple prefix of the row first
  int limit = utils::roundDown(in.getSize(1), kWarpSize);

  for (; i < limit; i += blockDim.x) {
    heap.add(in[row][i], (IndexType) i);
  }

  // Handle the last remainder fraction of a warp of elements; only
  // the surviving threads feed their per-thread queues
  if (i < in.getSize(1)) {
    heap.addThreadQ(in[row][i], (IndexType) i);
  }

  // Merge all per-warp/per-thread queues into smemK/smemV; this must
  // run before reading the shared arrays below
  heap.reduce();

  // Write out the final selected k elements for this row
  for (int i = threadIdx.x; i < k; i += blockDim.x) {
    outK[row][i] = smemK[i];
    outV[row][i] = smemV[i];
  }
}
// Host-side launcher for the blockSelect kernel, float32 keys with
// int32 indices. For each row of `in`, selects k elements (direction
// of the ordering chosen by `dir`), writing keys to outKeys and their
// source column indices to outIndices. Work is enqueued on `stream`;
// the call is asynchronous with respect to the host.
void runBlockSelect(Tensor<float, 2, true>& in,
                    Tensor<float, 2, true>& outKeys,
                    Tensor<int, 2, true>& outIndices,
                    bool dir,
                    int k,
                    cudaStream_t stream);
#ifdef FAISS_USE_FLOAT16
// Half-precision overload of runBlockSelect (float16 keys, int32
// indices); compiled only when FAISS_USE_FLOAT16 is enabled at build
// time. Same contract as the float32 variant above.
void runBlockSelect(Tensor<half, 2, true>& in,
                    Tensor<half, 2, true>& outKeys,
                    Tensor<int, 2, true>& outIndices,
                    bool dir,
                    int k,
                    cudaStream_t stream);