// WarpSelectKernel.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */
8 
9 #pragma once
10 
11 #include "Float16.cuh"
12 #include "Select.cuh"
13 
14 namespace faiss { namespace gpu {
15 
16 template <typename K,
17  typename IndexType,
18  bool Dir,
19  int NumWarpQ,
20  int NumThreadQ,
21  int ThreadsPerBlock>
22 __global__ void warpSelect(Tensor<K, 2, true> in,
23  Tensor<K, 2, true> outK,
24  Tensor<IndexType, 2, true> outV,
25  K initK,
26  IndexType initV,
27  int k) {
28  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
29 
30  WarpSelect<K, IndexType, Dir, Comparator<K>,
31  NumWarpQ, NumThreadQ, ThreadsPerBlock>
32  heap(initK, initV, k);
33 
34  int warpId = threadIdx.x / kWarpSize;
35  int row = blockIdx.x * kNumWarps + warpId;
36 
37  if (row >= in.getSize(0)) {
38  return;
39  }
40 
41  int i = getLaneId();
42  K* inStart = in[row][i].data();
43 
44  // Whole warps must participate in the selection
45  int limit = utils::roundDown(in.getSize(1), kWarpSize);
46 
47  for (; i < limit; i += kWarpSize) {
48  heap.add(*inStart, (IndexType) i);
49  inStart += kWarpSize;
50  }
51 
52  // Handle non-warp multiple remainder
53  if (i < in.getSize(1)) {
54  heap.addThreadQ(*inStart, (IndexType) i);
55  }
56 
57  heap.reduce();
58  heap.writeOut(outK[row].data(),
59  outV[row].data(), k);
60 }
61 
62 void runWarpSelect(Tensor<float, 2, true>& in,
63  Tensor<float, 2, true>& outKeys,
64  Tensor<int, 2, true>& outIndices,
65  bool dir, int k, cudaStream_t stream);
66 
67 #ifdef FAISS_USE_FLOAT16
68 void runWarpSelect(Tensor<half, 2, true>& in,
69  Tensor<half, 2, true>& outKeys,
70  Tensor<int, 2, true>& outIndices,
71  bool dir, int k, cudaStream_t stream);
72 #endif
73 
74 } } // namespace