9 #include "../utils/DeviceTensor.cuh"
10 #include "../utils/DeviceDefs.cuh"
11 #include "../utils/DeviceUtils.h"
12 #include "../utils/Select.cuh"
14 namespace faiss {
namespace gpu {
// Tiling parameters: each thread block runs kWarps warps of kLanes lanes;
// shared-memory tiles below are sized kWarps/kLanes on a side.
constexpr int kWarps = 8;
constexpr int kLanes = kWarpSize;

// Sentinel distance used to pad k-selection slots that do not correspond
// to a real (query, vector) pair; sorts after every real distance.
constexpr int kMaxDistance = std::numeric_limits<int>::max();
// Computes Hamming distance (popcount of XOR) between every query and every
// vector for an arbitrary code size, keeping the k smallest per query.
//
// Launch layout: block = (kLanes, kWarps) threads; each warp handles one
// query, grid.x covers queries in groups of kWarps. Vectors are streamed
// through shared-memory tiles of kLanes vectors x kLanes words.
template <int NumWarpQ,
          int NumThreadQ,
          typename BinaryType>
__launch_bounds__(kWarps * kLanes)
__global__ void binaryDistanceAnySize(const Tensor<BinaryType, 2, true> vecs,
                                      const Tensor<BinaryType, 2, true> query,
                                      Tensor<int, 2, true> outK,
                                      Tensor<int, 2, true> outV,
                                      int k) {
  // A matrix tile (query, k); +1 padding avoids shared-memory bank conflicts
  __shared__ BinaryType queryTile[kWarps][kLanes + 1];

  // B matrix tile (vec, k); +1 padding avoids shared-memory bank conflicts
  __shared__ BinaryType vecTile[kLanes][kLanes + 1];

  // Per-warp k-selection of the smallest distances seen so far
  WarpSelect<int, int, false, Comparator<int>,
             NumWarpQ, NumThreadQ, kWarps * kLanes>
    heap(kMaxDistance, -1, k);

  int warpId = threadIdx.y;
  int laneId = threadIdx.x;

  // Each warp handles a single query
  int warpQuery = blockIdx.x * kWarps + warpId;
  bool queryInBounds = warpQuery < query.getSize(0);

  // Each warp loops through the entire set of vectors, kLanes at a time
  for (int blockVec = 0; blockVec < vecs.getSize(0); blockVec += kLanes) {
    int threadDistance = 0;

    // Reduction (code word) dimension, tiled kLanes words at a time
    for (int blockK = 0; blockK < vecs.getSize(1); blockK += kLanes) {
      int laneK = blockK + laneId;
      bool kInBounds = laneK < vecs.getSize(1);

      // Out-of-bounds data loads as 0, contributing nothing to the
      // popcount below
      queryTile[warpId][laneId] = queryInBounds && kInBounds ?
        query[warpQuery][laneK] : 0;

      // The kWarps warps cooperate to load the tile of kLanes vectors
#pragma unroll
      for (int i = 0; i < kLanes / kWarps; ++i) {
        int warpVec = i * kWarps + warpId;
        int vec = blockVec + warpVec;
        bool vecInBounds = vec < vecs.getSize(0);

        vecTile[warpVec][laneId] = vecInBounds && kInBounds ?
          vecs[vec][laneK] : 0;
      }

      // Both tiles must be fully populated before any lane reads them
      __syncthreads();

      // Each lane accumulates the Hamming distance of its vector against
      // this warp's query
#pragma unroll
      for (int i = 0; i < kLanes; ++i) {
        threadDistance += __popc(queryTile[warpId][i] ^ vecTile[laneId][i]);
      }

      // Prevent the next iteration from overwriting tiles still being read
      __syncthreads();
    }

    // Lanes within a warp hold different vec results against the same query;
    // only submit distances which represent real (query, vec) pairs
    bool valInBounds = queryInBounds && (blockVec + laneId < vecs.getSize(0));
    threadDistance = valInBounds ? threadDistance : kMaxDistance;
    int id = valInBounds ? blockVec + laneId : -1;

    heap.add(threadDistance, id);
  }

  // Merge per-thread queues into the final warp-wide sorted result
  heap.reduce();

  if (warpQuery < query.getSize(0)) {
    heap.writeOut(outK[warpQuery].data(),
                  outV[warpQuery].data(),
                  k);
  }
}
// Specialization of binaryDistanceAnySize for codes short enough that the
// entire reduction dimension fits in a single kLanes-wide tile
// (vecs.getSize(1) <= ReductionLimit <= kLanes). The query tile is loaded
// once, and the inner reduction loop has a compile-time trip count.
//
// Launch layout: block = (kLanes, kWarps) threads; each warp handles one
// query, grid.x covers queries in groups of kWarps.
template <int NumWarpQ,
          int NumThreadQ,
          typename BinaryType,
          int ReductionLimit = kLanes>
__global__ void
__launch_bounds__(kWarps * kLanes)
binaryDistanceLimitSize(const Tensor<BinaryType, 2, true> vecs,
                        const Tensor<BinaryType, 2, true> query,
                        Tensor<int, 2, true> outK,
                        Tensor<int, 2, true> outV,
                        int k) {
  // A matrix tile (query, k); +1 padding avoids shared-memory bank conflicts
  __shared__ BinaryType queryTile[kWarps][kLanes + 1];

  // B matrix tile (vec, k); +1 padding avoids shared-memory bank conflicts
  __shared__ BinaryType vecTile[kLanes][kLanes + 1];

  // Per-warp k-selection of the smallest distances seen so far
  WarpSelect<int, int, false, Comparator<int>,
             NumWarpQ, NumThreadQ, kWarps * kLanes>
    heap(kMaxDistance, -1, k);

  int warpId = threadIdx.y;
  int laneId = threadIdx.x;

  // Each warp handles a single query; the whole reduction dimension is
  // covered by the lanes of one tile, so laneK never advances
  int laneK = laneId;
  int warpQuery = blockIdx.x * kWarps + warpId;
  bool kInBounds = laneK < vecs.getSize(1);
  bool queryInBounds = warpQuery < query.getSize(0);

  // The query tile only needs to be loaded once; out-of-bounds data loads
  // as 0 and contributes nothing to the popcount below
  queryTile[warpId][laneId] = queryInBounds && kInBounds ?
    query[warpQuery][laneK] : 0;

  // Each warp loops through the entire set of vectors, kLanes at a time
  for (int blockVec = 0; blockVec < vecs.getSize(0); blockVec += kLanes) {
    int threadDistance = 0;

    // The kWarps warps cooperate to load the tile of kLanes vectors
#pragma unroll
    for (int i = 0; i < kLanes / kWarps; ++i) {
      int warpVec = i * kWarps + warpId;
      int vec = blockVec + warpVec;
      bool vecInBounds = vec < vecs.getSize(0);

      vecTile[warpVec][laneId] = vecInBounds && kInBounds ?
        vecs[vec][laneK] : 0;
    }

    // The vector tile must be fully populated before any lane reads it
    __syncthreads();

    // Each lane accumulates the Hamming distance of its vector against
    // this warp's query; compile-time bound allows full unrolling
#pragma unroll
    for (int i = 0; i < ReductionLimit; ++i) {
      threadDistance += __popc(queryTile[warpId][i] ^ vecTile[laneId][i]);
    }

    // Prevent the next iteration from overwriting the tile while in use
    __syncthreads();

    // Lanes within a warp hold different vec results against the same query;
    // only submit distances which represent real (query, vec) pairs
    bool valInBounds = queryInBounds && (blockVec + laneId < vecs.getSize(0));
    threadDistance = valInBounds ? threadDistance : kMaxDistance;
    int id = valInBounds ? blockVec + laneId : -1;

    heap.add(threadDistance, id);
  }

  // Merge per-thread queues into the final warp-wide sorted result
  heap.reduce();

  if (warpQuery < query.getSize(0)) {
    heap.writeOut(outK[warpQuery].data(),
                  outV[warpQuery].data(),
                  k);
  }
}
// Host-side dispatcher for binaryDistanceAnySize: selects the WarpSelect
// queue sizes (NumWarpQ/NumThreadQ) appropriate for the requested k and
// launches the kernel on `stream`. Requires k <= 1024 (enforced by caller).
template <typename BinaryType>
void runBinaryDistanceAnySize(Tensor<BinaryType, 2, true>& vecs,
                              Tensor<BinaryType, 2, true>& query,
                              Tensor<int, 2, true>& outK,
                              Tensor<int, 2, true>& outV,
                              int k, cudaStream_t stream) {
  // One warp per query; one block covers kWarps queries
  dim3 grid(utils::divUp(query.getSize(0), kWarps));
  dim3 block(kLanes, kWarps);

  if (k == 1) {
    binaryDistanceAnySize<1, 1, BinaryType>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 32) {
    binaryDistanceAnySize<32, 2, BinaryType>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 64) {
    binaryDistanceAnySize<64, 3, BinaryType>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 128) {
    binaryDistanceAnySize<128, 3, BinaryType>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 256) {
    binaryDistanceAnySize<256, 4, BinaryType>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 512) {
    binaryDistanceAnySize<512, 8, BinaryType>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 1024) {
    binaryDistanceAnySize<1024, 8, BinaryType>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  }
}
// Host-side dispatcher for binaryDistanceLimitSize: selects the WarpSelect
// queue sizes (NumWarpQ/NumThreadQ) appropriate for the requested k and
// launches the kernel on `stream`. ReductionLimit must bound the number of
// BinaryType words per code (checked by the caller). Requires k <= 1024.
template <typename BinaryType, int ReductionLimit>
void runBinaryDistanceLimitSize(Tensor<BinaryType, 2, true>& vecs,
                                Tensor<BinaryType, 2, true>& query,
                                Tensor<int, 2, true>& outK,
                                Tensor<int, 2, true>& outV,
                                int k, cudaStream_t stream) {
  // One warp per query; one block covers kWarps queries
  dim3 grid(utils::divUp(query.getSize(0), kWarps));
  dim3 block(kLanes, kWarps);

  if (k == 1) {
    binaryDistanceLimitSize<1, 1, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 32) {
    binaryDistanceLimitSize<32, 2, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 64) {
    binaryDistanceLimitSize<64, 3, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 128) {
    binaryDistanceLimitSize<128, 3, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 256) {
    binaryDistanceLimitSize<256, 4, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 512) {
    binaryDistanceLimitSize<512, 8, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  } else if (k <= 1024) {
    binaryDistanceLimitSize<1024, 8, BinaryType, ReductionLimit>
      <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
  }
}
// Public entry point: k-nearest-neighbor search under Hamming distance over
// binary codes stored as bytes. Picks the fastest available kernel:
//  - codes that fit in <= kReductionLimit32 32-bit words are reinterpreted
//    as unsigned int (4x fewer popcounts per code);
//  - codes of <= kReductionLimit8 bytes use the byte limit-size kernel;
//  - everything else falls back to the general any-size kernel.
// Results: outK receives the k smallest distances per query, outV the
// corresponding vector ids.
void runBinaryDistance(Tensor<unsigned char, 2, true>& vecs,
                       Tensor<unsigned char, 2, true>& query,
                       Tensor<int, 2, true>& outK,
                       Tensor<int, 2, true>& outV,
                       int k, cudaStream_t stream) {
  FAISS_ASSERT(k <= 1024);
  FAISS_ASSERT(vecs.getSize(1) == query.getSize(1));

  FAISS_ASSERT(outK.getSize(1) == k);
  FAISS_ASSERT(outV.getSize(1) == k);

  // Maximum number of 32-bit words per code for the uint32 limit-size path
  constexpr int kReductionLimit32 = 8;

  // Maximum number of 8-bit words per code for the byte limit-size path
  constexpr int kReductionLimit8 = 16;

  if (vecs.getSize(1) % sizeof(unsigned int) == 0 &&
      (vecs.getSize(1) / sizeof(unsigned int)) <= kReductionLimit32) {
    // Code size is a whole number of 32-bit words and small enough:
    // reinterpret the byte data as unsigned int
    auto vecs32 = vecs.castResize<unsigned int>();
    auto query32 = query.castResize<unsigned int>();

    runBinaryDistanceLimitSize<unsigned int, kReductionLimit32>(
      vecs32, query32, outK, outV, k, stream);
  } else if (vecs.getSize(1) <= kReductionLimit8) {
    runBinaryDistanceLimitSize<unsigned char, kReductionLimit8>(
      vecs, query, outK, outV, k, stream);
  } else {
    runBinaryDistanceAnySize<unsigned char>(
      vecs, query, outK, outV, k, stream);
  }
}