12 #include "IVFUtils.cuh"
13 #include "../utils/DeviceUtils.h"
14 #include "../utils/StaticUtils.h"
15 #include "../utils/Tensor.cuh"
16 #include "../utils/ThrustAllocator.cuh"
17 #include <thrust/scan.h>
18 #include <thrust/execution_policy.h>
20 namespace faiss {
namespace gpu {
// Device kernel (it reads blockIdx/blockDim/threadIdx): each thread handles
// one (query, probe) slot of topQueryToCentroid and stores the length of the
// inverted list chosen for that slot into `length`.
// NOTE(review): this extract appears to be missing lines -- the qualifier and
// return type, the declarations of `listLengths` and `totalSize`, the body of
// the bounds check and the closing braces. Confirm against the full file.
25 getResultLengths(Tensor<int, 2, true> topQueryToCentroid,
28 Tensor<int, 2, true> length) {
// Flat 1-D index of this thread across the whole grid.
29 int linearThreadId = blockIdx.x * blockDim.x + threadIdx.x;
// Threads whose index is at or beyond totalSize take this guarded branch
// (presumably an early return; the branch body is not visible here).
30 if (linearThreadId >= totalSize) {
// Split the flat index into a (row, column) pair, using the size of
// dimension 1 of topQueryToCentroid as the row width.
34 int nprobe = topQueryToCentroid.getSize(1);
35 int queryId = linearThreadId / nprobe;
36 int listId = linearThreadId % nprobe;
// Centroid id stored for this (query, probe) slot.
38 int centroidId = topQueryToCentroid[queryId][listId];
// A centroid id of -1 is treated as "no list": the slot gets length 0 and
// listLengths is never indexed with -1.
41 length[queryId][listId] = (centroidId != -1) ? listLengths[centroidId] : 0;
// Host-side driver: launches getResultLengths on `stream`, then runs an
// in-place inclusive prefix sum over the first totalSize elements of
// prefixSumOffsets on the same stream.
// NOTE(review): most of the kernel launch arguments and the end of this
// function are not visible in this extract; presumably the kernel writes the
// per-slot list lengths into prefixSumOffsets before the scan -- confirm
// against the full file.
//
// topQueryToCentroid: 2-D int tensor of centroid ids; its element count
//                     determines the amount of work.
// listLengths:        device vector passed to the kernel as a raw pointer.
// prefixSumOffsets:   2-D int tensor, same extents as topQueryToCentroid;
//                     scanned in place.
// thrustMem:          caller-provided byte buffer wrapped by the thrust
//                     allocator below.
// stream:             CUDA stream used for both the kernel and the scan.
44 void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
45 thrust::device_vector<int>& listLengths,
46 Tensor<int, 2, true>& prefixSumOffsets,
47 Tensor<char, 1, true>& thrustMem,
48 cudaStream_t stream) {
// The input and output tensors must have the same size in both dimensions.
49 FAISS_ASSERT(topQueryToCentroid.getSize(0) == prefixSumOffsets.getSize(0));
50 FAISS_ASSERT(topQueryToCentroid.getSize(1) == prefixSumOffsets.getSize(1));
// One unit of work per element of topQueryToCentroid.
52 int totalSize = topQueryToCentroid.numElements();
// Block size is capped by getMaxThreadsCurrentDevice() (presumably the
// device's per-block thread limit); the block count is rounded up so that
// numBlocks * numThreads covers totalSize.
// NOTE(review): if totalSize is 0 then numThreads is 0 -- check that
// utils::divUp and the launch below tolerate that case.
54 int numThreads = std::min(totalSize, getMaxThreadsCurrentDevice());
55 int numBlocks = utils::divUp(totalSize, numThreads);
// 1-D grid of 1-D blocks.
57 auto grid = dim3(numBlocks);
58 auto block = dim3(numThreads);
// No dynamic shared memory (third launch parameter is 0); runs on `stream`.
// NOTE(review): no launch error check is visible in this extract.
60 getResultLengths<<<grid, block, 0, stream>>>(
// Raw device pointer to the device_vector's storage.
62 listLengths.data().get(),
// Allocator built over the caller's thrustMem buffer and handed to the
// execution policy below -- presumably so the scan's temporary storage comes
// from this buffer instead of a fresh device allocation.
70 GpuResourcesThrustAllocator alloc(thrustMem.data(),
71 thrustMem.getSizeInBytes());
// In-place inclusive scan over the flattened tensor; issued on the same
// stream as the kernel launch above.
73 thrust::inclusive_scan(thrust::cuda::par(alloc).on(stream),
74 prefixSumOffsets.data(),
75 prefixSumOffsets.data() + totalSize,
76 prefixSumOffsets.data());