10 #include "IVFUtils.cuh"
11 #include "../utils/DeviceUtils.h"
12 #include "../utils/StaticUtils.h"
13 #include "../utils/Tensor.cuh"
14 #include "../utils/ThrustAllocator.cuh"
15 #include <thrust/scan.h>
16 #include <thrust/execution_policy.h>
18 namespace faiss {
namespace gpu {
// Per-element kernel body: each thread handles one (query, probe) slot of
// topQueryToCentroid and writes the length of the inverted list selected for
// that slot into the matching slot of `length`.
//
// topQueryToCentroid : 2-d int tensor; dimension 1 is used as `nprobe` below.
// length             : 2-d int tensor written at the same [queryId][listId]
//                      coordinates that are read from topQueryToCentroid.
// NOTE(review): `listLengths` and `totalSize` are used below but are not
// declared in the signature lines shown here -- presumably additional kernel
// parameters; confirm against the full declaration.
23 getResultLengths(Tensor<int, 2, true> topQueryToCentroid,
26 Tensor<int, 2, true> length) {
// Flat 1-d global thread index built from the x components only.
27 int linearThreadId = blockIdx.x * blockDim.x + threadIdx.x;
// Bounds guard for threads past the last element.
// NOTE(review): the guarded statement is not shown here; presumably an early
// return -- confirm.
28 if (linearThreadId >= totalSize) {
// Split the flat index into (row, column) using the size of dimension 1.
32 int nprobe = topQueryToCentroid.getSize(1);
33 int queryId = linearThreadId / nprobe;
34 int listId = linearThreadId % nprobe;
// Entry stored for this slot; it is used as an index into listLengths unless
// it equals -1.
36 int centroidId = topQueryToCentroid[queryId][listId];
// -1 is treated as "no list": the slot gets length 0 and listLengths is not
// read, which avoids indexing with a negative value.
39 length[queryId][listId] = (centroidId != -1) ? listLengths[centroidId] : 0;
// Host-side driver: launches getResultLengths on `stream`, then runs an
// in-place thrust::inclusive_scan over prefixSumOffsets on the same stream.
//
// topQueryToCentroid : 2-d int tensor; its element count sizes both the launch
//                      and the scan.
// listLengths        : device vector whose raw device pointer is passed to the
//                      kernel.
// prefixSumOffsets   : 2-d int tensor; must match topQueryToCentroid in both
//                      dimensions (asserted below). It is the input and output
//                      range of the scan.
// thrustMem          : byte buffer wrapped in a GpuResourcesThrustAllocator
//                      and given to thrust's execution policy -- presumably as
//                      scratch space for thrust's temporary allocations;
//                      confirm against the allocator's definition.
// stream             : CUDA stream used for the kernel launch and the scan.
42 void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
43 thrust::device_vector<int>& listLengths,
44 Tensor<int, 2, true>& prefixSumOffsets,
45 Tensor<char, 1, true>& thrustMem,
46 cudaStream_t stream) {
// The two tensors must agree in both dimensions.
47 FAISS_ASSERT(topQueryToCentroid.getSize(0) == prefixSumOffsets.getSize(0));
48 FAISS_ASSERT(topQueryToCentroid.getSize(1) == prefixSumOffsets.getSize(1));
// One thread per tensor element.
50 int totalSize = topQueryToCentroid.numElements();
// Block size is capped by getMaxThreadsCurrentDevice(); the block count is
// totalSize divided by the block size via utils::divUp.
// NOTE(review): if totalSize is 0 then numThreads is 0 and is passed to
// utils::divUp as the divisor, and a zero-sized launch would follow -- confirm
// callers never pass an empty tensor or that this case is handled.
52 int numThreads = std::min(totalSize, getMaxThreadsCurrentDevice());
53 int numBlocks = utils::divUp(totalSize, numThreads);
// 1-d grid and 1-d block, matching the x-only indexing in the kernel.
55 auto grid = dim3(numBlocks);
56 auto block = dim3(numThreads);
// Launch on `stream` with 0 bytes of dynamic shared memory.
// NOTE(review): only one launch argument appears in this listing; the rest of
// the argument list is not shown -- confirm against the kernel signature.
58 getResultLengths<<<grid, block, 0, stream>>>(
60 listLengths.data().get(),
// Wrap the caller-supplied buffer so it can be handed to thrust's execution
// policy.
69 GpuResourcesThrustAllocator alloc(thrustMem.data(),
70 thrustMem.getSizeInBytes());
// In-place inclusive prefix sum over totalSize ints starting at
// prefixSumOffsets.data(); input and output iterators are the same pointer.
// Afterwards each element holds the sum of all values up to and including its
// own position in that linear range.
// NOTE(review): treating the 2-d tensor as one linear range of totalSize
// elements assumes its storage is contiguous -- confirm.
72 thrust::inclusive_scan(thrust::cuda::par(alloc).on(stream),
73 prefixSumOffsets.data(),
74 prefixSumOffsets.data() + totalSize,
75 prefixSumOffsets.data());