9 #include "IVFUtils.cuh"
10 #include "../utils/DeviceUtils.h"
11 #include "../utils/StaticUtils.h"
12 #include "../utils/Tensor.cuh"
13 #include "../utils/ThrustAllocator.cuh"
14 #include <thrust/scan.h>
15 #include <thrust/execution_policy.h>
17 namespace faiss {
namespace gpu {
// Kernel (launched with <<<...>>> from runCalcListOffsets): each thread takes
// one flat index, maps it to a [queryId][listId] entry of topQueryToCentroid,
// and stores a list length for that entry at the same position in `length`.
//
// topQueryToCentroid: 2-d int tensor; each entry is used as an index into
//                     listLengths, with -1 handled as a special case.
// length:             2-d int tensor written at [queryId][listId].
// listLengths and totalSize are used in the body; their declarations are not
// on the lines visible in this view.
22 getResultLengths(Tensor<int, 2, true> topQueryToCentroid,
25 Tensor<int, 2, true> length) {
// Flat 1-d index derived from the block and thread indices.
26 int linearThreadId = blockIdx.x * blockDim.x + threadIdx.x;
// Bounds check for indices at or beyond totalSize (the branch body is not
// visible here; presumably an early return -- confirm).
27 if (linearThreadId >= totalSize) {
// Size of the tensor's second dimension; used to split the flat index into
// a row (queryId) and a column (listId).
31 int nprobe = topQueryToCentroid.getSize(1);
32 int queryId = linearThreadId / nprobe;
33 int listId = linearThreadId % nprobe;
35 int centroidId = topQueryToCentroid[queryId][listId];
// A value of -1 is never used to index listLengths; such entries get length 0.
38 length[queryId][listId] = (centroidId != -1) ? listLengths[centroidId] : 0;
// Launches getResultLengths on `stream`, then runs an in-place
// thrust::inclusive_scan over prefixSumOffsets on the same stream.
//
// topQueryToCentroid: 2-d int tensor; both of its dimensions must equal those
//                     of prefixSumOffsets (asserted below).
// listLengths:        device vector whose raw device pointer is passed to the
//                     kernel.
// prefixSumOffsets:   2-d int tensor that the scan reads and overwrites.
//                     Presumably it is also the kernel's `length` output; the
//                     full launch argument list is not visible here -- confirm.
// thrustMem:          byte buffer handed to the Thrust allocator used by the
//                     scan.
// stream:             CUDA stream used for both the kernel launch and the scan.
//
// NOTE(review): only part of this function is visible in this view; anything
// after the scan call is not covered by these comments.
41 void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
42 thrust::device_vector<int>& listLengths,
43 Tensor<int, 2, true>& prefixSumOffsets,
44 Tensor<char, 1, true>& thrustMem,
45 cudaStream_t stream) {
// Input and output must agree in both dimensions.
46 FAISS_ASSERT(topQueryToCentroid.getSize(0) == prefixSumOffsets.getSize(0));
47 FAISS_ASSERT(topQueryToCentroid.getSize(1) == prefixSumOffsets.getSize(1));
// One kernel thread is needed per element of topQueryToCentroid.
49 int totalSize = topQueryToCentroid.numElements();
// Use a single block when everything fits under the device's thread limit;
// otherwise cap the block size and cover the rest with more blocks
// (divUp is presumably a rounding-up division -- confirm).
// NOTE(review): if totalSize is 0, numThreads is 0 and is then used as the
// divisor below and as the block size of the launch -- confirm callers never
// pass an empty tensor, or guard against it.
51 int numThreads = std::min(totalSize, getMaxThreadsCurrentDevice());
52 int numBlocks = utils::divUp(totalSize, numThreads);
// 1-d grid and 1-d block, matching the kernel's 1-d index computation.
54 auto grid = dim3(numBlocks);
55 auto block = dim3(numThreads);
// No dynamic shared memory (third launch parameter is 0).
57 getResultLengths<<<grid, block, 0, stream>>>(
59 listLengths.data().get(),
// Allocator built over the caller-provided thrustMem buffer; presumably this
// lets Thrust take its temporary storage from that buffer -- confirm against
// GpuResourcesThrustAllocator.
68 GpuResourcesThrustAllocator alloc(thrustMem.data(),
69 thrustMem.getSizeInBytes());
// Inclusive scan over one flat range of totalSize ints starting at
// prefixSumOffsets.data(); input and output ranges are the same, so the
// values are replaced by their running totals.
71 thrust::inclusive_scan(thrust::cuda::par(alloc).on(stream),
72 prefixSumOffsets.data(),
73 prefixSumOffsets.data() + totalSize,
74 prefixSumOffsets.data());