Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
IVFUtils.cu
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #include "IVFUtils.cuh"
12 #include "../utils/DeviceUtils.h"
13 #include "../utils/StaticUtils.h"
14 #include "../utils/Tensor.cuh"
15 #include "../utils/ThrustAllocator.cuh"
16 #include <thrust/scan.h>
17 #include <thrust/execution_policy.h>
18 
19 namespace faiss { namespace gpu {
20 
// For every (query, probe) entry of `topQueryToCentroid`, looks up the
// length of the list that entry refers to in `listLengths` and stores it
// at the same position of `length`.
//
// One thread handles one entry, addressed by its linear thread index in a
// 1-d launch; threads whose linear index is >= `totalSize` do nothing.
__global__ void
getResultLengths(Tensor<int, 2, true> topQueryToCentroid,
                 int* listLengths,
                 int totalSize,
                 Tensor<int, 2, true> length) {
  int entry = blockIdx.x * blockDim.x + threadIdx.x;

  if (entry < totalSize) {
    // Split the linear index into (row, column) coordinates; the column
    // count is the number of probes stored per query
    int probesPerQuery = topQueryToCentroid.getSize(1);
    int row = entry / probesPerQuery;
    int col = entry % probesPerQuery;

    int list = topQueryToCentroid[row][col];

    // An id of -1 means no list was produced for this entry (e.g., NaNs
    // in the input); such an entry contributes no results
    int numResults = 0;
    if (list != -1) {
      numResults = listLengths[list];
    }

    length[row][col] = numResults;
  }
}
42 
// Fills `prefixSumOffsets` with the inclusive prefix sum of the lengths of
// the lists referenced by `topQueryToCentroid`.
//
// Step 1: a kernel writes, for each (query, probe) entry, the length of the
//         referenced list (0 for an entry whose list id is -1) into the same
//         position of `prefixSumOffsets`.
// Step 2: an in-place inclusive scan runs over the `totalSize` ints starting
//         at `prefixSumOffsets.data()`. It is a single scan over the whole
//         buffer, so the running sum is not restarted per query.
//
// Parameters:
//   topQueryToCentroid  list id chosen for each (query, probe) pair
//   listLengths         device vector indexed by list id
//   prefixSumOffsets    output; must have the same sizes as
//                       `topQueryToCentroid` in both dimensions (asserted)
//   thrustMem           scratch buffer handed to Thrust for its temporary
//                       allocations
//   stream              stream on which the kernel and the scan are issued
//
// If `topQueryToCentroid` has no elements, the function returns without
// launching anything and leaves `prefixSumOffsets` untouched.
void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
                        thrust::device_vector<int>& listLengths,
                        Tensor<int, 2, true>& prefixSumOffsets,
                        Tensor<char, 1, true>& thrustMem,
                        cudaStream_t stream) {
  FAISS_ASSERT(topQueryToCentroid.getSize(0) == prefixSumOffsets.getSize(0));
  FAISS_ASSERT(topQueryToCentroid.getSize(1) == prefixSumOffsets.getSize(1));

  int totalSize = topQueryToCentroid.numElements();

  // Empty input: there is nothing to compute. Without this guard,
  // numThreads below would be 0, which makes the divUp call divide by zero
  // and would request a kernel launch with zero-sized grid/block dimensions.
  if (totalSize == 0) {
    return;
  }

  // One thread per (query, probe) entry
  int numThreads = std::min(totalSize, getMaxThreadsCurrentDevice());
  int numBlocks = utils::divUp(totalSize, numThreads);

  auto grid = dim3(numBlocks);
  auto block = dim3(numThreads);

  getResultLengths<<<grid, block, 0, stream>>>(
    topQueryToCentroid,
    listLengths.data().get(),
    totalSize,
    prefixSumOffsets);
  CUDA_TEST_ERROR();

  // Prefix sum of the indices, so we know where the intermediate
  // results should be maintained
  // Thrust wants a place for its temporary allocations, so provide
  // one, so it won't call cudaMalloc/Free
  GpuResourcesThrustAllocator alloc(thrustMem.data(),
                                    thrustMem.getSizeInBytes());

  thrust::inclusive_scan(thrust::cuda::par(alloc).on(stream),
                         prefixSumOffsets.data(),
                         prefixSumOffsets.data() + totalSize,
                         prefixSumOffsets.data());
  CUDA_TEST_ERROR();
}
79 
80 } } // namespace