/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the original angle-bracket contents (#include paths, template
// parameter lists, Tensor<...> arguments and the <<<...>>> launch
// configuration) were stripped from this file, leaving it uncompilable. They
// are reconstructed below to match the upstream faiss GPU sources — confirm
// against the repository copy.

#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/impl/IVFUtils.cuh>
#include <faiss/gpu/utils/DeviceDefs.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Limits.cuh>
#include <faiss/gpu/utils/Select.cuh>
#include <faiss/gpu/utils/Tensor.cuh>

//
// This kernel is split into a separate compilation unit to cut down
// on compile time
//

namespace faiss {
namespace gpu {

// First-pass k-selection over scanned IVF list distances.
//
// Grid layout: blockIdx.y = query id, blockIdx.x = slice id; each block
// k-selects over one contiguous slice of the `nprobe` probed lists for one
// query. Static shared memory: kNumWarps * NumWarpQ floats + ints.
//
// prefixSumOffsets: per-(query, probe) end offsets into `distance`; a 0
//                   value must exist immediately before the array (offset -1)
// distance:         flat distances for all (query, probe) results
// heapDistances / heapIndices: per-(query, slice) k-selected outputs
// Dir == true selects the k largest values, false the k smallest.
template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
__global__ void pass1SelectLists(
        Tensor<int, 2, true> prefixSumOffsets,
        Tensor<float, 1, true> distance,
        int nprobe,
        int k,
        Tensor<float, 3, true> heapDistances,
        Tensor<int, 3, true> heapIndices) {
    constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;

    __shared__ float smemK[kNumWarps * NumWarpQ];
    __shared__ int smemV[kNumWarps * NumWarpQ];

    // Seed the selection heap with the identity for the chosen direction
    constexpr auto kInit = Dir ? kFloatMin : kFloatMax;
    BlockSelect<
            float,
            int,
            Dir,
            Comparator<float>,
            NumWarpQ,
            NumThreadQ,
            ThreadsPerBlock>
            heap(kInit, -1, smemK, smemV, k);

    auto queryId = blockIdx.y;
    auto sliceId = blockIdx.x;
    auto numSlices = gridDim.x;

    // Partition the nprobe lists into numSlices contiguous slices; the last
    // slice absorbs the remainder when nprobe % numSlices != 0
    int sliceSize = (nprobe / numSlices);
    int sliceStart = sliceSize * sliceId;
    int sliceEnd =
            sliceId == (numSlices - 1) ? nprobe : sliceStart + sliceSize;
    auto offsets = prefixSumOffsets[queryId].data();

    // We ensure that before the array (at offset -1), there is a 0 value
    int start = *(&offsets[sliceStart] - 1);
    int end = offsets[sliceEnd - 1];

    int num = end - start;
    int limit = utils::roundDown(num, kWarpSize);

    int i = threadIdx.x;
    auto distanceStart = distance[start].data();

    // BlockSelect add cannot be used in a warp divergent circumstance; we
    // handle the remainder warp below
    for (; i < limit; i += blockDim.x) {
        heap.add(distanceStart[i], start + i);
    }

    // Handle warp divergence separately
    if (i < num) {
        heap.addThreadQ(distanceStart[i], start + i);
    }

    // Merge all final results
    heap.reduce();

    // Write out the final k-selected values; they should be all
    // together
    for (int i = threadIdx.x; i < k; i += blockDim.x) {
        heapDistances[queryId][sliceId][i] = smemK[i];
        heapIndices[queryId][sliceId][i] = smemV[i];
    }
}

// Host-side launcher: picks the pass1SelectLists template instantiation for
// the requested k (NumWarpQ/NumThreadQ sizes) and direction (chooseLargest)
// and launches it asynchronously on `stream`.
void runPass1SelectLists(
        Tensor<int, 2, true>& prefixSumOffsets,
        Tensor<float, 1, true>& distance,
        int nprobe,
        int k,
        bool chooseLargest,
        Tensor<float, 3, true>& heapDistances,
        Tensor<int, 3, true>& heapIndices,
        cudaStream_t stream) {
    // This is caught at a higher level
    FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);

    auto grid = dim3(heapDistances.getSize(1), prefixSumOffsets.getSize(0));

#define RUN_PASS(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR)            \
    do {                                                          \
        pass1SelectLists<BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR>    \
                <<<grid, BLOCK, 0, stream>>>(prefixSumOffsets,    \
                                             distance,            \
                                             nprobe,              \
                                             k,                   \
                                             heapDistances,       \
                                             heapIndices);        \
        CUDA_TEST_ERROR();                                        \
        return; /* success */                                     \
    } while (0)

#if GPU_MAX_SELECTION_K >= 2048

    // block size 128 for k <= 1024, 64 for k = 2048
#define RUN_PASS_DIR(DIR)                \
    do {                                 \
        if (k == 1) {                    \
            RUN_PASS(128, 1, 1, DIR);    \
        } else if (k <= 32) {            \
            RUN_PASS(128, 32, 2, DIR);   \
        } else if (k <= 64) {            \
            RUN_PASS(128, 64, 3, DIR);   \
        } else if (k <= 128) {           \
            RUN_PASS(128, 128, 3, DIR);  \
        } else if (k <= 256) {           \
            RUN_PASS(128, 256, 4, DIR);  \
        } else if (k <= 512) {           \
            RUN_PASS(128, 512, 8, DIR);  \
        } else if (k <= 1024) {          \
            RUN_PASS(128, 1024, 8, DIR); \
        } else if (k <= 2048) {          \
            RUN_PASS(64, 2048, 8, DIR);  \
        }                                \
    } while (0)

#else

#define RUN_PASS_DIR(DIR)                \
    do {                                 \
        if (k == 1) {                    \
            RUN_PASS(128, 1, 1, DIR);    \
        } else if (k <= 32) {            \
            RUN_PASS(128, 32, 2, DIR);   \
        } else if (k <= 64) {            \
            RUN_PASS(128, 64, 3, DIR);   \
        } else if (k <= 128) {           \
            RUN_PASS(128, 128, 3, DIR);  \
        } else if (k <= 256) {           \
            RUN_PASS(128, 256, 4, DIR);  \
        } else if (k <= 512) {           \
            RUN_PASS(128, 512, 8, DIR);  \
        } else if (k <= 1024) {          \
            RUN_PASS(128, 1024, 8, DIR); \
        }                                \
    } while (0)

#endif // GPU_MAX_SELECTION_K

    if (chooseLargest) {
        RUN_PASS_DIR(true);
    } else {
        RUN_PASS_DIR(false);
    }

#undef RUN_PASS_DIR
#undef RUN_PASS
}

} // namespace gpu
} // namespace faiss