12 #include "Distance.cuh"
13 #include "BroadcastSum.cuh"
15 #include "L2Select.cuh"
16 #include "../../FaissAssert.h"
17 #include "../GpuResources.h"
18 #include "../utils/DeviceUtils.h"
19 #include "../utils/Limits.cuh"
20 #include "../utils/MatrixMult.cuh"
21 #include "../utils/BlockSelectKernel.cuh"
24 #include <thrust/fill.h>
25 #include <thrust/for_each.h>
26 #include <thrust/device_ptr.h>
27 #include <thrust/execution_policy.h>
29 namespace faiss {
namespace gpu {
// Baseline number of query rows processed per tile. The distance drivers in
// this file fall back to it when the caller supplies a non-positive tileSize,
// and double it for element types narrower than 4 bytes.
31 constexpr
int kDefaultTileSize = 256;
// Tiled L2 distance + k-selection driver.
//
// NOTE(review): this excerpt is missing several original lines (the template
// header, the `k` / `tileSize` parameters, fill values, the `curStream`
// declaration, closing braces, ...). The comments below only describe what
// the visible lines establish; anything else is marked as an assumption.
//
// Visible flow: validate output shapes, short-circuit on an empty centroid
// set, compute centroid/query norms, then walk the queries in tiles, using
// two scratch distance buffers indexed by `curStream`.
34 void runL2Distance(GpuResources* resources,
35 Tensor<T, 2, true>& centroids,
36 Tensor<T, 1, true>* centroidNorms,
37 Tensor<T, 2, true>& queries,
39 Tensor<T, 2, true>& outDistances,
40 Tensor<int, 2, true>& outIndices,
41 bool ignoreOutDistances =
false,
// Both outputs must have one row per query and exactly k columns.
43 FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
44 FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
45 FAISS_ASSERT(outDistances.getSize(1) == k);
46 FAISS_ASSERT(outIndices.getSize(1) == k);
48 auto& mem = resources->getMemoryManagerCurrentDevice();
49 auto defaultStream = resources->getDefaultStreamCurrentDevice();
// Nothing to search against: fill both outputs on the default stream.
// (The fill values and the early return are not visible in this excerpt.)
52 if (centroids.numElements() == 0) {
53 thrust::fill(thrust::cuda::par.on(defaultStream),
54 outDistances.data(), outDistances.end(),
57 thrust::fill(thrust::cuda::par.on(defaultStream),
58 outIndices.data(), outIndices.end(),
// Locally owned centroid norms: computed with runL2Norm and then used in
// place of the caller's pointer.
// NOTE(review): presumably guarded by a "centroidNorms not supplied" check;
// the guarding condition is not visible here -- confirm.
65 DeviceTensor<T, 1, true> cNorms;
67 cNorms = std::move(DeviceTensor<T, 1, true>(
69 {centroids.getSize(0)}, defaultStream));
70 runL2Norm(centroids, cNorms,
true, defaultStream);
71 centroidNorms = &cNorms;
// Temporary 1-D tensor with one norm entry per query row.
77 int qNormSize[1] = {queries.getSize(0)};
78 DeviceTensor<T, 1, true> queryNorms(mem, qNormSize, defaultStream);
81 runL2Norm(queries, queryNorms,
true, defaultStream);
// k may not exceed the number of centroids, nor 1024.
88 FAISS_ASSERT(k <= centroids.getSize(0));
89 FAISS_ASSERT(k <= 1024);
// Bytes needed for one row of the distance matrix (one value per centroid).
93 size_t distanceRowSize = centroids.getSize(0) *
sizeof(T);
// Element types narrower than 4 bytes get twice the default tile size;
// a non-positive tileSize from the caller selects that default.
96 int defaultTileSize =
sizeof(T) < 4 ? kDefaultTileSize * 2 : kDefaultTileSize;
97 tileSize = tileSize <= 0 ? defaultTileSize : tileSize;
// Never size a tile larger than the total number of queries.
99 int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));
// Two scratch distance matrices (tile rows x number of centroids); the loop
// below picks one per iteration via curStream.
102 DeviceTensor<T, 2, true> distanceBuf1(
103 mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
104 DeviceTensor<T, 2, true> distanceBuf2(
105 mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
106 DeviceTensor<T, 2, true>* distanceBufs[2] =
107 {&distanceBuf1, &distanceBuf2};
// NOTE(review): presumably orders the alternate streams after work already
// queued on the default stream -- confirm against streamWait's definition.
109 auto streams = resources->getAlternateStreamsCurrentDevice();
110 streamWait(streams, {defaultStream});
// Walk the queries tile by tile; the final tile may be shorter.
114 for (
int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) {
115 int numQueriesForIteration = std::min(maxQueriesPerIteration,
116 queries.getSize(0) - i);
// Row-range views for this tile: the scratch buffer is always narrowed from
// row 0, while queries / outputs / norms are narrowed starting at row i.
118 auto distanceBufView =
119 distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
121 queries.narrowOutermost(i, numQueriesForIteration);
122 auto outDistanceView =
123 outDistances.narrowOutermost(i, numQueriesForIteration);
125 outIndices.narrowOutermost(i, numQueriesForIteration);
127 queryNorms.narrowOutermost(i, numQueriesForIteration);
// Matrix multiply into the scratch view using the device BLAS handle
// (the operand arguments are not visible in this excerpt).
133 runMatrixMult(distanceBufView,
false,
137 resources->getBlasHandleCurrentDevice(),
// Selection step over the scratch distances (remaining arguments not visible).
144 runL2SelectMin(distanceBufView,
// Skipped entirely when the caller does not need the distance values.
// `queryNormNiew` [sic] is presumably the narrowed query-norm view created
// above; its declaration line is not visible here.
151 if (!ignoreOutDistances) {
154 runSumAlongRows(queryNormNiew, outDistanceView, streams[curStream]);
// Alternate between the two scratch buffers / stream slots.
157 curStream = (curStream + 1) % 2;
// NOTE(review): presumably makes the default stream wait for the alternate
// streams before returning -- confirm against streamWait's definition.
161 streamWait({defaultStream}, streams);
// Tiled inner-product + k-selection driver, structured like the L2 driver in
// this file but without any norm computation.
//
// NOTE(review): this excerpt is missing several original lines (the `k` /
// `tileSize` parameters, some fill values, the `curStream` declaration,
// closing braces, ...). Comments describe only what the visible lines show.
164 template <
typename T>
165 void runIPDistance(GpuResources* resources,
166 Tensor<T, 2, true>& centroids,
167 Tensor<T, 2, true>& queries,
169 Tensor<T, 2, true>& outDistances,
170 Tensor<int, 2, true>& outIndices,
// Both outputs must have one row per query and exactly k columns.
172 FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
173 FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
174 FAISS_ASSERT(outDistances.getSize(1) == k);
175 FAISS_ASSERT(outIndices.getSize(1) == k);
177 auto& mem = resources->getMemoryManagerCurrentDevice();
178 auto defaultStream = resources->getDefaultStreamCurrentDevice();
// Nothing to search against: fill both outputs on the default stream.
// NOTE(review): distances are filled with Limits<T>::getMax(). If this path
// selects the largest inner products, a max-valued sentinel for "no result"
// looks questionable -- confirm whether getMin() was intended.
181 if (centroids.numElements() == 0) {
182 thrust::fill(thrust::cuda::par.on(defaultStream),
183 outDistances.data(), outDistances.end(),
184 Limits<T>::getMax());
186 thrust::fill(thrust::cuda::par.on(defaultStream),
187 outIndices.data(), outIndices.end(),
// k may not exceed the number of centroids, nor 1024.
198 FAISS_ASSERT(k <= centroids.getSize(0));
199 FAISS_ASSERT(k <= 1024);
// Bytes needed for one row of the distance matrix (one value per centroid).
203 size_t distanceRowSize = centroids.getSize(0) *
sizeof(T);
// Element types narrower than 4 bytes get twice the default tile size;
// a non-positive tileSize from the caller selects that default.
206 int defaultTileSize =
sizeof(T) < 4 ? kDefaultTileSize * 2 : kDefaultTileSize;
207 tileSize = tileSize <= 0 ? defaultTileSize : tileSize;
// Never size a tile larger than the total number of queries.
209 int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));
// Two scratch distance matrices (tile rows x number of centroids); the loop
// below picks one per iteration via curStream.
212 DeviceTensor<T, 2, true> distanceBuf1(
213 mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
214 DeviceTensor<T, 2, true> distanceBuf2(
215 mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
216 DeviceTensor<T, 2, true>* distanceBufs[2] =
217 {&distanceBuf1, &distanceBuf2};
// NOTE(review): presumably orders the alternate streams after work already
// queued on the default stream -- confirm against streamWait's definition.
219 auto streams = resources->getAlternateStreamsCurrentDevice();
220 streamWait(streams, {defaultStream});
// Walk the queries tile by tile; the final tile may be shorter.
224 for (
int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) {
225 int numQueriesForIteration = std::min(maxQueriesPerIteration,
226 queries.getSize(0) - i);
// Row-range views for this tile: the scratch buffer is always narrowed from
// row 0, while queries / outputs are narrowed starting at row i.
228 auto distanceBufView =
229 distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
231 queries.narrowOutermost(i, numQueriesForIteration);
232 auto outDistanceView =
233 outDistances.narrowOutermost(i, numQueriesForIteration);
235 outIndices.narrowOutermost(i, numQueriesForIteration);
// Multiply the query tile by the centroids into the scratch view; the `true`
// flag follows `centroids` and the `false` flags follow the other operands
// (presumably transpose flags -- confirm against runMatrixMult).
238 runMatrixMult(distanceBufView,
false,
239 queryView,
false, centroids,
true,
241 resources->getBlasHandleCurrentDevice(),
// k-selection over the scratch values on this iteration's stream. The `true`
// argument presumably selects the largest values -- confirm against
// runBlockSelect's signature.
246 runBlockSelect(distanceBufView,
249 true, k, streams[curStream]);
// Alternate between the two scratch buffers / stream slots.
251 curStream = (curStream + 1) % 2;
// NOTE(review): presumably makes the default stream wait for the alternate
// streams before returning -- confirm against streamWait's definition.
254 streamWait({defaultStream}, streams);
// float overload of runIPDistance: forwards to the runIPDistance<float>
// template. (The return type line and the rest of the forwarded argument list
// are not visible in this excerpt.)
262 runIPDistance(GpuResources* resources,
263 Tensor<float, 2, true>& vectors,
264 Tensor<float, 2, true>& queries,
266 Tensor<float, 2, true>& outDistances,
267 Tensor<int, 2, true>& outIndices,
269 runIPDistance<float>(resources,
278 #ifdef FAISS_USE_FLOAT16
// half overload of runIPDistance, compiled only when FAISS_USE_FLOAT16 is
// defined: forwards to the runIPDistance<half> template. (The return type line
// and the rest of the forwarded argument list are not visible in this excerpt.)
280 runIPDistance(GpuResources* resources,
281 Tensor<half, 2, true>& vectors,
282 Tensor<half, 2, true>& queries,
284 Tensor<half, 2, true>& outDistances,
285 Tensor<int, 2, true>& outIndices,
287 runIPDistance<half>(resources,
// float overload of runL2Distance: forwards to the runL2Distance<float>
// template. `vectorNorms` is taken by pointer, matching the template's
// `centroidNorms` parameter. (The return type line and the rest of the
// forwarded argument list are not visible in this excerpt.)
298 runL2Distance(GpuResources* resources,
299 Tensor<float, 2, true>& vectors,
300 Tensor<float, 1, true>* vectorNorms,
301 Tensor<float, 2, true>& queries,
303 Tensor<float, 2, true>& outDistances,
304 Tensor<int, 2, true>& outIndices,
305 bool ignoreOutDistances,
307 runL2Distance<float>(resources,
318 #ifdef FAISS_USE_FLOAT16
320 runL2Distance(GpuResources* resources,
321 Tensor<half, 2, true>& vectors,
322 Tensor<half, 1, true>* vectorNorms,
323 Tensor<half, 2, true>& queries,
325 Tensor<half, 2, true>& outDistances,
326 Tensor<int, 2, true>& outIndices,
327 bool ignoreOutDistances,
329 runL2Distance<half>(resources,