12 #include "Distance.cuh"
13 #include "BroadcastSum.cuh"
15 #include "L2Select.cuh"
16 #include "../../FaissAssert.h"
17 #include "../GpuResources.h"
18 #include "../utils/DeviceUtils.h"
19 #include "../utils/Limits.cuh"
20 #include "../utils/MatrixMult.cuh"
21 #include "../utils/BlockSelectKernel.cuh"
24 #include <thrust/fill.h>
25 #include <thrust/for_each.h>
26 #include <thrust/device_ptr.h>
27 #include <thrust/execution_policy.h>
29 namespace faiss {
namespace gpu {
// Tile size, in query rows, that the distance routines in this file fall back
// to when the caller passes tileSize <= 0. Those routines double it for
// element types narrower than 4 bytes.
31 constexpr
int kDefaultTileSize = 256;
// Tiled L2 search of `queries` against `centroids`. For each tile of query
// rows it runs runMatrixMult into a scratch buffer, then runL2SelectMin, and
// (unless ignoreOutDistances is set) runSumAlongRows with the query norms.
// Tiles alternate between two streams / two scratch buffers.
//
// NOTE(review): this listing has gaps. The template header, some parameters
// (`k` and `tileSize` are used but not declared in the visible lines), several
// call arguments, the left-hand side of some `auto` declarations and the
// closing braces are not visible. The comments below only describe the
// statements that are shown; anything else is marked as an assumption.
34 void runL2Distance(GpuResources* resources,
35 Tensor<T, 2, true>& centroids,
// Optional: when non-null it is handed to runMatrixMult in place of
// `centroids` (with the accompanying flag false instead of true).
36 Tensor<T, 2, true>* centroidsTransposed,
// Optional centroid norms; re-pointed at a locally computed tensor below.
// NOTE(review): the condition guarding that computation is not visible here.
37 Tensor<T, 1, true>* centroidNorms,
38 Tensor<T, 2, true>& queries,
40 Tensor<T, 2, true>& outDistances,
41 Tensor<int, 2, true>& outIndices,
// When true, the runSumAlongRows step inside the tile loop is skipped.
42 bool ignoreOutDistances =
false,
// Output tensors must have one row per query and k columns.
44 FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
45 FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
46 FAISS_ASSERT(outDistances.getSize(1) == k);
47 FAISS_ASSERT(outIndices.getSize(1) == k);
// Memory manager and default stream of the current device; temporaries below
// are constructed from `mem` and `defaultStream`.
49 auto& mem = resources->getMemoryManagerCurrentDevice();
50 auto defaultStream = resources->getDefaultStreamCurrentDevice();
// Empty centroid set: fill both outputs on defaultStream.
// NOTE(review): the fill values and the rest of this branch (presumably an
// early return) are not visible in this listing.
53 if (centroids.numElements() == 0) {
54 thrust::fill(thrust::cuda::par.on(defaultStream),
55 outDistances.data(), outDistances.end(),
58 thrust::fill(thrust::cuda::par.on(defaultStream),
59 outIndices.data(), outIndices.end(),
// Local storage for centroid norms: one entry per centroid row, filled by
// runL2Norm, after which centroidNorms points at it.
66 DeviceTensor<T, 1, true> cNorms;
68 cNorms = std::move(DeviceTensor<T, 1, true>(
70 {centroids.getSize(0)}, defaultStream));
71 runL2Norm(centroids, cNorms,
true, defaultStream);
72 centroidNorms = &cNorms;
// Query norms: one entry per query row, computed by runL2Norm on defaultStream.
78 int qNormSize[1] = {queries.getSize(0)};
79 DeviceTensor<T, 1, true> queryNorms(mem, qNormSize, defaultStream);
82 runL2Norm(queries, queryNorms,
true, defaultStream);
// k may not exceed the number of centroids, nor 1024.
89 FAISS_ASSERT(k <= centroids.getSize(0));
90 FAISS_ASSERT(k <= 1024);
// Size in bytes of centroids.getSize(0) elements of T.
94 size_t distanceRowSize = centroids.getSize(0) *
sizeof(T);
// Default tile size is doubled for element types narrower than 4 bytes; a
// caller-supplied tileSize <= 0 selects the default.
97 int defaultTileSize =
sizeof(T) < 4 ? kDefaultTileSize * 2 : kDefaultTileSize;
98 tileSize = tileSize <= 0 ? defaultTileSize : tileSize;
// Never process more query rows per tile than there are queries.
100 int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));
// Two scratch buffers of maxQueriesPerIteration x centroids.getSize(0),
// selected inside the loop via distanceBufs[curStream].
103 DeviceTensor<T, 2, true> distanceBuf1(
104 mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
105 DeviceTensor<T, 2, true> distanceBuf2(
106 mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
107 DeviceTensor<T, 2, true>* distanceBufs[2] =
108 {&distanceBuf1, &distanceBuf2};
// NOTE(review): presumably makes the alternate streams wait on work already
// queued on defaultStream -- confirm against streamWait's definition.
110 auto streams = resources->getAlternateStreamsCurrentDevice();
111 streamWait(streams, {defaultStream});
// Walk the queries in tiles of maxQueriesPerIteration rows; the last tile may
// be shorter. NOTE(review): the declaration of curStream is not visible here.
115 for (
int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) {
116 int numQueriesForIteration = std::min(maxQueriesPerIteration,
117 queries.getSize(0) - i);
// Views restricted to this tile: the scratch buffer is narrowed from row 0,
// the query / output / norm tensors from row i.
119 auto distanceBufView =
120 distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
122 queries.narrowOutermost(i, numQueriesForIteration);
123 auto outDistanceView =
124 outDistances.narrowOutermost(i, numQueriesForIteration);
126 outIndices.narrowOutermost(i, numQueriesForIteration);
128 queryNorms.narrowOutermost(i, numQueriesForIteration);
// Matrix multiply into the scratch view. Uses *centroidsTransposed with flag
// false when provided, otherwise centroids with flag true.
// NOTE(review): presumably the flag is a transpose switch; the remaining
// arguments are not visible in this listing.
134 runMatrixMult(distanceBufView,
false,
136 centroidsTransposed ? *centroidsTransposed : centroids,
137 centroidsTransposed ?
false :
true,
139 resources->getBlasHandleCurrentDevice(),
// Selection step over the scratch view (remaining arguments not visible).
146 runL2SelectMin(distanceBufView,
// Skipped when the caller does not need the distances. The spelling
// `queryNormNiew` is as in the source; its declaration is not visible here.
153 if (!ignoreOutDistances) {
156 runSumAlongRows(queryNormNiew, outDistanceView, streams[curStream]);
// Alternate between the two streams / scratch buffers for the next tile.
159 curStream = (curStream + 1) % 2;
// NOTE(review): presumably makes defaultStream wait for the alternate streams
// to finish -- confirm against streamWait's definition.
163 streamWait({defaultStream}, streams);
// Tiled search of `queries` against `centroids` (IP presumably stands for
// inner product). For each tile of query rows it runs runMatrixMult into a
// scratch buffer and then runBlockSelect on that buffer. Tiles alternate
// between two streams / two scratch buffers.
//
// NOTE(review): this listing has gaps. Some parameters (`k` and `tileSize` are
// used but not declared in the visible lines), several call arguments, the
// left-hand side of some `auto` declarations and the closing braces are not
// visible. The comments below only describe the statements that are shown.
166 template <
typename T>
167 void runIPDistance(GpuResources* resources,
168 Tensor<T, 2, true>& centroids,
// Optional: when non-null it is handed to runMatrixMult in place of
// `centroids` (with the accompanying flag false instead of true).
169 Tensor<T, 2, true>* centroidsTransposed,
170 Tensor<T, 2, true>& queries,
172 Tensor<T, 2, true>& outDistances,
173 Tensor<int, 2, true>& outIndices,
// Output tensors must have one row per query and k columns.
175 FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
176 FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
177 FAISS_ASSERT(outDistances.getSize(1) == k);
178 FAISS_ASSERT(outIndices.getSize(1) == k);
// Memory manager and default stream of the current device; the scratch buffers
// below are constructed from `mem` and `defaultStream`.
180 auto& mem = resources->getMemoryManagerCurrentDevice();
181 auto defaultStream = resources->getDefaultStreamCurrentDevice();
// Empty centroid set: fill outDistances with Limits<T>::getMax() and fill
// outIndices, both on defaultStream.
// NOTE(review): the index fill value and the rest of this branch (presumably
// an early return) are not visible in this listing.
184 if (centroids.numElements() == 0) {
185 thrust::fill(thrust::cuda::par.on(defaultStream),
186 outDistances.data(), outDistances.end(),
187 Limits<T>::getMax());
189 thrust::fill(thrust::cuda::par.on(defaultStream),
190 outIndices.data(), outIndices.end(),
// k may not exceed the number of centroids, nor 1024.
201 FAISS_ASSERT(k <= centroids.getSize(0));
202 FAISS_ASSERT(k <= 1024);
// Size in bytes of centroids.getSize(0) elements of T.
206 size_t distanceRowSize = centroids.getSize(0) *
sizeof(T);
// Default tile size is doubled for element types narrower than 4 bytes; a
// caller-supplied tileSize <= 0 selects the default.
209 int defaultTileSize =
sizeof(T) < 4 ? kDefaultTileSize * 2 : kDefaultTileSize;
210 tileSize = tileSize <= 0 ? defaultTileSize : tileSize;
// Never process more query rows per tile than there are queries.
212 int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));
// Two scratch buffers of maxQueriesPerIteration x centroids.getSize(0),
// selected inside the loop via distanceBufs[curStream].
215 DeviceTensor<T, 2, true> distanceBuf1(
216 mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
217 DeviceTensor<T, 2, true> distanceBuf2(
218 mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
219 DeviceTensor<T, 2, true>* distanceBufs[2] =
220 {&distanceBuf1, &distanceBuf2};
// NOTE(review): presumably makes the alternate streams wait on work already
// queued on defaultStream -- confirm against streamWait's definition.
222 auto streams = resources->getAlternateStreamsCurrentDevice();
223 streamWait(streams, {defaultStream});
// Walk the queries in tiles of maxQueriesPerIteration rows; the last tile may
// be shorter. NOTE(review): the declaration of curStream is not visible here.
227 for (
int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) {
228 int numQueriesForIteration = std::min(maxQueriesPerIteration,
229 queries.getSize(0) - i);
// Views restricted to this tile: the scratch buffer is narrowed from row 0,
// the query / output tensors from row i.
231 auto distanceBufView =
232 distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
234 queries.narrowOutermost(i, numQueriesForIteration);
235 auto outDistanceView =
236 outDistances.narrowOutermost(i, numQueriesForIteration);
238 outIndices.narrowOutermost(i, numQueriesForIteration);
// Matrix multiply into the scratch view. Uses *centroidsTransposed with flag
// false when provided, otherwise centroids with flag true.
// NOTE(review): presumably the flag is a transpose switch; the remaining
// arguments are not visible in this listing.
241 runMatrixMult(distanceBufView,
false,
243 centroidsTransposed ? *centroidsTransposed : centroids,
244 centroidsTransposed ?
false :
true,
246 resources->getBlasHandleCurrentDevice(),
// Selection of k entries from the scratch view on this tile's stream.
// NOTE(review): the middle arguments are not visible, and the meaning of the
// literal `true` should be confirmed against runBlockSelect's declaration.
251 runBlockSelect(distanceBufView,
254 true, k, streams[curStream]);
// Alternate between the two streams / scratch buffers for the next tile.
256 curStream = (curStream + 1) % 2;
// NOTE(review): presumably makes defaultStream wait for the alternate streams
// to finish -- confirm against streamWait's definition.
259 streamWait({defaultStream}, streams);
// Non-template float entry point: forwards to runIPDistance<float>.
// NOTE(review): the return type, the trailing parameters and the rest of the
// forwarded call are not visible in this listing.
267 runIPDistance(GpuResources* resources,
268 Tensor<float, 2, true>& vectors,
269 Tensor<float, 2, true>* vectorsTransposed,
270 Tensor<float, 2, true>& queries,
272 Tensor<float, 2, true>& outDistances,
273 Tensor<int, 2, true>& outIndices,
275 runIPDistance<float>(resources,
285 #ifdef FAISS_USE_FLOAT16
// Non-template half entry point: forwards to runIPDistance<half>. It follows
// an `#ifdef FAISS_USE_FLOAT16` line in the file.
// NOTE(review): the return type, the trailing parameters and the rest of the
// forwarded call are not visible in this listing.
287 runIPDistance(GpuResources* resources,
288 Tensor<half, 2, true>& vectors,
289 Tensor<half, 2, true>* vectorsTransposed,
290 Tensor<half, 2, true>& queries,
292 Tensor<half, 2, true>& outDistances,
293 Tensor<int, 2, true>& outIndices,
295 runIPDistance<half>(resources,
// Non-template float entry point: forwards to runL2Distance<float>. Unlike the
// template's declaration, ignoreOutDistances carries no default value here.
// NOTE(review): the return type, the trailing parameters and the rest of the
// forwarded call are not visible in this listing.
307 runL2Distance(GpuResources* resources,
308 Tensor<float, 2, true>& vectors,
309 Tensor<float, 2, true>* vectorsTransposed,
310 Tensor<float, 1, true>* vectorNorms,
311 Tensor<float, 2, true>& queries,
313 Tensor<float, 2, true>& outDistances,
314 Tensor<int, 2, true>& outIndices,
315 bool ignoreOutDistances,
317 runL2Distance<float>(resources,
329 #ifdef FAISS_USE_FLOAT16
331 runL2Distance(GpuResources* resources,
332 Tensor<half, 2, true>& vectors,
333 Tensor<half, 2, true>* vectorsTransposed,
334 Tensor<half, 1, true>* vectorNorms,
335 Tensor<half, 2, true>& queries,
337 Tensor<half, 2, true>& outDistances,
338 Tensor<int, 2, true>& outIndices,
339 bool ignoreOutDistances,
341 runL2Distance<half>(resources,