// Distance.cu
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved.
11 
12 #include "Distance.cuh"
13 #include "BroadcastSum.cuh"
14 #include "L2Norm.cuh"
15 #include "L2Select.cuh"
16 #include "../../FaissAssert.h"
17 #include "../GpuResources.h"
18 #include "../utils/DeviceUtils.h"
19 #include "../utils/Limits.cuh"
20 #include "../utils/MatrixMult.cuh"
21 #include "../utils/BlockSelectKernel.cuh"
22 
23 #include <memory>
24 #include <thrust/fill.h>
25 #include <thrust/for_each.h>
26 #include <thrust/device_ptr.h>
27 #include <thrust/execution_policy.h>
28 
29 namespace faiss { namespace gpu {
30 
namespace {

// Default number of query rows processed per tile when no override is
// given; doubled for sub-4-byte element types (e.g. half) below.
constexpr int kDefaultTileSize = 256;

// Chooses how many query rows to process per tiling iteration of the
// distance computation, so that the two ping-pong temporary distance
// buffers (each tileSize x numCentroids elements of T) fit within the
// available temporary memory.
//
// tileSizeOverride: if > 0, returned unconditionally (caller forces it).
// numCentroids: number of rows in the set being searched (columns of
//   each temporary distance buffer).
// tempMemAvailable: bytes of temporary memory currently available.
//
// Returns a tile size in [64, kDefaultTileSize * 2]; 64 is the floor
// even if the resulting buffers would exceed tempMemAvailable.
template <typename T>
int chooseTileSize(int tileSizeOverride,
                   size_t numCentroids,
                   size_t tempMemAvailable) {
  if (tileSizeOverride > 0) {
    return tileSizeOverride;
  }

  // Smaller element types take less space per row, so we can afford
  // twice as many rows per tile
  size_t tileSize =
    sizeof(T) < 4 ? kDefaultTileSize * 2 : kDefaultTileSize;

  while (tileSize > 64) {
    // Two ping-pong buffers of (tileSize x numCentroids) elements each
    size_t memRequirement = 2 * tileSize * numCentroids * sizeof(T);

    if (memRequirement <= tempMemAvailable) {
      // This fits entirely into our temporary memory; cast is safe
      // since tileSize <= kDefaultTileSize * 2 here
      return (int) tileSize;
    }

    // Otherwise, halve the tile size
    tileSize /= 2;
  }

  // We use 64 as the minimum acceptable tile size
  FAISS_ASSERT(tileSize >= 64);

  // FIXME: if we're running with no available temp memory, do we try
  // and go larger based on free memory available on the device?

  // Explicit narrowing: tileSize is exactly 64 here
  return (int) tileSize;
}

}
68 
// Computes the k nearest centroids (smallest L2 distance) for each
// query row, using the decomposition
//   ||q - c||^2 = ||c||^2 - 2 q.c + ||q||^2
// Tiles over the query rows to bound temporary memory, ping-ponging
// between two streams/buffers so the GEMM for one tile can overlap the
// k-selection of the previous one.
//
// centroids: (numCentroids x dim) set being searched.
// centroidsTransposed: optional pre-transposed copy of `centroids`;
//   used for the GEMM when non-null (avoids an implicit transpose).
// centroidNorms: optional pre-computed ||c||^2 per centroid; computed
//   here when null.
// queries: (numQueries x dim) query vectors.
// k: results per query; must satisfy k <= numCentroids and k <= 1024.
// outDistances / outIndices: (numQueries x k) outputs; filled with
//   Limits<T>::getMax() / -1 when the centroid set is empty.
// ignoreOutDistances: if true, skips adding the ||q||^2 term to the
//   reported distances (ranking is unaffected, since ||q||^2 is
//   constant within a query row).
// tileSizeOverride: forces the tiling size if > 0.
template <typename T>
void runL2Distance(GpuResources* resources,
                   Tensor<T, 2, true>& centroids,
                   Tensor<T, 2, true>* centroidsTransposed,
                   Tensor<T, 1, true>* centroidNorms,
                   Tensor<T, 2, true>& queries,
                   int k,
                   Tensor<T, 2, true>& outDistances,
                   Tensor<int, 2, true>& outIndices,
                   bool ignoreOutDistances = false,
                   int tileSizeOverride = -1) {
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outDistances.getSize(1) == k);
  FAISS_ASSERT(outIndices.getSize(1) == k);

  auto& mem = resources->getMemoryManagerCurrentDevice();
  auto defaultStream = resources->getDefaultStreamCurrentDevice();

  // If we're querying against a 0 sized set, just return empty results
  if (centroids.numElements() == 0) {
    thrust::fill(thrust::cuda::par.on(defaultStream),
                 outDistances.data(), outDistances.end(),
                 Limits<T>::getMax());

    thrust::fill(thrust::cuda::par.on(defaultStream),
                 outIndices.data(), outIndices.end(),
                 -1);

    return;
  }

  // If ||c||^2 is not pre-computed, calculate it
  DeviceTensor<T, 1, true> cNorms;
  if (!centroidNorms) {
    // Assign directly from the temporary; std::move on a prvalue is
    // redundant and can inhibit copy elision
    cNorms = DeviceTensor<T, 1, true>(
      mem,
      {centroids.getSize(0)}, defaultStream);
    runL2Norm(centroids, cNorms, true, defaultStream);
    centroidNorms = &cNorms;
  }

  //
  // Prepare norm vector ||q||^2; ||c||^2 is already pre-computed
  //
  int qNormSize[1] = {queries.getSize(0)};
  DeviceTensor<T, 1, true> queryNorms(mem, qNormSize, defaultStream);

  // ||q||^2
  runL2Norm(queries, queryNorms, true, defaultStream);

  //
  // Handle the problem in row tiles, to avoid excessive temporary
  // memory requests
  //

  FAISS_ASSERT(k <= centroids.getSize(0));
  FAISS_ASSERT(k <= 1024); // select limitation

  int tileSize =
    chooseTileSize<T>(
      tileSizeOverride,
      centroids.getSize(0),
      resources->getMemoryManagerCurrentDevice().getSizeAvailable());

  int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));

  // Temporary output memory space we'll use; two buffers so successive
  // tiles on alternate streams don't clobber each other
  DeviceTensor<T, 2, true> distanceBuf1(
    mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
  DeviceTensor<T, 2, true> distanceBuf2(
    mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
  DeviceTensor<T, 2, true>* distanceBufs[2] =
    {&distanceBuf1, &distanceBuf2};

  // Alternate streams must see all prior work on the default stream
  auto streams = resources->getAlternateStreamsCurrentDevice();
  streamWait(streams, {defaultStream});

  int curStream = 0;

  for (int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) {
    // The final tile may be smaller than maxQueriesPerIteration
    int numQueriesForIteration = std::min(maxQueriesPerIteration,
                                          queries.getSize(0) - i);

    auto distanceBufView =
      distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
    auto queryView =
      queries.narrowOutermost(i, numQueriesForIteration);
    auto outDistanceView =
      outDistances.narrowOutermost(i, numQueriesForIteration);
    auto outIndexView =
      outIndices.narrowOutermost(i, numQueriesForIteration);
    auto queryNormView =
      queryNorms.narrowOutermost(i, numQueriesForIteration);

    // L2 distance is ||c||^2 - 2qc + ||q||^2

    // -2qc
    // (query id x dim) x (centroid id, dim)' = (query id, centroid id)
    runMatrixMult(distanceBufView, false,
                  queryView, false,
                  centroidsTransposed ? *centroidsTransposed : centroids,
                  centroidsTransposed ? false : true,
                  -2.0f, 0.0f,
                  resources->getBlasHandleCurrentDevice(),
                  streams[curStream]);

    // For L2 distance, we use this fused kernel that performs both
    // adding ||c||^2 to -2qc and k-selection, so we only need two
    // passes (one write by the gemm, one read here) over the huge
    // region of output memory
    runL2SelectMin(distanceBufView,
                   *centroidNorms,
                   outDistanceView,
                   outIndexView,
                   k,
                   streams[curStream]);

    if (!ignoreOutDistances) {
      // expand (query id) to (query id, k) by duplicating along rows
      // top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
      runSumAlongRows(queryNormView, outDistanceView, streams[curStream]);
    }

    curStream = (curStream + 1) % 2;
  }

  // Have the desired ordering stream wait on the multi-stream
  streamWait({defaultStream}, streams);
}
199 
// Computes the k centroids with the largest inner product against each
// query row. Tiles over the query rows to bound temporary memory,
// ping-ponging between two streams/buffers so the GEMM for one tile
// can overlap the k-selection of the previous one.
//
// centroids: (numCentroids x dim) set being searched.
// centroidsTransposed: optional pre-transposed copy of `centroids`;
//   used for the GEMM when non-null (avoids an implicit transpose).
// queries: (numQueries x dim) query vectors.
// k: results per query; must satisfy k <= numCentroids and k <= 1024.
// outDistances / outIndices: (numQueries x k) outputs; filled with
//   Limits<T>::getMax() / -1 when the centroid set is empty.
// tileSizeOverride: forces the tiling size if > 0.
template <typename T>
void runIPDistance(GpuResources* resources,
                   Tensor<T, 2, true>& centroids,
                   Tensor<T, 2, true>* centroidsTransposed,
                   Tensor<T, 2, true>& queries,
                   int k,
                   Tensor<T, 2, true>& outDistances,
                   Tensor<int, 2, true>& outIndices,
                   int tileSizeOverride = -1) {
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outDistances.getSize(1) == k);
  FAISS_ASSERT(outIndices.getSize(1) == k);

  auto& mem = resources->getMemoryManagerCurrentDevice();
  auto defaultStream = resources->getDefaultStreamCurrentDevice();

  // If we're querying against a 0 sized set, just return empty results
  if (centroids.numElements() == 0) {
    thrust::fill(thrust::cuda::par.on(defaultStream),
                 outDistances.data(), outDistances.end(),
                 Limits<T>::getMax());

    thrust::fill(thrust::cuda::par.on(defaultStream),
                 outIndices.data(), outIndices.end(),
                 -1);

    return;
  }

  //
  // Handle the problem in row tiles, to avoid excessive temporary
  // memory requests
  //

  FAISS_ASSERT(k <= centroids.getSize(0));
  FAISS_ASSERT(k <= 1024); // select limitation

  int tileSize =
    chooseTileSize<T>(
      tileSizeOverride,
      centroids.getSize(0),
      resources->getMemoryManagerCurrentDevice().getSizeAvailable());

  int maxQueriesPerIteration = std::min(tileSize, queries.getSize(0));

  // Temporary output memory space we'll use; two buffers so successive
  // tiles on alternate streams don't clobber each other
  DeviceTensor<T, 2, true> distanceBuf1(
    mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
  DeviceTensor<T, 2, true> distanceBuf2(
    mem, {maxQueriesPerIteration, centroids.getSize(0)}, defaultStream);
  DeviceTensor<T, 2, true>* distanceBufs[2] =
    {&distanceBuf1, &distanceBuf2};

  // Alternate streams must see all prior work on the default stream
  auto streams = resources->getAlternateStreamsCurrentDevice();
  streamWait(streams, {defaultStream});

  int curStream = 0;

  for (int i = 0; i < queries.getSize(0); i += maxQueriesPerIteration) {
    // The final tile may be smaller than maxQueriesPerIteration
    int numQueriesForIteration = std::min(maxQueriesPerIteration,
                                          queries.getSize(0) - i);

    auto distanceBufView =
      distanceBufs[curStream]->narrowOutermost(0, numQueriesForIteration);
    auto queryView =
      queries.narrowOutermost(i, numQueriesForIteration);
    auto outDistanceView =
      outDistances.narrowOutermost(i, numQueriesForIteration);
    auto outIndexView =
      outIndices.narrowOutermost(i, numQueriesForIteration);

    // (query id x dim) x (centroid id, dim)' = (query id, centroid id)
    runMatrixMult(distanceBufView, false,
                  queryView, false,
                  centroidsTransposed ? *centroidsTransposed : centroids,
                  centroidsTransposed ? false : true,
                  1.0f, 0.0f,
                  resources->getBlasHandleCurrentDevice(),
                  streams[curStream]);

    // top-k of dot products
    // (query id, top k centroids)
    runBlockSelect(distanceBufView,
                   outDistanceView,
                   outIndexView,
                   true, k, streams[curStream]);

    curStream = (curStream + 1) % 2;
  }

  // Have the desired ordering stream (default) wait on the work issued
  // to the alternate streams before returning
  streamWait({defaultStream}, streams);
}
293 
294 //
295 // Instantiations of the distance templates
296 //
297 
// Inner-product k-NN, float32 specialization; forwards directly to the
// templated implementation above.
void
runIPDistance(GpuResources* resources,
              Tensor<float, 2, true>& vectors,
              Tensor<float, 2, true>* vectorsTransposed,
              Tensor<float, 2, true>& queries,
              int k,
              Tensor<float, 2, true>& outDistances,
              Tensor<int, 2, true>& outIndices,
              int tileSizeOverride) {
  runIPDistance<float>(
    resources, vectors, vectorsTransposed, queries,
    k, outDistances, outIndices, tileSizeOverride);
}
316 
#ifdef FAISS_USE_FLOAT16
// Inner-product k-NN, float16 specialization; forwards directly to the
// templated implementation above. Only compiled with fp16 support.
void
runIPDistance(GpuResources* resources,
              Tensor<half, 2, true>& vectors,
              Tensor<half, 2, true>* vectorsTransposed,
              Tensor<half, 2, true>& queries,
              int k,
              Tensor<half, 2, true>& outDistances,
              Tensor<int, 2, true>& outIndices,
              int tileSizeOverride) {
  runIPDistance<half>(
    resources, vectors, vectorsTransposed, queries,
    k, outDistances, outIndices, tileSizeOverride);
}
#endif
337 
// L2 k-NN, float32 specialization; forwards directly to the templated
// implementation above.
void
runL2Distance(GpuResources* resources,
              Tensor<float, 2, true>& vectors,
              Tensor<float, 2, true>* vectorsTransposed,
              Tensor<float, 1, true>* vectorNorms,
              Tensor<float, 2, true>& queries,
              int k,
              Tensor<float, 2, true>& outDistances,
              Tensor<int, 2, true>& outIndices,
              bool ignoreOutDistances,
              int tileSizeOverride) {
  runL2Distance<float>(
    resources, vectors, vectorsTransposed, vectorNorms, queries,
    k, outDistances, outIndices, ignoreOutDistances, tileSizeOverride);
}
360 
#ifdef FAISS_USE_FLOAT16
// L2 k-NN, float16 specialization; forwards directly to the templated
// implementation above. Only compiled with fp16 support.
void
runL2Distance(GpuResources* resources,
              Tensor<half, 2, true>& vectors,
              Tensor<half, 2, true>* vectorsTransposed,
              Tensor<half, 1, true>* vectorNorms,
              Tensor<half, 2, true>& queries,
              int k,
              Tensor<half, 2, true>& outDistances,
              Tensor<int, 2, true>& outIndices,
              bool ignoreOutDistances,
              int tileSizeOverride) {
  runL2Distance<half>(
    resources, vectors, vectorsTransposed, vectorNorms, queries,
    k, outDistances, outIndices, ignoreOutDistances, tileSizeOverride);
}
#endif
385 
386 } } // namespace