Faiss
GpuIndex.cu
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 
9 #include "GpuIndex.h"
10 #include "../FaissAssert.h"
11 #include "GpuResources.h"
12 #include "utils/CopyUtils.cuh"
13 #include "utils/DeviceUtils.h"
14 #include "utils/StaticUtils.h"
15 #include <limits>
16 #include <memory>
17 
18 namespace faiss { namespace gpu {
19 
20 /// Default minimum CPU data size, in bytes, above which we use paged copies for search
21 constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024;
22 
23 /// Page size, in bytes, for copies from the CPU to the GPU when
24 /// pinned memory is not available
25 constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;
26 
27 // Default size, in bytes, above which we break add or search data into pages
28 constexpr size_t kAddPageSize = (size_t) 256 * 1024 * 1024;
29 
30 // Or, maximum number of vectors to consider per page of add or search
31 constexpr size_t kAddVecSize = (size_t) 512 * 1024;
32 
33 // Use a smaller search size, as precomputed code usage on IVFPQ
34 // requires substantial amounts of memory
35 // FIXME: parameterize based on algorithm need
36 constexpr size_t kSearchVecSize = (size_t) 32 * 1024;
37 
38 GpuIndex::GpuIndex(GpuResources* resources,
39  int dims,
40  faiss::MetricType metric,
41  GpuIndexConfig config) :
42  Index(dims, metric),
43  resources_(resources),
44  device_(config.device),
45  memorySpace_(config.memorySpace),
46  minPagedSize_(kMinPageSize) {
47  FAISS_THROW_IF_NOT_FMT(device_ < getNumDevices(),
48  "Invalid GPU device %d", device_);
49 
50  FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions");
51 
52 #ifdef FAISS_UNIFIED_MEM
53  FAISS_THROW_IF_NOT_FMT(
54  memorySpace_ == MemorySpace::Device ||
55  (memorySpace_ == MemorySpace::Unified &&
56  getFullUnifiedMemSupport(device_)),
57  "Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)",
58  config.device);
59 #else
60  FAISS_THROW_IF_NOT_MSG(memorySpace_ == MemorySpace::Device,
61  "Must compile with CUDA 8+ for Unified Memory support");
62 #endif
63 
64  FAISS_ASSERT(resources_);
65  resources_->initializeForDevice(device_);
66 }
67 
68 void
69 GpuIndex::setMinPagingSize(size_t size) {
70  minPagedSize_ = size;
71 }
72 
73 size_t
74 GpuIndex::getMinPagingSize() const {
75  return minPagedSize_;
76 }
77 
78 void
79 GpuIndex::add(Index::idx_t n, const float* x) {
80  // Pass to add_with_ids
81  add_with_ids(n, x, nullptr);
82 }
83 
84 void
85 GpuIndex::add_with_ids(Index::idx_t n,
86  const float* x,
87  const Index::idx_t* ids) {
88  FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
89 
90  // For now, only support <= max int results
91  FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits<int>::max(),
92  "GPU index only supports up to %d indices",
93  std::numeric_limits<int>::max());
94 
95  if (n == 0) {
96  // nothing to add
97  return;
98  }
99 
100  std::vector<Index::idx_t> generatedIds;
101 
102  // Generate IDs if we need them
103  if (!ids && addImplRequiresIDs_()) {
104  generatedIds = std::vector<Index::idx_t>(n);
105 
106  for (Index::idx_t i = 0; i < n; ++i) {
107  generatedIds[i] = this->ntotal + i;
108  }
109  }
110 
111  DeviceScope scope(device_);
112  addPaged_((int) n, x, ids ? ids : generatedIds.data());
113 }
114 
115 void
116 GpuIndex::addPaged_(int n,
117  const float* x,
118  const Index::idx_t* ids) {
119  if (n > 0) {
120  size_t totalSize = (size_t) n * this->d * sizeof(float);
121 
122  if (totalSize > kAddPageSize || n > kAddVecSize) {
123  // How many vectors fit into kAddPageSize?
124  size_t maxNumVecsForPageSize =
125  kAddPageSize / ((size_t) this->d * sizeof(float));
126 
127  // Always add at least 1 vector, if we have huge vectors
128  maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t) 1);
129 
130  size_t tileSize = std::min((size_t) n, maxNumVecsForPageSize);
131  tileSize = std::min(tileSize, kSearchVecSize);
132 
133  for (size_t i = 0; i < (size_t) n; i += tileSize) {
134  size_t curNum = std::min(tileSize, n - i);
135 
136  addPage_(curNum,
137  x + i * (size_t) this->d,
138  ids ? ids + i : nullptr);
139  }
140  } else {
141  addPage_(n, x, ids);
142  }
143  }
144 }
145 
146 void
147 GpuIndex::addPage_(int n,
148  const float* x,
149  const Index::idx_t* ids) {
150  // At this point, `x` can be resident on CPU or GPU, and `ids` may be resident
151  // on CPU, GPU or may be null.
152  //
153  // Before continuing, we guarantee that all data will be resident on the GPU.
154  auto stream = resources_->getDefaultStreamCurrentDevice();
155 
156  auto vecs = toDevice<float, 2>(resources_,
157  device_,
158  const_cast<float*>(x),
159  stream,
160  {n, this->d});
161 
162  if (ids) {
163  auto indices = toDevice<Index::idx_t, 1>(resources_,
164  device_,
165  const_cast<Index::idx_t*>(ids),
166  stream,
167  {n});
168 
169  addImpl_(n, vecs.data(), ids ? indices.data() : nullptr);
170  } else {
171  addImpl_(n, vecs.data(), nullptr);
172  }
173 }
174 
175 void
176 GpuIndex::search(Index::idx_t n,
177  const float* x,
178  Index::idx_t k,
179  float* distances,
180  Index::idx_t* labels) const {
181  FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
182 
183  // For now, only support <= max int results
184  FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits<int>::max(),
185  "GPU index only supports up to %d indices",
186  std::numeric_limits<int>::max());
187 
188  // Maximum k-selection supported is based on the CUDA SDK
189  FAISS_THROW_IF_NOT_FMT(k <= (Index::idx_t) getMaxKSelection(),
190  "GPU index only supports k <= %d (requested %d)",
191  getMaxKSelection(),
192  (int) k); // select limitation
193 
194  if (n == 0 || k == 0) {
195  // nothing to search
196  return;
197  }
198 
199  DeviceScope scope(device_);
200  auto stream = resources_->getDefaultStream(device_);
201 
202  // We guarantee that the searchImpl_ will be called with device-resident
203  // pointers.
204 
205  // The input vectors may be too large for the GPU, but we still
206  // assume that the output distances and labels are not.
207  // Go ahead and make space for output distances and labels on the
208  // GPU.
209  // If we reach a point where all inputs are too big, we can add
210  // another level of tiling.
211  auto outDistances =
212  toDevice<float, 2>(resources_, device_, distances, stream,
213  {(int) n, (int) k});
214 
215  auto outLabels =
216  toDevice<faiss::Index::idx_t, 2>(resources_, device_, labels, stream,
217  {(int) n, (int) k});
218 
219  bool usePaged = false;
220 
221  if (getDeviceForAddress(x) == -1) {
222  // It is possible that the user is querying for a vector set size
223  // `x` that won't fit on the GPU.
224  // In this case, we will have to handle paging of the data from CPU
225  // -> GPU.
226  // Currently, we don't handle the case where the output data won't
227  // fit on the GPU (e.g., n * k is too large for the GPU memory).
228  size_t dataSize = (size_t) n * this->d * sizeof(float);
229 
230  if (dataSize >= minPagedSize_) {
231  searchFromCpuPaged_(n, x, k,
232  outDistances.data(),
233  outLabels.data());
234  usePaged = true;
235  }
236  }
237 
238  if (!usePaged) {
239  searchNonPaged_(n, x, k,
240  outDistances.data(),
241  outLabels.data());
242  }
243 
244  // Copy back if necessary
245  fromDevice<float, 2>(outDistances, distances, stream);
246  fromDevice<faiss::Index::idx_t, 2>(outLabels, labels, stream);
247 }
248 
249 void
250 GpuIndex::searchNonPaged_(int n,
251  const float* x,
252  int k,
253  float* outDistancesData,
254  Index::idx_t* outIndicesData) const {
255  auto stream = resources_->getDefaultStream(device_);
256 
257  // Make sure arguments are on the device we desire; use temporary
258  // memory allocations to move them if necessary
259  auto vecs = toDevice<float, 2>(resources_,
260  device_,
261  const_cast<float*>(x),
262  stream,
263  {n, (int) this->d});
264 
265  searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData);
266 }
267 
268 void
269 GpuIndex::searchFromCpuPaged_(int n,
270  const float* x,
271  int k,
272  float* outDistancesData,
273  Index::idx_t* outIndicesData) const {
274  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
275  Tensor<Index::idx_t, 2, true> outIndices(outIndicesData, {n, k});
276 
277  // Is pinned memory available?
278  auto pinnedAlloc = resources_->getPinnedMemory();
279  int pageSizeInVecs =
280  (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));
281 
282  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
283  // Just page without overlapping copy with compute
284  int batchSize = utils::nextHighestPowerOf2(
285  (int) ((size_t) kNonPinnedPageSize /
286  (sizeof(float) * this->d)));
287 
288  for (int cur = 0; cur < n; cur += batchSize) {
289  int num = std::min(batchSize, n - cur);
290 
291  auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
292  auto outIndicesSlice = outIndices.narrowOutermost(cur, num);
293 
294  searchNonPaged_(num,
295  x + (size_t) cur * this->d,
296  k,
297  outDistancesSlice.data(),
298  outIndicesSlice.data());
299  }
300 
301  return;
302  }
303 
304  //
305  // Pinned memory is available, so we can overlap copy with compute.
306  // We use two pinned memory buffers, and triple-buffer the
307  // procedure:
308  //
309  // 1 CPU copy -> pinned
310  // 2 pinned copy -> GPU
311  // 3 GPU compute
312  //
313  // 1 2 3 1 2 3 ... (pinned buf A)
314  // 1 2 3 1 2 ... (pinned buf B)
315  // 1 2 3 1 ... (pinned buf A)
316  // time ->
317  //
318  auto defaultStream = resources_->getDefaultStream(device_);
319  auto copyStream = resources_->getAsyncCopyStream(device_);
320 
321  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
322  (size_t) std::numeric_limits<int>::max());
323 
324  float* bufPinnedA = (float*) pinnedAlloc.first;
325  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
326  float* bufPinned[2] = {bufPinnedA, bufPinnedB};
327 
328  // Reserve space on the GPU for the destination of the pinned buffer
329  // copy
330  DeviceTensor<float, 2, true> bufGpuA(
331  resources_->getMemoryManagerCurrentDevice(),
332  {(int) pageSizeInVecs, (int) this->d},
333  defaultStream);
334  DeviceTensor<float, 2, true> bufGpuB(
335  resources_->getMemoryManagerCurrentDevice(),
336  {(int) pageSizeInVecs, (int) this->d},
337  defaultStream);
338  DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};
339 
340  // Copy completion events for the pinned buffers
341  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];
342 
343  // Execute completion events for the GPU buffers
344  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];
345 
346  // All offsets are in terms of number of vectors; they remain within
347  // int bounds (as this function only handles at most max-int vectors)
348 
349  // Current start offset for buffer 1
350  int cur1 = 0;
351  int cur1BufIndex = 0;
352 
353  // Current start offset for buffer 2
354  int cur2 = -1;
355  int cur2BufIndex = 0;
356 
357  // Current start offset for buffer 3
358  int cur3 = -1;
359  int cur3BufIndex = 0;
360 
361  while (cur3 < n) {
362  // Start async pinned -> GPU copy first (buf 2)
363  if (cur2 != -1 && cur2 < n) {
364  // Copy pinned to GPU
365  int numToCopy = std::min(pageSizeInVecs, n - cur2);
366 
367  // Make sure any previous execution has completed before continuing
368  auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
369  if (eventPrev.get()) {
370  eventPrev->streamWaitOnEvent(copyStream);
371  }
372 
373  CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
374  bufPinned[cur2BufIndex],
375  (size_t) numToCopy * this->d * sizeof(float),
376  cudaMemcpyHostToDevice,
377  copyStream));
378 
379  // Mark a completion event in this stream
380  eventPinnedCopyDone[cur2BufIndex] =
381  std::move(std::unique_ptr<CudaEvent>(new CudaEvent(copyStream)));
382 
383  // We pick up from here
384  cur3 = cur2;
385  cur2 += numToCopy;
386  cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
387  }
388 
389  if (cur3 != -1 && cur3 < n) {
390  // Process on GPU
391  int numToProcess = std::min(pageSizeInVecs, n - cur3);
392 
393  // Make sure the previous copy has completed before continuing
394  auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
395  FAISS_ASSERT(eventPrev.get());
396 
397  eventPrev->streamWaitOnEvent(defaultStream);
398 
399  // Create tensor wrappers
400  // DeviceTensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
401  // {numToProcess, this->d});
402  auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
403  auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);
404 
405  searchImpl_(numToProcess,
406  bufGpus[cur3BufIndex]->data(),
407  k,
408  outDistancesSlice.data(),
409  outIndicesSlice.data());
410 
411  // Create completion event
412  eventGpuExecuteDone[cur3BufIndex] =
413  std::move(std::unique_ptr<CudaEvent>(new CudaEvent(defaultStream)));
414 
415  // We pick up from here
416  cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
417  cur3 += numToProcess;
418  }
419 
420  if (cur1 < n) {
421  // Copy CPU mem to CPU pinned
422  int numToCopy = std::min(pageSizeInVecs, n - cur1);
423 
424  // Make sure any previous copy has completed before continuing
425  auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
426  if (eventPrev.get()) {
427  eventPrev->cpuWaitOnEvent();
428  }
429 
430  memcpy(bufPinned[cur1BufIndex],
431  x + (size_t) cur1 * this->d,
432  (size_t) numToCopy * this->d * sizeof(float));
433 
434  // We pick up from here
435  cur2 = cur1;
436  cur1 += numToCopy;
437  cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
438  }
439  }
440 }
441 
442 } } // namespace
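
A minimal usage sketch of the entry points defined above, assuming the GpuIndexFlatL2 subclass, GpuIndexFlatConfig, and StandardGpuResources from the Faiss GPU API (header paths follow the installed <faiss/gpu/...> layout); the sizes and thresholds are illustrative only:

#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>

int main() {
  const int d = 128;                     // vector dimension
  faiss::gpu::StandardGpuResources res;  // manages streams and scratch memory

  faiss::gpu::GpuIndexFlatConfig config;
  config.device = 0;                     // validated in the GpuIndex constructor

  faiss::gpu::GpuIndexFlatL2 index(&res, d, config);

  // Lower the paging threshold so CPU-resident query sets larger than
  // 32 MiB take the paged search path (the default is kMinPageSize, 256 MiB).
  index.setMinPagingSize((size_t) 32 * 1024 * 1024);

  // add() forwards to add_with_ids(), which pages through addPaged_()
  // when the data exceeds kAddPageSize or kAddVecSize.
  std::vector<float> xb((size_t) 100000 * d, 0.5f);
  index.add(100000, xb.data());

  // search() copies (or pages) the queries to the GPU and copies the
  // distances and labels back to the host buffers.
  const int nq = 16, k = 10;
  std::vector<float> xq((size_t) nq * d, 0.5f);
  std::vector<float> distances((size_t) nq * k);
  std::vector<faiss::Index::idx_t> labels((size_t) nq * k);
  index.search(nq, xq.data(), k, distances.data(), labels.data());

  return 0;
}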