Faiss
GpuIndexFlat.cu
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 
10 #include "GpuIndexFlat.h"
11 #include "../IndexFlat.h"
12 #include "GpuResources.h"
13 #include "impl/FlatIndex.cuh"
14 #include "utils/ConversionOperators.cuh"
15 #include "utils/CopyUtils.cuh"
16 #include "utils/DeviceUtils.h"
17 #include "utils/Float16.cuh"
18 #include "utils/StaticUtils.h"
19 
20 #include <thrust/execution_policy.h>
21 #include <thrust/transform.h>
22 #include <limits>
23 
24 namespace faiss { namespace gpu {
25 
26 /// Default minimum CPU data size (in bytes) for which we use paged copies
27 constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024;
28 
29 /// Size above which we page copies from the CPU to GPU (non-pinned
30 /// memory usage)
31 constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;
32 
33 GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
34  const faiss::IndexFlat* index,
35  GpuIndexFlatConfig config) :
36  GpuIndex(resources, index->d, index->metric_type, config),
37  minPagedSize_(kMinPageSize),
38  config_(std::move(config)),
39  data_(nullptr) {
40  verifySettings_();
41 
42  // Flat index doesn't need training
43  this->is_trained = true;
44 
45  copyFrom(index);
46 }
47 
48 GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
49  int dims,
50  faiss::MetricType metric,
51  GpuIndexFlatConfig config) :
52  GpuIndex(resources, dims, metric, config),
53  minPagedSize_(kMinPageSize),
54  config_(std::move(config)),
55  data_(nullptr) {
56  verifySettings_();
57 
58  // Flat index doesn't need training
59  this->is_trained = true;
60 
61  // Construct index
62  DeviceScope scope(device_);
63  data_ = new FlatIndex(resources,
64  dims,
65  metric == faiss::METRIC_L2,
66  config_.useFloat16,
67  config_.useFloat16Accumulator,
68  config_.storeTransposed,
69  memorySpace_);
70 }
71 
72 GpuIndexFlat::~GpuIndexFlat() {
73  delete data_;
74 }
75 
76 void
77 GpuIndexFlat::setMinPagingSize(size_t size) {
78  minPagedSize_ = size;
79 }
80 
81 size_t
82 GpuIndexFlat::getMinPagingSize() const {
83  return minPagedSize_;
84 }
85 
86 void
87 GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
88  DeviceScope scope(device_);
89 
90  this->d = index->d;
91  this->metric_type = index->metric_type;
92 
93  // GPU code has 32 bit indices
94  FAISS_THROW_IF_NOT_FMT(index->ntotal <=
95  (faiss::Index::idx_t) std::numeric_limits<int>::max(),
96  "GPU index only supports up to %zu indices; "
97  "attempting to copy CPU index with %zu vectors",
98  (size_t) std::numeric_limits<int>::max(),
99  (size_t) index->ntotal);
100  this->ntotal = index->ntotal;
101 
102  delete data_;
103  data_ = new FlatIndex(resources_,
104  this->d,
105  index->metric_type == faiss::METRIC_L2,
106  config_.useFloat16,
107  config_.useFloat16Accumulator,
108  config_.storeTransposed,
109  memorySpace_);
110 
111  // The index could be empty
112  if (index->ntotal > 0) {
113  data_->add(index->xb.data(),
114  index->ntotal,
115  resources_->getDefaultStream(device_));
116  }
117 }
118 
119 void
120 GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
121  DeviceScope scope(device_);
122 
123  index->d = this->d;
124  index->ntotal = this->ntotal;
125  index->metric_type = this->metric_type;
126 
127  FAISS_ASSERT(data_);
128  FAISS_ASSERT(data_->getSize() == this->ntotal);
129  index->xb.resize(this->ntotal * this->d);
130 
131  auto stream = resources_->getDefaultStream(device_);
132 
133  if (this->ntotal > 0) {
134  if (config_.useFloat16) {
135  auto vecFloat32 = data_->getVectorsFloat32Copy(stream);
136  fromDevice(vecFloat32, index->xb.data(), stream);
137  } else {
138  fromDevice(data_->getVectorsFloat32Ref(), index->xb.data(), stream);
139  }
140  }
141 }
142 
143 size_t
144 GpuIndexFlat::getNumVecs() const {
145  return this->ntotal;
146 }
147 
148 void
149 GpuIndexFlat::reset() {
150  DeviceScope scope(device_);
151 
152  // Free the underlying memory
153  data_->reset();
154  this->ntotal = 0;
155 }
156 
157 void
158 GpuIndexFlat::train(Index::idx_t n, const float* x) {
159  // nothing to do
160 }
161 
162 void
163 GpuIndexFlat::add(Index::idx_t n, const float* x) {
164  DeviceScope scope(device_);
165 
166  // To avoid multiple re-allocations, ensure we have enough storage
167  // available
168  data_->reserve(this->ntotal + n, resources_->getDefaultStream(device_));
169 
170  // If we're not operating in float16 mode, we don't need the input
171  // data to be resident on our device; we can add directly.
172  if (!config_.useFloat16) {
173  addImpl_(n, x, nullptr);
174  } else {
175  // Otherwise, perform the paging
176  GpuIndex::add(n, x);
177  }
178 }
179 
180 void
181 GpuIndexFlat::addImpl_(Index::idx_t n,
182  const float* x,
183  const Index::idx_t* ids) {
184  // Device is already set in GpuIndex::addInternal_
185 
186  // We do not support add_with_ids
187  FAISS_THROW_IF_NOT_MSG(!ids, "add_with_ids not supported");
188  FAISS_THROW_IF_NOT(n > 0);
189 
190  // Due to GPU indexing in int32, we can't store more than this
191  // number of vectors on a GPU
192  FAISS_THROW_IF_NOT_FMT(this->ntotal + n <=
193  (faiss::Index::idx_t) std::numeric_limits<int>::max(),
194  "GPU index only supports up to %zu indices",
195  (size_t) std::numeric_limits<int>::max());
196 
197  data_->add(x, n, resources_->getDefaultStream(device_));
198  this->ntotal += n;
199 }
200 
201 void
202 GpuIndexFlat::search(faiss::Index::idx_t n,
203  const float* x,
204  faiss::Index::idx_t k,
205  float* distances,
206  faiss::Index::idx_t* labels) const {
207  if (n == 0) {
208  return;
209  }
210 
211  // For now, only support <= max int results
212  FAISS_THROW_IF_NOT_FMT(n <=
213  (faiss::Index::idx_t) std::numeric_limits<int>::max(),
214  "GPU index only supports up to %zu indices",
215  (size_t) std::numeric_limits<int>::max());
216  FAISS_THROW_IF_NOT_FMT(k <= 1024,
217  "GPU only supports k <= 1024 (requested %d)",
218  (int) k); // select limitation
219 
220  DeviceScope scope(device_);
221  auto stream = resources_->getDefaultStream(device_);
222 
223  // The input vectors may be too large for the GPU, but we still
224  // assume that the output distances and labels are not.
225  // Go ahead and make space for output distances and labels on the
226  // GPU.
227  // If we reach a point where all inputs are too big, we can add
228  // another level of tiling.
229  auto outDistances = toDevice<float, 2>(resources_,
230  device_,
231  distances,
232  stream,
233  {(int) n, (int) k});
234 
235  // FlatIndex only supports an interface returning int indices
236  DeviceTensor<int, 2, true> outIntIndices(
237  resources_->getMemoryManagerCurrentDevice(),
238  {(int) n, (int) k}, stream);
239 
240  bool usePaged = false;
241 
242  if (getDeviceForAddress(x) == -1) {
243  // It is possible that the user is querying for a vector set size
244  // `x` that won't fit on the GPU.
245  // In this case, we will have to handle paging of the data from CPU
246  // -> GPU.
247  // Currently, we don't handle the case where the output data won't
248  // fit on the GPU (e.g., n * k is too large for the GPU memory).
249  size_t dataSize = (size_t) n * this->d * sizeof(float);
250 
251  if (dataSize >= minPagedSize_) {
252  searchFromCpuPaged_(n, x, k,
253  outDistances.data(),
254  outIntIndices.data());
255  usePaged = true;
256  }
257  }
258 
259  if (!usePaged) {
260  searchNonPaged_(n, x, k,
261  outDistances.data(),
262  outIntIndices.data());
263  }
264 
265  // Convert and copy int indices out
266  auto outIndices = toDevice<faiss::Index::idx_t, 2>(resources_,
267  device_,
268  labels,
269  stream,
270  {(int) n, (int) k});
271 
272  // Convert int to idx_t
273  thrust::transform(thrust::cuda::par.on(stream),
274  outIntIndices.data(),
275  outIntIndices.end(),
276  outIndices.data(),
277  IntToIdxType());
278 
279  // Copy back if necessary
280  fromDevice<float, 2>(outDistances, distances, stream);
281  fromDevice<faiss::Index::idx_t, 2>(outIndices, labels, stream);
282 }
283 
284 void
285 GpuIndexFlat::searchImpl_(faiss::Index::idx_t n,
286  const float* x,
287  faiss::Index::idx_t k,
288  float* distances,
289  faiss::Index::idx_t* labels) const {
290  FAISS_ASSERT_MSG(false, "Should not be called");
291 }
292 
293 void
294 GpuIndexFlat::searchNonPaged_(int n,
295  const float* x,
296  int k,
297  float* outDistancesData,
298  int* outIndicesData) const {
299  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
300  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});
301 
302  auto stream = resources_->getDefaultStream(device_);
303 
304  // Make sure arguments are on the device we desire; use temporary
305  // memory allocations to move them if necessary
306  auto vecs = toDevice<float, 2>(resources_,
307  device_,
308  const_cast<float*>(x),
309  stream,
310  {n, (int) this->d});
311 
312  data_->query(vecs, k, outDistances, outIndices, true);
313 }
314 
315 void
316 GpuIndexFlat::searchFromCpuPaged_(int n,
317  const float* x,
318  int k,
319  float* outDistancesData,
320  int* outIndicesData) const {
321  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
322  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});
323 
324  // Is pinned memory available?
325  auto pinnedAlloc = resources_->getPinnedMemory();
326  int pageSizeInVecs =
327  (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));
328 
329  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
330  // Just page without overlapping copy with compute
331  int batchSize = utils::nextHighestPowerOf2(
332  (int) ((size_t) kNonPinnedPageSize /
333  (sizeof(float) * this->d)));
334 
335  for (int cur = 0; cur < n; cur += batchSize) {
336  int num = std::min(batchSize, n - cur);
337 
338  auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
339  auto outIndicesSlice = outIndices.narrowOutermost(cur, num);
340 
341  searchNonPaged_(num,
342  x + (size_t) cur * this->d,
343  k,
344  outDistancesSlice.data(),
345  outIndicesSlice.data());
346  }
347 
348  return;
349  }
350 
351  //
352  // Pinned memory is available, so we can overlap copy with compute.
353  // We use two pinned memory buffers, and triple-buffer the
354  // procedure:
355  //
356  // 1 CPU copy -> pinned
357  // 2 pinned copy -> GPU
358  // 3 GPU compute
359  //
360  // 1 2 3 1 2 3 ... (pinned buf A)
361  // 1 2 3 1 2 ... (pinned buf B)
362  // 1 2 3 1 ... (pinned buf A)
363  // time ->
364  //
365  auto defaultStream = resources_->getDefaultStream(device_);
366  auto copyStream = resources_->getAsyncCopyStream(device_);
367 
368  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
369  (size_t) std::numeric_limits<int>::max());
370 
371  float* bufPinnedA = (float*) pinnedAlloc.first;
372  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
373  float* bufPinned[2] = {bufPinnedA, bufPinnedB};
374 
375  // Reserve space on the GPU for the destination of the pinned buffer
376  // copy
377  DeviceTensor<float, 2, true> bufGpuA(
378  resources_->getMemoryManagerCurrentDevice(),
379  {(int) pageSizeInVecs, (int) this->d},
380  defaultStream);
381  DeviceTensor<float, 2, true> bufGpuB(
382  resources_->getMemoryManagerCurrentDevice(),
383  {(int) pageSizeInVecs, (int) this->d},
384  defaultStream);
385  DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};
386 
387  // Copy completion events for the pinned buffers
388  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];
389 
390  // Execute completion events for the GPU buffers
391  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];
392 
393  // All offsets are in terms of number of vectors; they remain within
394  // int bounds (as this function handles at most INT_MAX vectors at a time)
395 
396  // Current start offset for buffer 1
397  int cur1 = 0;
398  int cur1BufIndex = 0;
399 
400  // Current start offset for buffer 2
401  int cur2 = -1;
402  int cur2BufIndex = 0;
403 
404  // Current start offset for buffer 3
405  int cur3 = -1;
406  int cur3BufIndex = 0;
407 
408  while (cur3 < n) {
409  // Start async pinned -> GPU copy first (buf 2)
410  if (cur2 != -1 && cur2 < n) {
411  // Copy pinned to GPU
412  int numToCopy = std::min(pageSizeInVecs, n - cur2);
413 
414  // Make sure any previous execution has completed before continuing
415  auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
416  if (eventPrev.get()) {
417  eventPrev->streamWaitOnEvent(copyStream);
418  }
419 
420  CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
421  bufPinned[cur2BufIndex],
422  (size_t) numToCopy * this->d * sizeof(float),
423  cudaMemcpyHostToDevice,
424  copyStream));
425 
426  // Mark a completion event in this stream
427  eventPinnedCopyDone[cur2BufIndex] =
428  std::move(std::unique_ptr<CudaEvent>(new CudaEvent(copyStream)));
429 
430  // We pick up from here
431  cur3 = cur2;
432  cur2 += numToCopy;
433  cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
434  }
435 
436  if (cur3 != -1 && cur3 < n) {
437  // Process on GPU
438  int numToProcess = std::min(pageSizeInVecs, n - cur3);
439 
440  // Make sure the previous copy has completed before continuing
441  auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
442  FAISS_ASSERT(eventPrev.get());
443 
444  eventPrev->streamWaitOnEvent(defaultStream);
445 
446  // Create tensor wrappers
447  DeviceTensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
448  {numToProcess, this->d});
449  auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
450  auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);
451 
452  data_->query(input, k,
453  outDistancesSlice,
454  outIndicesSlice, true);
455 
456  // Create completion event
457  eventGpuExecuteDone[cur3BufIndex] =
458  std::move(std::unique_ptr<CudaEvent>(new CudaEvent(defaultStream)));
459 
460  // We pick up from here
461  cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
462  cur3 += numToProcess;
463  }
464 
465  if (cur1 < n) {
466  // Copy CPU mem to CPU pinned
467  int numToCopy = std::min(pageSizeInVecs, n - cur1);
468 
469  // Make sure any previous copy has completed before continuing
470  auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
471  if (eventPrev.get()) {
472  eventPrev->cpuWaitOnEvent();
473  }
474 
475  memcpy(bufPinned[cur1BufIndex],
476  x + (size_t) cur1 * this->d,
477  (size_t) numToCopy * this->d * sizeof(float));
478 
479  // We pick up from here
480  cur2 = cur1;
481  cur1 += numToCopy;
482  cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
483  }
484  }
485 }
486 
487 void
488 GpuIndexFlat::reconstruct(faiss::Index::idx_t key,
489  float* out) const {
490  DeviceScope scope(device_);
491 
492  FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds");
493  auto stream = resources_->getDefaultStream(device_);
494 
495  if (config_.useFloat16) {
496  auto vec = data_->getVectorsFloat32Copy(key, 1, stream);
497  fromDevice(vec.data(), out, this->d, stream);
498  } else {
499  auto vec = data_->getVectorsFloat32Ref()[key];
500  fromDevice(vec.data(), out, this->d, stream);
501  }
502 }
503 
504 void
505 GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0,
506  faiss::Index::idx_t num,
507  float* out) const {
508  DeviceScope scope(device_);
509 
510  FAISS_THROW_IF_NOT_MSG(i0 < this->ntotal, "index out of bounds");
511  FAISS_THROW_IF_NOT_MSG(i0 + num - 1 < this->ntotal, "num out of bounds");
512  auto stream = resources_->getDefaultStream(device_);
513 
514  if (config_.useFloat16) {
515  auto vec = data_->getVectorsFloat32Copy(i0, num, stream);
516  fromDevice(vec.data(), out, num * this->d, stream);
517  } else {
518  auto vec = data_->getVectorsFloat32Ref()[i0];
519  fromDevice(vec.data(), out, this->d * num, stream);
520  }
521 }
522 
523 void
524 GpuIndexFlat::verifySettings_() const {
525  // If we want Hgemm, ensure that it is supported on this device
526  if (config_.useFloat16Accumulator) {
527 #ifdef FAISS_USE_FLOAT16
528  FAISS_THROW_IF_NOT_MSG(config_.useFloat16,
529  "useFloat16Accumulator can only be enabled "
530  "with useFloat16");
531 
532  FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device),
533  "Device %d does not support Hgemm "
534  "(useFloat16Accumulator)",
535  config_.device);
536 #else
537  FAISS_THROW_IF_NOT_MSG(false, "not compiled with float16 support");
538 #endif
539  }
540 }
541 
542 //
543 // GpuIndexFlatL2
544 //
545 
546 GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
547  faiss::IndexFlatL2* index,
548  GpuIndexFlatConfig config) :
549  GpuIndexFlat(resources, index, config) {
550 }
551 
552 GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
553  int dims,
554  GpuIndexFlatConfig config) :
555  GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {
556 }
557 
558 void
559 GpuIndexFlatL2::copyFrom(faiss::IndexFlatL2* index) {
560  GpuIndexFlat::copyFrom(index);
561 }
562 
563 void
564 GpuIndexFlatL2::copyTo(faiss::IndexFlatL2* index) {
565  GpuIndexFlat::copyTo(index);
566 }
567 
568 //
569 // GpuIndexFlatIP
570 //
571 
572 GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
573  faiss::IndexFlatIP* index,
574  GpuIndexFlatConfig config) :
575  GpuIndexFlat(resources, index, config) {
576 }
577 
578 GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
579  int dims,
580  GpuIndexFlatConfig config) :
581  GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {
582 }
583 
584 void
585 GpuIndexFlatIP::copyFrom(faiss::IndexFlatIP* index) {
586  GpuIndexFlat::copyFrom(index);
587 }
588 
589 void
590 GpuIndexFlatIP::copyTo(faiss::IndexFlatIP* index) {
591  GpuIndexFlat::copyTo(index);
592 }
593 
594 } } // namespace
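
A minimal usage sketch of the index implemented above, for reference. It is not part of GpuIndexFlat.cu; the include paths and the StandardGpuResources helper are assumed from the standard Faiss GPU source layout, and the random data is purely illustrative.

// Usage sketch (assumed includes; StandardGpuResources is the stock
// GpuResources implementation provided with the Faiss GPU code).
#include "StandardGpuResources.h"
#include "GpuIndexFlat.h"

#include <random>
#include <vector>

int main() {
  const int d = 64;       // vector dimension
  const int nb = 100000;  // database vectors
  const int nq = 16;      // query vectors
  const int k = 4;        // neighbors per query (GPU supports k <= 1024)

  // Random data for illustration only
  std::vector<float> xb((size_t) nb * d);
  std::vector<float> xq((size_t) nq * d);
  std::mt19937 rng(123);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  for (auto& v : xb) { v = dist(rng); }
  for (auto& v : xq) { v = dist(rng); }

  faiss::gpu::StandardGpuResources resources; // streams, cuBLAS handles, scratch memory

  faiss::gpu::GpuIndexFlatConfig config;
  config.device = 0;          // GPU on which the index is resident
  config.useFloat16 = false;  // store vectors as float32

  // Flat L2 index on the GPU; no training is required (is_trained is already true)
  faiss::gpu::GpuIndexFlatL2 index(&resources, d, config);
  index.add(nb, xb.data());

  std::vector<float> distances((size_t) nq * k);
  std::vector<faiss::Index::idx_t> labels((size_t) nq * k);
  index.search(nq, xq.data(), k, distances.data(), labels.data());

  return 0;
}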