Faiss
GpuIndexFlat.cu
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
#include "GpuIndexFlat.h"
#include "../IndexFlat.h"
#include "GpuResources.h"
#include "impl/FlatIndex.cuh"
#include "utils/CopyUtils.cuh"
#include "utils/DeviceUtils.h"
#include "utils/Float16.cuh"
#include "utils/StaticUtils.h"

#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include <limits>

namespace faiss { namespace gpu {
/// Default CPU search size for which we use paged copies
constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024;

/// Size above which we page copies from the CPU to GPU (non-paged
/// memory usage)
constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;
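
// (For scale: at the 256 MiB threshold, a hypothetical dataset of
// d = 128 float32 vectors pages at 256 MiB / (128 * 4 B) = 524,288
// vectors per copy.)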
32 
GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           const faiss::IndexFlat* index,
                           GpuIndexFlatConfig config) :
    GpuIndex(resources, index->d, index->metric_type, config),
    minPagedSize_(kMinPageSize),
    config_(config),
    data_(nullptr) {
  verifySettings_();

  // Flat index doesn't need training
  this->is_trained = true;

  copyFrom(index);
}
47 
GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           int dims,
                           faiss::MetricType metric,
                           GpuIndexFlatConfig config) :
    GpuIndex(resources, dims, metric, config),
    minPagedSize_(kMinPageSize),
    config_(config),
    data_(nullptr) {
  verifySettings_();

  // Flat index doesn't need training
  this->is_trained = true;

  // Construct index
  DeviceScope scope(device_);
  data_ = new FlatIndex(resources,
                        dims,
                        metric == faiss::METRIC_L2,
                        config_.useFloat16,
                        config_.useFloat16Accumulator,
                        config_.storeTransposed,
                        memorySpace_);
}
71 
GpuIndexFlat::~GpuIndexFlat() {
  delete data_;
}

void
GpuIndexFlat::setMinPagingSize(size_t size) {
  minPagedSize_ = size;
}

size_t
GpuIndexFlat::getMinPagingSize() const {
  return minPagedSize_;
}
85 
void
GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
  DeviceScope scope(device_);

  this->d = index->d;
  this->metric_type = index->metric_type;

  // GPU code has 32 bit indices
  FAISS_THROW_IF_NOT_FMT(index->ntotal <=
                         (faiss::Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %zu indices; "
                         "attempting to copy CPU index with %zu vectors",
                         (size_t) std::numeric_limits<int>::max(),
                         (size_t) index->ntotal);
  this->ntotal = index->ntotal;

  delete data_;
  data_ = new FlatIndex(resources_,
                        this->d,
                        index->metric_type == faiss::METRIC_L2,
                        config_.useFloat16,
                        config_.useFloat16Accumulator,
                        config_.storeTransposed,
                        memorySpace_);

  // The index could be empty
  if (index->ntotal > 0) {
    data_->add(index->xb.data(),
               index->ntotal,
               resources_->getDefaultStream(device_));
  }
}
118 
void
GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
  DeviceScope scope(device_);

  index->d = this->d;
  index->ntotal = this->ntotal;
  index->metric_type = this->metric_type;

  FAISS_ASSERT(data_->getSize() == this->ntotal);
  index->xb.resize(this->ntotal * this->d);

  auto stream = resources_->getDefaultStream(device_);

  if (this->ntotal > 0) {
    if (config_.useFloat16) {
      auto vecFloat32 = data_->getVectorsFloat32Copy(stream);
      fromDevice(vecFloat32, index->xb.data(), stream);
    } else {
      fromDevice(data_->getVectorsFloat32Ref(), index->xb.data(), stream);
    }
  }
}
141 
size_t
GpuIndexFlat::getNumVecs() const {
  return this->ntotal;
}

void
GpuIndexFlat::reset() {
  DeviceScope scope(device_);

  // Free the underlying memory
  data_->reset();
  this->ntotal = 0;
}
155 
void
GpuIndexFlat::train(Index::idx_t n, const float* x) {
  // nothing to do
}

void
GpuIndexFlat::add(Index::idx_t n, const float* x) {
  DeviceScope scope(device_);

  // To avoid multiple re-allocations, ensure we have enough storage
  // available
  data_->reserve(n, resources_->getDefaultStream(device_));

  // If we're not operating in float16 mode, we don't need the input
  // data to be resident on our device; we can add directly.
  if (!config_.useFloat16) {
    addImpl_(n, x, nullptr);
  } else {
    // Otherwise, perform the paging
    GpuIndex::add(n, x);
  }
}
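
// (Conversely, in float16 mode the input must be device-resident so it can
// be converted to half precision, which is why the float16 path above falls
// back to GpuIndex::add and its paged copies.)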
178 
void
GpuIndexFlat::addImpl_(Index::idx_t n,
                       const float* x,
                       const Index::idx_t* ids) {
  // Device is already set in GpuIndex::addInternal_

  // We do not support add_with_ids
  FAISS_THROW_IF_NOT_MSG(!ids, "add_with_ids not supported");
  FAISS_THROW_IF_NOT(n > 0);

  // Due to GPU indexing in int32, we can't store more than this
  // number of vectors on a GPU
  FAISS_THROW_IF_NOT_FMT(this->ntotal + n <=
                         (faiss::Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %zu indices",
                         (size_t) std::numeric_limits<int>::max());

  data_->add(x, n, resources_->getDefaultStream(device_));
  this->ntotal += n;
}
199 
struct IntToLong {
  __device__ long operator()(int v) const { return (long) v; }
};
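
// (Used with thrust::transform in search() below to widen FlatIndex's int32
// result labels to the 64-bit faiss::Index::idx_t expected by the caller.)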
203 
void
GpuIndexFlat::search(faiss::Index::idx_t n,
                     const float* x,
                     faiss::Index::idx_t k,
                     float* distances,
                     faiss::Index::idx_t* labels) const {
  if (n == 0) {
    return;
  }

  // For now, only support <= max int results
  FAISS_THROW_IF_NOT_FMT(n <=
                         (faiss::Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %zu indices",
                         (size_t) std::numeric_limits<int>::max());
  FAISS_THROW_IF_NOT_FMT(k <= 1024,
                         "GPU only supports k <= 1024 (requested %d)",
                         (int) k); // select limitation

  DeviceScope scope(device_);
  auto stream = resources_->getDefaultStream(device_);

  // The input vectors may be too large for the GPU, but we still
  // assume that the output distances and labels are not.
  // Go ahead and make space for output distances and labels on the
  // GPU.
  // If we reach a point where all inputs are too big, we can add
  // another level of tiling.
  auto outDistances = toDevice<float, 2>(resources_,
                                         device_,
                                         distances,
                                         stream,
                                         {(int) n, (int) k});

  // FlatIndex only supports an interface returning int indices
  DeviceTensor<int, 2, true> outIntIndices(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) n, (int) k}, stream);

  bool usePaged = false;

  if (getDeviceForAddress(x) == -1) {
    // It is possible that the user is querying with a vector set `x`
    // that won't fit on the GPU.
    // In this case, we will have to handle paging of the data from CPU
    // -> GPU.
    // Currently, we don't handle the case where the output data won't
    // fit on the GPU (e.g., n * k is too large for the GPU memory).
    size_t dataSize = (size_t) n * this->d * sizeof(float);

    if (dataSize >= minPagedSize_) {
      searchFromCpuPaged_(n, x, k,
                          outDistances.data(),
                          outIntIndices.data());
      usePaged = true;
    }
  }

  if (!usePaged) {
    searchNonPaged_(n, x, k,
                    outDistances.data(),
                    outIntIndices.data());
  }

  // Convert and copy int indices out
  auto outIndices = toDevice<faiss::Index::idx_t, 2>(resources_,
                                                     device_,
                                                     labels,
                                                     stream,
                                                     {(int) n, (int) k});

  // Convert int to long
  thrust::transform(thrust::cuda::par.on(stream),
                    outIntIndices.data(),
                    outIntIndices.end(),
                    outIndices.data(),
                    IntToLong());

  // Copy back if necessary
  fromDevice<float, 2>(outDistances, distances, stream);
  fromDevice<faiss::Index::idx_t, 2>(outIndices, labels, stream);
}
286 
void
GpuIndexFlat::searchImpl_(faiss::Index::idx_t n,
                          const float* x,
                          faiss::Index::idx_t k,
                          float* distances,
                          faiss::Index::idx_t* labels) const {
  FAISS_ASSERT_MSG(false, "Should not be called");
}
295 
void
GpuIndexFlat::searchNonPaged_(int n,
                              const float* x,
                              int k,
                              float* outDistancesData,
                              int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  auto stream = resources_->getDefaultStream(device_);

  // Make sure arguments are on the device we desire; use temporary
  // memory allocations to move it if necessary
  auto vecs = toDevice<float, 2>(resources_,
                                 device_,
                                 const_cast<float*>(x),
                                 stream,
                                 {n, (int) this->d});

  data_->query(vecs, k, outDistances, outIndices, true);
}
317 
void
GpuIndexFlat::searchFromCpuPaged_(int n,
                                  const float* x,
                                  int k,
                                  float* outDistancesData,
                                  int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  // Is pinned memory available?
  auto pinnedAlloc = resources_->getPinnedMemory();
  int pageSizeInVecs =
    (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));

  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
    // Just page without overlapping copy with compute
    int batchSize = utils::nextHighestPowerOf2(
      (int) ((size_t) kNonPinnedPageSize /
             (sizeof(float) * this->d)));

    for (int cur = 0; cur < n; cur += batchSize) {
      int num = std::min(batchSize, n - cur);

      auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
      auto outIndicesSlice = outIndices.narrowOutermost(cur, num);

      searchNonPaged_(num,
                      x + (size_t) cur * this->d,
                      k,
                      outDistancesSlice.data(),
                      outIndicesSlice.data());
    }

    return;
  }
353 
  //
  // Pinned memory is available, so we can overlap copy with compute.
  // We use two pinned memory buffers, and triple-buffer the
  // procedure:
  //
  // 1 CPU copy -> pinned
  // 2 pinned copy -> GPU
  // 3 GPU compute
  //
  // 1 2 3 1 2 3 ...   (pinned buf A)
  //   1 2 3 1 2 ...   (pinned buf B)
  //     1 2 3 1 ...   (pinned buf A)
  // time ->
  //
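  // (Reading the diagram: in steady state, the host memcpy into one pinned
  // buffer overlaps with the async host-to-device copy out of the other
  // buffer and with the GPU query on the previously copied page.)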
  auto defaultStream = resources_->getDefaultStream(device_);
  auto copyStream = resources_->getAsyncCopyStream(device_);

  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
               (size_t) std::numeric_limits<int>::max());

  float* bufPinnedA = (float*) pinnedAlloc.first;
  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
  float* bufPinned[2] = {bufPinnedA, bufPinnedB};

  // Reserve space on the GPU for the destination of the pinned buffer
  // copy
  DeviceTensor<float, 2, true> bufGpuA(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2, true> bufGpuB(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};
389 
390  // Copy completion events for the pinned buffers
391  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];
392 
393  // Execute completion events for the GPU buffers
394  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];
395 
396  // All offsets are in terms of number of vectors; they remain within
397  // int bounds (as this function only handles max in vectors)
398 
399  // Current start offset for buffer 1
400  int cur1 = 0;
401  int cur1BufIndex = 0;
402 
403  // Current start offset for buffer 2
404  int cur2 = -1;
405  int cur2BufIndex = 0;
406 
407  // Current start offset for buffer 3
408  int cur3 = -1;
409  int cur3BufIndex = 0;
410 
  while (cur3 < n) {
    // Start async pinned -> GPU copy first (buf 2)
    if (cur2 != -1 && cur2 < n) {
      // Copy pinned to GPU
      int numToCopy = std::min(pageSizeInVecs, n - cur2);

      // Make sure any previous execution has completed before continuing
      auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
      if (eventPrev.get()) {
        eventPrev->streamWaitOnEvent(copyStream);
      }

      CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
                                  bufPinned[cur2BufIndex],
                                  (size_t) numToCopy * this->d * sizeof(float),
                                  cudaMemcpyHostToDevice,
                                  copyStream));

      // Mark a completion event in this stream
      eventPinnedCopyDone[cur2BufIndex].reset(new CudaEvent(copyStream));

      // We pick up from here
      cur3 = cur2;
      cur2 += numToCopy;
      cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
    }

    if (cur3 != -1 && cur3 < n) {
      // Process on GPU
      int numToProcess = std::min(pageSizeInVecs, n - cur3);

      // Make sure the previous copy has completed before continuing
      auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
      FAISS_ASSERT(eventPrev.get());

      eventPrev->streamWaitOnEvent(defaultStream);

      // Create tensor wrappers
      DeviceTensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
                                         {numToProcess, this->d});
      auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
      auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);

      data_->query(input, k,
                   outDistancesSlice,
                   outIndicesSlice, true);

      // Create completion event
      eventGpuExecuteDone[cur3BufIndex].reset(new CudaEvent(defaultStream));

      // We pick up from here
      cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
      cur3 += numToProcess;
    }

    if (cur1 < n) {
      // Copy CPU mem to CPU pinned
      int numToCopy = std::min(pageSizeInVecs, n - cur1);

      // Make sure any previous copy has completed before continuing
      auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
      if (eventPrev.get()) {
        eventPrev->cpuWaitOnEvent();
      }

      memcpy(bufPinned[cur1BufIndex],
             x + (size_t) cur1 * this->d,
             (size_t) numToCopy * this->d * sizeof(float));

      // We pick up from here
      cur2 = cur1;
      cur1 += numToCopy;
      cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
    }
  }
}
489 
void
GpuIndexFlat::reconstruct(faiss::Index::idx_t key, float* out) const {
  DeviceScope scope(device_);

  FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds");
  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    auto vec = data_->getVectorsFloat32Copy(key, 1, stream);
    fromDevice(vec.data(), out, this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref()[key];
    fromDevice(vec.data(), out, this->d, stream);
  }
}
506 
void
GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0,
                            faiss::Index::idx_t num,
                            float* out) const {
  DeviceScope scope(device_);

  FAISS_THROW_IF_NOT_MSG(i0 < this->ntotal, "index out of bounds");
  FAISS_THROW_IF_NOT_MSG(i0 + num - 1 < this->ntotal, "num out of bounds");
  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    auto vec = data_->getVectorsFloat32Copy(i0, num, stream);
    fromDevice(vec.data(), out, num * this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref()[i0];
    fromDevice(vec.data(), out, this->d * num, stream);
  }
}
525 
void
GpuIndexFlat::verifySettings_() const {
  // If we want Hgemm, ensure that it is supported on this device
  if (config_.useFloat16Accumulator) {
#ifdef FAISS_USE_FLOAT16
    FAISS_THROW_IF_NOT_MSG(config_.useFloat16,
                           "useFloat16Accumulator can only be enabled "
                           "with useFloat16");

    FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device),
                           "Device %d does not support Hgemm "
                           "(useFloat16Accumulator)",
                           config_.device);
#else
    FAISS_THROW_IF_NOT_MSG(false, "not compiled with float16 support");
#endif
  }
}
544 
//
// GpuIndexFlatL2
//

GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
                               faiss::IndexFlatL2* index,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, index, config) {
}

GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
                               int dims,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {
}

void
GpuIndexFlatL2::copyFrom(faiss::IndexFlatL2* index) {
  GpuIndexFlat::copyFrom(index);
}

void
GpuIndexFlatL2::copyTo(faiss::IndexFlatL2* index) {
  GpuIndexFlat::copyTo(index);
}

//
// GpuIndexFlatIP
//

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
                               faiss::IndexFlatIP* index,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, index, config) {
}

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
                               int dims,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {
}

void
GpuIndexFlatIP::copyFrom(faiss::IndexFlatIP* index) {
  GpuIndexFlat::copyFrom(index);
}

void
GpuIndexFlatIP::copyTo(faiss::IndexFlatIP* index) {
  GpuIndexFlat::copyTo(index);
}

} } // namespace
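
// Usage sketch: a minimal example of driving this index from host code,
// assuming the standard Faiss GPU setup (StandardGpuResources); the sizes
// and variable names below are hypothetical.
//
//   faiss::gpu::StandardGpuResources res;
//   faiss::gpu::GpuIndexFlatConfig config;
//   config.device = 0;                       // GPU on which to build
//
//   faiss::gpu::GpuIndexFlatL2 index(&res, 128, config);
//   index.add(numVecs, vecs);                // vecs: numVecs * 128 floats
//   index.search(numQueries, queries, 10,    // k = 10 nearest neighbors
//                distances, labels);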