Faiss
GpuIndexFlat.cu
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "GpuIndexFlat.h"
#include "../IndexFlat.h"
#include "GpuResources.h"
#include "impl/FlatIndex.cuh"
#include "utils/CopyUtils.cuh"
#include "utils/DeviceUtils.h"
#include "utils/StaticUtils.h"

#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include <limits>

namespace faiss { namespace gpu {

/// Default minimum CPU query data size (in bytes) above which we use
/// paged copies
constexpr size_t kMinPagedQuerySize = (size_t) 256 * 1024 * 1024;

/// Page size (in bytes) used for CPU -> GPU copies when pinned memory
/// is not available
constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;
GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           int device,
                           bool useFloat16,
                           const faiss::IndexFlat* index) :
    GpuIndex(resources, device, index->d, index->metric_type),
    minPagedSize_(kMinPagedQuerySize),
    useFloat16_(useFloat16),
    data_(nullptr) {
  copyFrom(index);
}

GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           int device,
                           int dims,
                           bool useFloat16,
                           faiss::MetricType metric) :
    GpuIndex(resources, device, dims, metric),
    minPagedSize_(kMinPagedQuerySize),
    useFloat16_(useFloat16),
    data_(nullptr) {
  DeviceScope scope(device_);

  data_ = new FlatIndex(resources,
                        dims,
                        metric == faiss::METRIC_L2,
                        useFloat16);
}

GpuIndexFlat::~GpuIndexFlat() {
  delete data_;
}

void
GpuIndexFlat::setMinPagingSize(size_t size) {
  minPagedSize_ = size;
}

size_t
GpuIndexFlat::getMinPagingSize() const {
  return minPagedSize_;
}

bool
GpuIndexFlat::getUseFloat16() const {
  return useFloat16_;
}

void
GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
  DeviceScope scope(device_);

  this->d = index->d;
  this->metric_type = index->metric_type;

  // GPU code has 32 bit indices
  FAISS_ASSERT(index->ntotal <=
               (faiss::Index::idx_t) std::numeric_limits<int>::max());
  this->ntotal = index->ntotal;

  delete data_;
  data_ = new FlatIndex(resources_,
                        this->d,
                        index->metric_type == faiss::METRIC_L2,
                        useFloat16_);

  // The index could be empty
  if (index->ntotal > 0) {
    data_->add(index->xb.data(),
               index->ntotal,
               resources_->getDefaultStream(device_));
  }
}

void
GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
  DeviceScope scope(device_);

  index->d = this->d;
  index->ntotal = this->ntotal;
  index->metric_type = this->metric_type;

  FAISS_ASSERT(data_->getSize() == this->ntotal);
  index->xb.resize(this->ntotal * this->d);

  auto stream = resources_->getDefaultStream(device_);

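  // In float16 mode we cannot copy the stored vectors directly into the
  // CPU index; they are first converted to a temporary float32 copy on
  // the GPU and then transferred back into index->xb.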
  if (this->ntotal > 0) {
    if (useFloat16_) {
      auto vecFloat32 = data_->getVectorsFloat32Copy(stream);
      fromDevice(vecFloat32, index->xb.data(), stream);
    } else {
      fromDevice(data_->getVectorsFloat32Ref(), index->xb.data(), stream);
    }
  }
}

size_t
GpuIndexFlat::getNumVecs() const {
  return this->ntotal;
}

void
GpuIndexFlat::reset() {
  DeviceScope scope(device_);

  // Free the underlying memory
  data_->reset();
  this->ntotal = 0;
}

void
GpuIndexFlat::train(Index::idx_t n, const float* x) {
  // nothing to do
}

void
GpuIndexFlat::add(Index::idx_t n, const float* x) {
  // Due to GPU indexing in int32, we can't store more than this
  // number of vectors on a GPU
  FAISS_ASSERT(this->ntotal + n <=
               (faiss::Index::idx_t) std::numeric_limits<int>::max());

  if (n > 0) {
    DeviceScope scope(device_);
    data_->add(x, n, resources_->getDefaultStream(device_));
    this->ntotal += n;
  }
}

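// FlatIndex (the GPU implementation) returns 32-bit int indices, while the
// faiss::Index interface reports results as 64-bit idx_t labels. This functor
// widens each result during the device-side thrust::transform in search().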
struct IntToLong {
  __device__ long operator()(int v) const { return (long) v; }
};

void
GpuIndexFlat::search(faiss::Index::idx_t n,
                     const float* x,
                     faiss::Index::idx_t k,
                     float* distances,
                     faiss::Index::idx_t* labels) const {
  if (n == 0) {
    return;
  }

  // For now, only support <= max int results
  // TODO: handle tiling over arbitrary n to keep within 32 bit bounds
  FAISS_ASSERT(n <= (faiss::Index::idx_t) std::numeric_limits<int>::max());
  FAISS_ASSERT(k <= 1024); // k-selection kernel limitation

  DeviceScope scope(device_);
  auto stream = resources_->getDefaultStream(device_);

  // The input vectors may be too large for the GPU, but we still
  // assume that the output distances and labels are not.
  // Go ahead and make space for output distances and labels on the
  // GPU.
  // If we reach a point where all inputs are too big, we can add
  // another level of tiling.
  auto outDistances = toDevice<float, 2>(resources_,
                                         device_,
                                         distances,
                                         stream,
                                         {(int) n, (int) k});
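  // toDevice returns a view of `distances` if it already resides on our GPU;
  // otherwise the results are staged in temporary device memory and copied
  // back to the caller's buffer at the end of this method.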

  // FlatIndex only supports an interface returning int indices
  DeviceTensor<int, 2, true> outIntIndices(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) n, (int) k}, stream);

  bool usePaged = false;

  if (getDeviceForAddress(x) == -1) {
    // It is possible that the query set `x` is too large to fit on the
    // GPU all at once.
    // In this case, we will have to handle paging of the data from CPU
    // -> GPU.
    // Currently, we don't handle the case where the output data won't
    // fit on the GPU (e.g., n * k is too large for the GPU memory).
    size_t dataSize = (size_t) n * this->d * sizeof(float);

    if (dataSize >= minPagedSize_) {
      searchFromCpuPaged_(n, x, k,
                          outDistances.data(),
                          outIntIndices.data());
      usePaged = true;
    }
  }

  if (!usePaged) {
    searchNonPaged_(n, x, k,
                    outDistances.data(),
                    outIntIndices.data());
  }

  // Convert and copy int indices out
  auto outIndices = toDevice<faiss::Index::idx_t, 2>(resources_,
                                                     device_,
                                                     labels,
                                                     stream,
                                                     {(int) n, (int) k});

  // Convert int to long
  thrust::transform(thrust::cuda::par.on(stream),
                    outIntIndices.data(),
                    outIntIndices.end(),
                    outIndices.data(),
                    IntToLong());

  // Copy back if necessary
  fromDevice<float, 2>(outDistances, distances, stream);
  fromDevice<faiss::Index::idx_t, 2>(outIndices, labels, stream);
}

void
GpuIndexFlat::searchNonPaged_(int n,
                              const float* x,
                              int k,
                              float* outDistancesData,
                              int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  auto stream = resources_->getDefaultStream(device_);

  // Make sure arguments are on the device we desire; use temporary
  // memory allocations to move it if necessary
  auto vecs = toDevice<float, 2>(resources_,
                                 device_,
                                 const_cast<float*>(x),
                                 stream,
                                 {n, (int) this->d});

  data_->query(vecs, k, outDistances, outIndices, true);
}

void
GpuIndexFlat::searchFromCpuPaged_(int n,
                                  const float* x,
                                  int k,
                                  float* outDistancesData,
                                  int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  // Is pinned memory available?
  auto pinnedAlloc = resources_->getPinnedMemory();
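  // The pinned allocation is split into two staging buffers; pageSizeInVecs
  // is how many d-dimensional float32 query vectors fit into one half. If
  // there is no pinned memory (or it is too small to hold even one vector),
  // we fall back to simple batched, non-overlapped paging below.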
  int pageSizeInVecs =
    (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));

  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
    // Just page without overlapping copy with compute
    int batchSize = utils::nextHighestPowerOf2(
      (int) ((size_t) kNonPinnedPageSize /
             (sizeof(float) * this->d)));

    for (int cur = 0; cur < n; cur += batchSize) {
      int num = std::min(batchSize, n - cur);

      auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
      auto outIndicesSlice = outIndices.narrowOutermost(cur, num);

      searchNonPaged_(num,
                      x + (size_t) cur * this->d,
                      k,
                      outDistancesSlice.data(),
                      outIndicesSlice.data());
    }

    return;
  }

  //
  // Pinned memory is available, so we can overlap copy with compute.
  // We use two pinned memory buffers, and triple-buffer the
  // procedure:
  //
  // 1 CPU copy -> pinned
  // 2 pinned copy -> GPU
  // 3 GPU compute
  //
  // 1 2 3 1 2 3 ...   (pinned buf A)
  //   1 2 3 1 2 ...   (pinned buf B)
  //     1 2 3 1 ...   (pinned buf A)
  // time ->
  //
  auto defaultStream = resources_->getDefaultStream(device_);
  auto copyStream = resources_->getAsyncCopyStream(device_);

  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
               (size_t) std::numeric_limits<int>::max());

  float* bufPinnedA = (float*) pinnedAlloc.first;
  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
  float* bufPinned[2] = {bufPinnedA, bufPinnedB};
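  // bufPinned[0]/bufPinned[1] are the two host-side staging halves of the
  // pinned allocation; bufGpus[0]/bufGpus[1] below are their device-side
  // counterparts, each sized to hold one full page of query vectors.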

  // Reserve space on the GPU for the destination of the pinned buffer
  // copy
  DeviceTensor<float, 2, true> bufGpuA(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2, true> bufGpuB(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};

  // Copy completion events for the pinned buffers
  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];

  // Execute completion events for the GPU buffers
  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];

  // All offsets are in terms of number of vectors; they remain within
  // int bounds (as this function only handles at most max int vectors)

  // Current start offset for buffer 1
  int cur1 = 0;
  int cur1BufIndex = 0;

  // Current start offset for buffer 2
  int cur2 = -1;
  int cur2BufIndex = 0;

  // Current start offset for buffer 3
  int cur3 = -1;
  int cur3BufIndex = 0;

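  // The three cursors track the three pipeline stages: cur1 is how far the
  // CPU -> pinned memcpy has advanced, cur2 how far the pinned -> GPU async
  // copy has advanced, and cur3 how far the GPU-side search has advanced.
  // Each stage alternates between buffer 0 and buffer 1 via its *BufIndex.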
  while (cur3 < n) {
    // Start async pinned -> GPU copy first (buf 2)
    if (cur2 != -1 && cur2 < n) {
      // Copy pinned to GPU
      int numToCopy = std::min(pageSizeInVecs, n - cur2);

      // Make sure any previous execution has completed before continuing
      auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
      if (eventPrev.get()) {
        eventPrev->streamWaitOnEvent(copyStream);
      }

      CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
                                  bufPinned[cur2BufIndex],
                                  (size_t) numToCopy * this->d * sizeof(float),
                                  cudaMemcpyHostToDevice,
                                  copyStream));

      // Mark a completion event in this stream
      eventPinnedCopyDone[cur2BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(copyStream)));

      // We pick up from here
      cur3 = cur2;
      cur2 += numToCopy;
      cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
    }

    if (cur3 != -1 && cur3 < n) {
      // Process on GPU
      int numToProcess = std::min(pageSizeInVecs, n - cur3);

      // Make sure the previous copy has completed before continuing
      auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
      FAISS_ASSERT(eventPrev.get());

      eventPrev->streamWaitOnEvent(defaultStream);

      // Create tensor wrappers
      DeviceTensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
                                         {numToProcess, this->d});
      auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
      auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);

      data_->query(input, k,
                   outDistancesSlice,
                   outIndicesSlice, true);

      // Create completion event
      eventGpuExecuteDone[cur3BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(defaultStream)));

      // We pick up from here
      cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
      cur3 += numToProcess;
    }

    if (cur1 < n) {
      // Copy CPU mem to CPU pinned
      int numToCopy = std::min(pageSizeInVecs, n - cur1);

      // Make sure any previous copy has completed before continuing
      auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
      if (eventPrev.get()) {
        eventPrev->cpuWaitOnEvent();
      }

      memcpy(bufPinned[cur1BufIndex],
             x + (size_t) cur1 * this->d,
             (size_t) numToCopy * this->d * sizeof(float));

      // We pick up from here
      cur2 = cur1;
      cur1 += numToCopy;
      cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
    }
  }
}

void
GpuIndexFlat::reconstruct(faiss::Index::idx_t key,
                          float* out) const {
  DeviceScope scope(device_);

  FAISS_ASSERT(key < this->ntotal);
  auto stream = resources_->getDefaultStream(device_);

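  // In float16 mode the vectors are stored on the GPU as half precision, so
  // we first materialize a float32 copy of the requested vector(s) before
  // copying back to the host; in float32 mode we copy directly from storage.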
  if (useFloat16_) {
    auto vec = data_->getVectorsFloat32Copy(key, 1, stream);
    fromDevice(vec.data(), out, this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref()[key];
    fromDevice(vec.data(), out, this->d, stream);
  }
}

void
GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0,
                            faiss::Index::idx_t num,
                            float* out) const {
  DeviceScope scope(device_);

  FAISS_ASSERT(i0 < this->ntotal);
  FAISS_ASSERT(i0 + num - 1 < this->ntotal);
  auto stream = resources_->getDefaultStream(device_);

  if (useFloat16_) {
    auto vec = data_->getVectorsFloat32Copy(i0, num, stream);
    fromDevice(vec.data(), out, num * this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref()[i0];
    fromDevice(vec.data(), out, this->d * num, stream);
  }
}

void
GpuIndexFlat::set_typename() {
  if (this->metric_type == faiss::METRIC_L2) {
    this->index_typename = "GpuL2";
  } else {
    this->index_typename = "GpuIP";
  }
}

//
// GpuIndexFlatL2
//

GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
                               int device,
                               bool useFloat16,
                               faiss::IndexFlatL2* index) :
    GpuIndexFlat(resources, device, useFloat16, index) {
}

GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
                               int device,
                               int dims,
                               bool useFloat16) :
    GpuIndexFlat(resources, device, dims, useFloat16, faiss::METRIC_L2) {
}

void
GpuIndexFlatL2::copyFrom(faiss::IndexFlatL2* index) {
  GpuIndexFlat::copyFrom(index);
}

void
GpuIndexFlatL2::copyTo(faiss::IndexFlatL2* index) {
  GpuIndexFlat::copyTo(index);
}

//
// GpuIndexFlatIP
//

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
                               int device,
                               bool useFloat16,
                               faiss::IndexFlatIP* index) :
    GpuIndexFlat(resources, device, useFloat16, index) {
}

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
                               int device,
                               int dims,
                               bool useFloat16) :
    GpuIndexFlat(resources, device, dims, useFloat16,
                 faiss::METRIC_INNER_PRODUCT) {
}

void
GpuIndexFlatIP::copyFrom(faiss::IndexFlatIP* index) {
  GpuIndexFlat::copyFrom(index);
}

void
GpuIndexFlatIP::copyTo(faiss::IndexFlatIP* index) {
  GpuIndexFlat::copyTo(index);
}

} } // namespace
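Below is a minimal usage sketch, not part of GpuIndexFlat.cu itself: it builds a GpuIndexFlatL2 on GPU 0 and runs a k-nearest-neighbor search. It assumes the StandardGpuResources implementation of GpuResources that ships alongside this file, and uses random host data purely for illustration.

// Illustrative sketch only; assumes StandardGpuResources.h from the same
// gpu/ directory.
#include "GpuIndexFlat.h"
#include "StandardGpuResources.h"

#include <cstdlib>
#include <vector>

int main() {
  int d = 64;       // vector dimension
  int nb = 100000;  // database size
  int nq = 16;      // number of queries
  int k = 4;        // neighbors per query

  // Random database and query vectors on the host
  std::vector<float> xb((size_t) nb * d);
  std::vector<float> xq((size_t) nq * d);
  for (auto& v : xb) { v = (float) rand() / RAND_MAX; }
  for (auto& v : xq) { v = (float) rand() / RAND_MAX; }

  faiss::gpu::StandardGpuResources resources;

  // device 0, float32 storage
  faiss::gpu::GpuIndexFlatL2 index(&resources, 0, d, false);
  index.add(nb, xb.data());

  // Outputs are written back to these host buffers by search()
  std::vector<float> distances((size_t) nq * k);
  std::vector<faiss::Index::idx_t> labels((size_t) nq * k);
  index.search(nq, xq.data(), k, distances.data(), labels.data());

  return 0;
}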