GpuIndexFlat.cu
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "GpuIndexFlat.h"
#include "../IndexFlat.h"
#include "GpuResources.h"
#include "impl/FlatIndex.cuh"
#include "utils/CopyUtils.cuh"
#include "utils/DeviceUtils.h"
#include "utils/StaticUtils.h"

#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include <limits>

namespace faiss { namespace gpu {

/// Default minimum CPU-side data size (in bytes) at which we switch to
/// paged copies for search
constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024;

/// Page size (in bytes) used for CPU -> GPU copies when pinned memory
/// is not available
constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;
GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           const faiss::IndexFlat* index,
                           GpuIndexFlatConfig config) :
    GpuIndex(resources, config.device, index->d, index->metric_type),
    minPagedSize_(kMinPageSize),
    config_(config),
    data_(nullptr) {
  // Flat index doesn't need training
  this->is_trained = true;
  copyFrom(index);
}
44 
46  int dims,
47  faiss::MetricType metric,
48  GpuIndexFlatConfig config) :
49  GpuIndex(resources, config.device, dims, metric),
50  minPagedSize_(kMinPageSize),
51  config_(config),
52  data_(nullptr) {
53  // Flat index doesn't need training
54  this->is_trained = true;
55  DeviceScope scope(device_);
56 
57  data_ = new FlatIndex(resources,
58  dims,
59  metric == faiss::METRIC_L2,
60  config_.useFloat16,
61  config_.storeTransposed);
62 }

GpuIndexFlat::~GpuIndexFlat() {
  delete data_;
}

void
GpuIndexFlat::setMinPagingSize(size_t size) {
  minPagedSize_ = size;
}

size_t
GpuIndexFlat::getMinPagingSize() const {
  return minPagedSize_;
}

bool
GpuIndexFlat::getUseFloat16() const {
  return config_.useFloat16;
}

void
GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
  DeviceScope scope(device_);

  this->d = index->d;
  this->metric_type = index->metric_type;

  // GPU code has 32 bit indices
  FAISS_ASSERT(index->ntotal <=
               (faiss::Index::idx_t) std::numeric_limits<int>::max());
  this->ntotal = index->ntotal;

  delete data_;
  data_ = new FlatIndex(resources_,
                        this->d,
                        index->metric_type == faiss::METRIC_L2,
                        config_.useFloat16,
                        config_.storeTransposed);

  // The index could be empty
  if (index->ntotal > 0) {
    data_->add(index->xb.data(),
               index->ntotal,
               resources_->getDefaultStream(device_));
  }
}

void
GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
  DeviceScope scope(device_);

  index->d = this->d;
  index->ntotal = this->ntotal;
  index->metric_type = this->metric_type;

  FAISS_ASSERT(data_->getSize() == this->ntotal);
  index->xb.resize(this->ntotal * this->d);

  auto stream = resources_->getDefaultStream(device_);

  if (this->ntotal > 0) {
    if (config_.useFloat16) {
      auto vecFloat32 = data_->getVectorsFloat32Copy(stream);
      fromDevice(vecFloat32, index->xb.data(), stream);
    } else {
      fromDevice(data_->getVectorsFloat32Ref(), index->xb.data(), stream);
    }
  }
}

size_t
GpuIndexFlat::getNumVecs() const {
  return this->ntotal;
}

void
GpuIndexFlat::reset() {
  DeviceScope scope(device_);

  // Free the underlying memory
  data_->reset();
  this->ntotal = 0;
}

void
GpuIndexFlat::train(Index::idx_t n, const float* x) {
  // nothing to do; the flat index requires no training
}

void
GpuIndexFlat::addImpl_(Index::idx_t n,
                       const float* x,
                       const Index::idx_t* ids) {
  // Device is already set in GpuIndex::addInternal_

  // We do not support add_with_ids
  FAISS_ASSERT(!ids);
  FAISS_ASSERT(n > 0);

  // Due to GPU indexing in int32, we can't store more than this
  // number of vectors on a GPU
  FAISS_ASSERT(this->ntotal + n <=
               (faiss::Index::idx_t) std::numeric_limits<int>::max());

  data_->add(x, n, resources_->getDefaultStream(device_));
  this->ntotal += n;
}

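// FlatIndex produces 32-bit indices on the GPU, while faiss::Index labels
// are 64-bit (long); this functor widens each result index, and is used
// with thrust::transform in search() below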
struct IntToLong {
  __device__ long operator()(int v) const { return (long) v; }
};

void
GpuIndexFlat::search(faiss::Index::idx_t n,
                     const float* x,
                     faiss::Index::idx_t k,
                     float* distances,
                     faiss::Index::idx_t* labels) const {
  if (n == 0) {
    return;
  }

  // For now, only support <= max int results
  // TODO: handle tiling over arbitrary n to keep within 32 bit bounds
  FAISS_ASSERT(n <= (faiss::Index::idx_t) std::numeric_limits<int>::max());
  FAISS_ASSERT(k <= 1024); // select limitation

  DeviceScope scope(device_);
  auto stream = resources_->getDefaultStream(device_);

  // The input vectors may be too large for the GPU, but we still
  // assume that the output distances and labels are not.
  // Go ahead and make space for output distances and labels on the
  // GPU.
  // If we reach a point where all inputs are too big, we can add
  // another level of tiling.
  auto outDistances = toDevice<float, 2>(resources_,
                                         device_,
                                         distances,
                                         stream,
                                         {(int) n, (int) k});

  // FlatIndex only supports an interface returning int indices
  DeviceTensor<int, 2, true> outIntIndices(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) n, (int) k}, stream);

  bool usePaged = false;

  if (getDeviceForAddress(x) == -1) {
    // It is possible that the user is querying with a vector set `x`
    // that won't fit on the GPU.
    // In this case, we will have to handle paging of the data from CPU
    // -> GPU.
    // Currently, we don't handle the case where the output data won't
    // fit on the GPU (e.g., n * k is too large for the GPU memory).
    size_t dataSize = (size_t) n * this->d * sizeof(float);

    if (dataSize >= minPagedSize_) {
      searchFromCpuPaged_(n, x, k,
                          outDistances.data(),
                          outIntIndices.data());
      usePaged = true;
    }
  }

  if (!usePaged) {
    searchNonPaged_(n, x, k,
                    outDistances.data(),
                    outIntIndices.data());
  }

  // Convert and copy int indices out
  auto outIndices = toDevice<faiss::Index::idx_t, 2>(resources_,
                                                     device_,
                                                     labels,
                                                     stream,
                                                     {(int) n, (int) k});

  // Convert int to long
  thrust::transform(thrust::cuda::par.on(stream),
                    outIntIndices.data(),
                    outIntIndices.end(),
                    outIndices.data(),
                    IntToLong());

  // Copy back if necessary
  fromDevice<float, 2>(outDistances, distances, stream);
  fromDevice<faiss::Index::idx_t, 2>(outIndices, labels, stream);
}

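// Should not be called: GpuIndexFlat provides its own search()
// implementation rather than routing through GpuIndex::searchImpl_, so
// this stub exists only to satisfy the interface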
void
GpuIndexFlat::searchImpl_(faiss::Index::idx_t n,
                          const float* x,
                          faiss::Index::idx_t k,
                          float* distances,
                          faiss::Index::idx_t* labels) const {
  FAISS_ASSERT(!"Should not be called");
}

void
GpuIndexFlat::searchNonPaged_(int n,
                              const float* x,
                              int k,
                              float* outDistancesData,
                              int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  auto stream = resources_->getDefaultStream(device_);

  // Make sure arguments are on the device we desire; use temporary
  // memory allocations to move them if necessary
  auto vecs = toDevice<float, 2>(resources_,
                                 device_,
                                 const_cast<float*>(x),
                                 stream,
                                 {n, (int) this->d});

  data_->query(vecs, k, outDistances, outIndices, true);
}

void
GpuIndexFlat::searchFromCpuPaged_(int n,
                                  const float* x,
                                  int k,
                                  float* outDistancesData,
                                  int* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<int, 2, true> outIndices(outIndicesData, {n, k});

  // Is pinned memory available?
  auto pinnedAlloc = resources_->getPinnedMemory();
  int pageSizeInVecs =
    (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));

  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
    // No pinned memory: just page without overlapping copy with compute
    int batchSize = utils::nextHighestPowerOf2(
      (int) ((size_t) kNonPinnedPageSize /
             (sizeof(float) * this->d)));

    for (int cur = 0; cur < n; cur += batchSize) {
      int num = std::min(batchSize, n - cur);

      auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
      auto outIndicesSlice = outIndices.narrowOutermost(cur, num);

      searchNonPaged_(num,
                      x + (size_t) cur * this->d,
                      k,
                      outDistancesSlice.data(),
                      outIndicesSlice.data());
    }

    return;
  }

  //
  // Pinned memory is available, so we can overlap copy with compute.
  // We use two pinned memory buffers, and triple-buffer the
  // procedure:
  //
  // 1 CPU copy -> pinned
  // 2 pinned copy -> GPU
  // 3 GPU compute
  //
  // 1 2 3 1 2 3 ...   (pinned buf A)
  //   1 2 3 1 2 ...   (pinned buf B)
  //     1 2 3 1 ...   (pinned buf A)
  // time ->
  //
  auto defaultStream = resources_->getDefaultStream(device_);
  auto copyStream = resources_->getAsyncCopyStream(device_);

  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
               (size_t) std::numeric_limits<int>::max());

  float* bufPinnedA = (float*) pinnedAlloc.first;
  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
  float* bufPinned[2] = {bufPinnedA, bufPinnedB};

  // Reserve space on the GPU for the destination of the pinned buffer
  // copy
  DeviceTensor<float, 2, true> bufGpuA(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2, true> bufGpuB(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};

  // Copy completion events for the pinned buffers
  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];

  // Execute completion events for the GPU buffers
  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];

  // All offsets are in terms of number of vectors; they remain within
  // int bounds (as this function only handles at most max-int vectors)

  // Current start offset for buffer 1
  int cur1 = 0;
  int cur1BufIndex = 0;

  // Current start offset for buffer 2
  int cur2 = -1;
  int cur2BufIndex = 0;

  // Current start offset for buffer 3
  int cur3 = -1;
  int cur3BufIndex = 0;

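  // Each loop iteration advances the three stages by one page where
  // possible: cur2 issues the async pinned -> GPU copy, cur3 runs the GPU
  // compute on the previously copied page, and cur1 fills the next pinned
  // buffer from the CPU source, alternating between buffers A and B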
  while (cur3 < n) {
    // Start async pinned -> GPU copy first (buf 2)
    if (cur2 != -1 && cur2 < n) {
      // Copy pinned to GPU
      int numToCopy = std::min(pageSizeInVecs, n - cur2);

      // Make sure any previous execution has completed before continuing
      auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
      if (eventPrev.get()) {
        eventPrev->streamWaitOnEvent(copyStream);
      }

      CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
                                  bufPinned[cur2BufIndex],
                                  (size_t) numToCopy * this->d * sizeof(float),
                                  cudaMemcpyHostToDevice,
                                  copyStream));

      // Mark a completion event in this stream
      eventPinnedCopyDone[cur2BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(copyStream)));

      // We pick up from here
      cur3 = cur2;
      cur2 += numToCopy;
      cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
    }

    if (cur3 != -1 && cur3 < n) {
      // Process on GPU
      int numToProcess = std::min(pageSizeInVecs, n - cur3);

      // Make sure the previous copy has completed before continuing
      auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
      FAISS_ASSERT(eventPrev.get());

      eventPrev->streamWaitOnEvent(defaultStream);

      // Create tensor wrappers
      DeviceTensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
                                         {numToProcess, this->d});
      auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
      auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);

      data_->query(input, k,
                   outDistancesSlice,
                   outIndicesSlice, true);

      // Create completion event
      eventGpuExecuteDone[cur3BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(defaultStream)));

      // We pick up from here
      cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
      cur3 += numToProcess;
    }

    if (cur1 < n) {
      // Copy CPU mem to CPU pinned
      int numToCopy = std::min(pageSizeInVecs, n - cur1);

      // Make sure any previous copy has completed before continuing
      auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
      if (eventPrev.get()) {
        eventPrev->cpuWaitOnEvent();
      }

      memcpy(bufPinned[cur1BufIndex],
             x + (size_t) cur1 * this->d,
             (size_t) numToCopy * this->d * sizeof(float));

      // We pick up from here
      cur2 = cur1;
      cur1 += numToCopy;
      cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
    }
  }
}

void
GpuIndexFlat::reconstruct(faiss::Index::idx_t key,
                          float* out) const {
  DeviceScope scope(device_);

  FAISS_ASSERT(key < this->ntotal);
  auto stream = resources_->getDefaultStream(device_);

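  // float16 storage requires converting to a temporary float32 copy on
  // the device first; float32 storage can be copied out directly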
  if (config_.useFloat16) {
    auto vec = data_->getVectorsFloat32Copy(key, 1, stream);
    fromDevice(vec.data(), out, this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref()[key];
    fromDevice(vec.data(), out, this->d, stream);
  }
}

void
GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0,
                            faiss::Index::idx_t num,
                            float* out) const {
  DeviceScope scope(device_);

  FAISS_ASSERT(i0 < this->ntotal);
  FAISS_ASSERT(i0 + num - 1 < this->ntotal);
  auto stream = resources_->getDefaultStream(device_);

  if (config_.useFloat16) {
    auto vec = data_->getVectorsFloat32Copy(i0, num, stream);
    fromDevice(vec.data(), out, num * this->d, stream);
  } else {
    auto vec = data_->getVectorsFloat32Ref()[i0];
    fromDevice(vec.data(), out, this->d * num, stream);
  }
}

void
GpuIndexFlat::set_typename() {
  if (this->metric_type == faiss::METRIC_L2) {
    this->index_typename = "GpuL2";
  } else {
    this->index_typename = "GpuIP";
  }
}

//
// GpuIndexFlatL2
//

GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
                               faiss::IndexFlatL2* index,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, index, config) {
}

GpuIndexFlatL2::GpuIndexFlatL2(GpuResources* resources,
                               int dims,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {
}

void
GpuIndexFlatL2::copyFrom(faiss::IndexFlatL2* index) {
  GpuIndexFlat::copyFrom(index);
}

void
GpuIndexFlatL2::copyTo(faiss::IndexFlatL2* index) {
  GpuIndexFlat::copyTo(index);
}

//
// GpuIndexFlatIP
//

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
                               faiss::IndexFlatIP* index,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, index, config) {
}

GpuIndexFlatIP::GpuIndexFlatIP(GpuResources* resources,
                               int dims,
                               GpuIndexFlatConfig config) :
    GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {
}

void
GpuIndexFlatIP::copyFrom(faiss::IndexFlatIP* index) {
  GpuIndexFlat::copyFrom(index);
}

void
GpuIndexFlatIP::copyTo(faiss::IndexFlatIP* index) {
  GpuIndexFlat::copyTo(index);
}

} } // namespace
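
// Illustrative usage (a sketch, not part of the original file): one way to
// build and query a flat GPU index, assuming the StandardGpuResources
// implementation of GpuResources. The data pointers (`vectors`, `queries`,
// `distances`, `labels`) and the sizes are placeholders.
//
//   faiss::gpu::StandardGpuResources resources;
//
//   faiss::gpu::GpuIndexFlatConfig config;
//   config.device = 0;            // which GPU to use
//   config.useFloat16 = false;    // store vectors as float32
//
//   faiss::gpu::GpuIndexFlatL2 index(&resources, 128, config);
//   index.add(numVecs, vectors);  // vectors: numVecs * 128 host-side floats
//
//   // 10 nearest neighbors per query; results land back in the
//   // distances/labels arrays, paged automatically if they are CPU-resident
//   index.search(numQueries, queries, 10, distances, labels);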