#include "GpuIndex.h"
#include "../FaissAssert.h"
#include "GpuResources.h"
#include "utils/CopyUtils.cuh"
#include "utils/DeviceUtils.h"
#include "utils/StaticUtils.h"
#include <algorithm>
#include <cstring>
#include <limits>
#include <memory>
#include <vector>
namespace faiss { namespace gpu {
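// Paging thresholds used below. These are heuristics rather than hard limits:
// kMinPageSize is the default host-side query size above which search() pages
// data to the GPU (it initializes minPagedSize_), kNonPinnedPageSize is the
// batch size used when no pinned memory is available, kAddPageSize and
// kAddVecSize bound the byte and vector size of a single add page, and
// kSearchVecSize further caps the per-tile vector count.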
constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024;

constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;

constexpr size_t kAddPageSize = (size_t) 256 * 1024 * 1024;

constexpr size_t kAddVecSize = (size_t) 512 * 1024;

constexpr size_t kSearchVecSize = (size_t) 32 * 1024;
GpuIndex::GpuIndex(GpuResources* resources,
                   int dims,
                   faiss::MetricType metric,
                   GpuIndexConfig config) :
    Index(dims, metric),
    resources_(resources),
    device_(config.device),
    memorySpace_(config.memorySpace),
    minPagedSize_(kMinPageSize) {
  FAISS_THROW_IF_NOT_FMT(device_ < getNumDevices(),
                         "Invalid GPU device %d", device_);

  FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions");

#ifdef FAISS_UNIFIED_MEM
  FAISS_THROW_IF_NOT_FMT(
    memorySpace_ == MemorySpace::Device ||
    (memorySpace_ == MemorySpace::Unified &&
     getFullUnifiedMemSupport(device_)),
    "Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)",
    config.device);
#else
  FAISS_THROW_IF_NOT_MSG(memorySpace_ == MemorySpace::Device,
                         "Must compile with CUDA 8+ for Unified Memory support");
#endif

  FAISS_ASSERT(resources_);
  resources_->initializeForDevice(device_);
}
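// The add path below fans out as add_with_ids -> addPaged_ -> addPage_ ->
// addImpl_: paging keeps each host -> device copy bounded by kAddPageSize and
// kAddVecSize, so large host-side additions are streamed in tiles rather than
// copied to the GPU in one shot.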
void
GpuIndex::add_with_ids(Index::idx_t n,
                       const float* x,
                       const Index::idx_t* ids) {
  FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");

  // GPU indices only support int-sized vector counts
  FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %d indices",
                         std::numeric_limits<int>::max());

  std::vector<Index::idx_t> generatedIds;

  // Generate sequential IDs only if none were provided and the implementation
  // actually needs them
  if (!ids && addImplRequiresIDs_()) {
    generatedIds = std::vector<Index::idx_t>(n);

    for (Index::idx_t i = 0; i < n; ++i) {
      generatedIds[i] = this->ntotal + i;
    }
  }

  addPaged_((int) n, x, ids ? ids : generatedIds.data());
}
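// Note on the ID generation above: when the caller supplies no IDs but the
// implementation requires them, IDs are assigned sequentially from ntotal,
// which matches the numbering a CPU index uses for add() without explicit IDs.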
void
GpuIndex::addPaged_(int n,
                    const float* x,
                    const Index::idx_t* ids) {
  if (n > 0) {
    size_t totalSize = (size_t) n * this->d * sizeof(float);

    if (totalSize > kAddPageSize || (size_t) n > kAddVecSize) {
      // How many vectors fit in a single page?
      size_t maxNumVecsForPageSize =
        kAddPageSize / ((size_t) this->d * sizeof(float));

      // Always add at least one vector, even if a single vector exceeds the
      // page size
      maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t) 1);

      size_t tileSize = std::min((size_t) n, maxNumVecsForPageSize);
      tileSize = std::min(tileSize, kSearchVecSize);

      for (size_t i = 0; i < (size_t) n; i += tileSize) {
        size_t curNum = std::min(tileSize, (size_t) n - i);

        addPage_((int) curNum,
                 x + i * (size_t) this->d,
                 ids ? ids + i : nullptr);
      }
    } else {
      addPage_(n, x, ids);
    }
  }
}
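// Sizing illustration for addPaged_ above (not from the original source):
// with d = 128, kAddPageSize / (d * sizeof(float)) = 268435456 / 512 = 524288
// vectors fit in one page, but the tile size is further capped at
// kSearchVecSize = 32768 vectors, i.e. roughly 16 MiB of float data per
// addPage_ call.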
void
GpuIndex::addPage_(int n,
                   const float* x,
                   const Index::idx_t* ids) {
  // Guarantee that the vector and ID data is resident on our device before
  // handing it to addImpl_
  auto stream = resources_->getDefaultStream(device_);

  auto vecs = toDevice<float, 2>(resources_, device_,
                                 const_cast<float*>(x), stream,
                                 {n, this->d});

  if (ids) {
    auto indices = toDevice<Index::idx_t, 1>(resources_, device_,
                                             const_cast<Index::idx_t*>(ids),
                                             stream, {n});

    addImpl_(n, vecs.data(), ids ? indices.data() : nullptr);
  } else {
    addImpl_(n, vecs.data(), nullptr);
  }
}
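// toDevice is what makes addPage_ (and the search path below) indifferent to
// where the caller's pointers live: data already resident on this device is
// used in place, while host-resident data is copied into temporary device
// memory on the given stream.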
void
GpuIndex::search(Index::idx_t n,
                 const float* x,
                 Index::idx_t k,
                 float* distances,
                 Index::idx_t* labels) const {
  FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");

  FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits<int>::max(),
                         "GPU index only supports up to %d indices",
                         std::numeric_limits<int>::max());

  // The GPU k-selection kernels bound the maximum supported k
  FAISS_THROW_IF_NOT_FMT(k <= (Index::idx_t) getMaxKSelection(),
                         "GPU index only supports k <= %d (requested %d)",
                         getMaxKSelection(), (int) k);

  if (n == 0 || k == 0) {
    return;
  }

  auto stream = resources_->getDefaultStream(device_);

  // Guarantee that the output buffers are resident on our device
  auto outDistances = toDevice<float, 2>(resources_, device_, distances,
                                         stream, {(int) n, (int) k});
  auto outLabels = toDevice<faiss::Index::idx_t, 2>(resources_, device_, labels,
                                                    stream, {(int) n, (int) k});

  bool usePaged = false;

  if (getDeviceForAddress(x) == -1) {
    // The queries are on the CPU; page them through pinned memory if the copy
    // is large enough to be worth it
    size_t dataSize = (size_t) n * this->d * sizeof(float);

    if (dataSize >= minPagedSize_) {
      searchFromCpuPaged_(n, x, k,
                          outDistances.data(), outLabels.data());
      usePaged = true;
    }
  }

  if (!usePaged) {
    searchNonPaged_(n, x, k,
                    outDistances.data(), outLabels.data());
  }

  // Copy results back to the caller's buffers if they were not device-resident
  fromDevice<float, 2>(outDistances, distances, stream);
  fromDevice<faiss::Index::idx_t, 2>(outLabels, labels, stream);
}
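// Summary of the dispatch above: queries already resident on the GPU always go
// through searchNonPaged_; CPU-resident queries smaller than minPagedSize_ are
// copied once and also handled by searchNonPaged_, while larger CPU query sets
// are streamed through searchFromCpuPaged_ (the paging threshold is adjustable
// via setMinPagingSize()).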
void
GpuIndex::searchNonPaged_(int n,
                          const float* x,
                          int k,
                          float* outDistancesData,
                          Index::idx_t* outIndicesData) const {
  auto stream = resources_->getDefaultStream(device_);

  // Make sure the query vectors are resident on our device
  auto vecs = toDevice<float, 2>(resources_, device_,
                                 const_cast<float*>(x), stream,
                                 {n, (int) this->d});

  searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData);
}
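// searchNonPaged_ moves the whole query set to the device in a single toDevice
// call; searchFromCpuPaged_ below instead streams the queries to the GPU in
// fixed-size pages, through pinned host memory when it is available.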
void
GpuIndex::searchFromCpuPaged_(int n,
                              const float* x,
                              int k,
                              float* outDistancesData,
                              Index::idx_t* outIndicesData) const {
  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
  Tensor<Index::idx_t, 2, true> outIndices(outIndicesData, {n, k});

  // Is pinned memory available?
  auto pinnedAlloc = resources_->getPinnedMemory();
  int pageSizeInVecs =
    (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));

  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
    // No pinned memory: just batch the queries without overlapping copy and
    // compute
    int batchSize = utils::nextHighestPowerOf2(
      (int) ((size_t) kNonPinnedPageSize /
             (sizeof(float) * this->d)));

    for (int cur = 0; cur < n; cur += batchSize) {
      int num = std::min(batchSize, n - cur);

      auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
      auto outIndicesSlice = outIndices.narrowOutermost(cur, num);

      searchNonPaged_(num,
                      x + (size_t) cur * this->d,
                      k,
                      outDistancesSlice.data(),
                      outIndicesSlice.data());
    }

    return;
  }
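  //
  // Pinned memory is available, so we can overlap the host -> GPU transfers
  // with compute. Two pinned buffers and two GPU buffers are cycled through a
  // three-stage pipeline:
  //
  //   stage 1: memcpy a page of the source data into a pinned buffer
  //   stage 2: async copy (copyStream) from that pinned buffer to the GPU
  //   stage 3: run searchImpl_ (defaultStream) on the GPU-resident page
  //
  // With two buffers the stages overlap roughly like this over time:
  //   buffer A: 1 2 3 1 2 3 ...
  //   buffer B:   1 2 3 1 2 3 ...
  //
  // CUDA events guard reuse: a pinned buffer is rewritten only after its
  // pinned -> GPU copy has finished, and a GPU buffer is rewritten only after
  // the search on it has finished.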
  auto defaultStream = resources_->getDefaultStream(device_);
  auto copyStream = resources_->getAsyncCopyStream(device_);

  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
               (size_t) std::numeric_limits<int>::max());

  float* bufPinnedA = (float*) pinnedAlloc.first;
  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
  float* bufPinned[2] = {bufPinnedA, bufPinnedB};

  // GPU-side destinations for the pinned buffer copies
  DeviceTensor<float, 2, true> bufGpuA(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2, true> bufGpuB(
    resources_->getMemoryManagerCurrentDevice(),
    {(int) pageSizeInVecs, (int) this->d},
    defaultStream);
  DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};

  // Events recorded when the pinned -> GPU copy of a buffer completes
  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];

  // Events recorded when the GPU has finished searching a buffer
  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];

  // Per-stage offsets into the query set, in vectors; -1 means the stage has
  // not started yet
  int cur1 = 0;
  int cur1BufIndex = 0;

  int cur2 = -1;
  int cur2BufIndex = 0;

  int cur3 = -1;
  int cur3BufIndex = 0;

  while (cur3 < n) {
    if (cur2 != -1 && cur2 < n) {
      // Stage 2: copy a filled pinned buffer to the GPU
      int numToCopy = std::min(pageSizeInVecs, n - cur2);

      // Make sure any previous execution using this GPU buffer has completed
      auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
      if (eventPrev.get()) {
        eventPrev->streamWaitOnEvent(copyStream);
      }

      CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
                                  bufPinned[cur2BufIndex],
                                  (size_t) numToCopy * this->d * sizeof(float),
                                  cudaMemcpyHostToDevice,
                                  copyStream));

      // Mark the copy as done so stage 3 can wait on it
      eventPinnedCopyDone[cur2BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(copyStream)));

      // Stage 3 picks up from here
      cur3 = cur2;
      cur2 += numToCopy;
      cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
    }
    if (cur3 != -1 && cur3 < n) {
      // Stage 3: run the search on the page now resident on the GPU
      int numToProcess = std::min(pageSizeInVecs, n - cur3);

      // Make sure the pinned -> GPU copy of this buffer has completed
      auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
      FAISS_ASSERT(eventPrev.get());

      eventPrev->streamWaitOnEvent(defaultStream);

      auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
      auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);

      searchImpl_(numToProcess,
                  bufGpus[cur3BufIndex]->data(),
                  k,
                  outDistancesSlice.data(),
                  outIndicesSlice.data());

      // Mark execution as done so stage 2 can reuse this GPU buffer
      eventGpuExecuteDone[cur3BufIndex] =
        std::move(std::unique_ptr<CudaEvent>(new CudaEvent(defaultStream)));

      cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
      cur3 += numToProcess;
    }
    if (cur1 < n) {
      // Stage 1: copy the next page of source data into a pinned buffer
      int numToCopy = std::min(pageSizeInVecs, n - cur1);

      // Make sure any previous pinned -> GPU copy of this buffer has completed
      auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
      if (eventPrev.get()) {
        eventPrev->cpuWaitOnEvent();
      }

      memcpy(bufPinned[cur1BufIndex],
             x + (size_t) cur1 * this->d,
             (size_t) numToCopy * this->d * sizeof(float));

      // Stage 2 picks up from here
      cur2 = cur1;
      cur1 += numToCopy;
      cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
    }
  }
}

} } // namespace faiss::gpu