Faiss
GpuIndex.cu
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 
9 #include "GpuIndex.h"
10 #include "../FaissAssert.h"
11 #include "GpuResources.h"
12 #include "utils/CopyUtils.cuh"
13 #include "utils/DeviceUtils.h"
14 #include "utils/StaticUtils.h"
15 #include <limits>
16 #include <memory>
17 
18 namespace faiss { namespace gpu {
19 
20 /// Default minimum CPU data size, in bytes, above which we use paged copies for search
21 constexpr size_t kMinPageSize = (size_t) 256 * 1024 * 1024;
22 
23 /// Page size, in bytes, for copies from the CPU to the GPU when
24 /// pinned memory is not available
25 constexpr size_t kNonPinnedPageSize = (size_t) 256 * 1024 * 1024;
26 
27 // Default size, in bytes, above which we break add or search data into pages
28 constexpr size_t kAddPageSize = (size_t) 256 * 1024 * 1024;
29 
30 // Or, maximum number of vectors to consider per page of add or search
31 constexpr size_t kAddVecSize = (size_t) 512 * 1024;
32 
33 // Use a smaller search size, as precomputed code usage on IVFPQ
34 // requires substantial amounts of memory
35 // FIXME: parameterize based on algorithm need
36 constexpr size_t kSearchVecSize = (size_t) 32 * 1024;
37 
38 GpuIndex::GpuIndex(GpuResources* resources,
39  int dims,
40  faiss::MetricType metric,
41  GpuIndexConfig config) :
42  Index(dims, metric),
43  resources_(resources),
44  device_(config.device),
45  memorySpace_(config.memorySpace),
46  minPagedSize_(kMinPageSize) {
47  FAISS_THROW_IF_NOT_FMT(device_ < getNumDevices(),
48  "Invalid GPU device %d", device_);
49 
50  FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions");
51 
52 #ifdef FAISS_UNIFIED_MEM
53  FAISS_THROW_IF_NOT_FMT(
54  memorySpace_ == MemorySpace::Device ||
55  (memorySpace_ == MemorySpace::Unified &&
56  getFullUnifiedMemSupport(device_)),
57  "Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)",
58  config.device);
59 #else
60  FAISS_THROW_IF_NOT_MSG(memorySpace_ == MemorySpace::Device,
61  "Must compile with CUDA 8+ for Unified Memory support");
62 #endif
63 
64  FAISS_ASSERT(resources_);
65  resources_->initializeForDevice(device_);
66 }
67 
68 void
69 GpuIndex::setMinPagingSize(size_t size) {
70  minPagedSize_ = size;
71 }
72 
73 size_t
74 GpuIndex::getMinPagingSize() const {
75  return minPagedSize_;
76 }
77 
78 void
79 GpuIndex::add(Index::idx_t n, const float* x) {
80  // Pass to add_with_ids
81  add_with_ids(n, x, nullptr);
82 }
83 
84 void
85 GpuIndex::add_with_ids(Index::idx_t n,
86  const float* x,
87  const Index::idx_t* ids) {
88  FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
89 
90  // For now, only support <= max int results
91  FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits<int>::max(),
92  "GPU index only supports up to %d indices",
93  std::numeric_limits<int>::max());
94 
95  if (n == 0) {
96  // nothing to add
97  return;
98  }
99 
100  std::vector<Index::idx_t> generatedIds;
101 
102  // Generate IDs if we need them
103  if (!ids && addImplRequiresIDs_()) {
104  generatedIds = std::vector<Index::idx_t>(n);
105 
106  for (Index::idx_t i = 0; i < n; ++i) {
107  generatedIds[i] = this->ntotal + i;
108  }
109  }
110 
111  DeviceScope scope(device_);
112  addPaged_((int) n, x, ids ? ids : generatedIds.data());
113 }
114 
115 void
116 GpuIndex::addPaged_(int n,
117  const float* x,
118  const Index::idx_t* ids) {
119  if (n > 0) {
120  size_t totalSize = (size_t) n * this->d * sizeof(float);
121 
122  if (totalSize > kAddPageSize || n > kAddVecSize) {
123  // How many vectors fit into kAddPageSize?
124  size_t maxNumVecsForPageSize =
125  kAddPageSize / ((size_t) this->d * sizeof(float));
126 
127  // Always add at least 1 vector, if we have huge vectors
128  maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t) 1);
129 
130  size_t tileSize = std::min((size_t) n, maxNumVecsForPageSize);
131  tileSize = std::min(tileSize, kSearchVecSize);
132 
133  for (size_t i = 0; i < (size_t) n; i += tileSize) {
134  size_t curNum = std::min(tileSize, n - i);
135 
136  addPage_(curNum,
137  x + i * (size_t) this->d,
138  ids ? ids + i : nullptr);
139  }
140  } else {
141  addPage_(n, x, ids);
142  }
143  }
144 }
145 
146 void
147 GpuIndex::addPage_(int n,
148  const float* x,
149  const Index::idx_t* ids) {
150  // At this point, `x` can be resident on CPU or GPU, and `ids` may be resident
151  // on CPU, GPU or may be null.
152  //
153  // Before continuing, we guarantee that all data will be resident on the GPU.
154  auto stream = resources_->getDefaultStreamCurrentDevice();
155 
156  auto vecs = toDevice<float, 2>(resources_,
157  device_,
158  const_cast<float*>(x),
159  stream,
160  {n, this->d});
161 
162  if (ids) {
163  auto indices = toDevice<Index::idx_t, 1>(resources_,
164  device_,
165  const_cast<Index::idx_t*>(ids),
166  stream,
167  {n});
168 
169  addImpl_(n, vecs.data(), ids ? indices.data() : nullptr);
170  } else {
171  addImpl_(n, vecs.data(), nullptr);
172  }
173 }
174 
175 void
176 GpuIndex::search(Index::idx_t n,
177  const float* x,
178  Index::idx_t k,
179  float* distances,
180  Index::idx_t* labels) const {
181  FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
182 
183  // For now, only support <= max int results
184  FAISS_THROW_IF_NOT_FMT(n <= (Index::idx_t) std::numeric_limits<int>::max(),
185  "GPU index only supports up to %d indices",
186  std::numeric_limits<int>::max());
187 
188  // Maximum k-selection supported is based on the CUDA SDK
189  FAISS_THROW_IF_NOT_FMT(k <= (Index::idx_t) getMaxKSelection(),
190  "GPU index only supports k <= %d (requested %d)",
191  getMaxKSelection(),
192  (int) k); // select limitation
193 
194  if (n == 0 || k == 0) {
195  // nothing to search
196  return;
197  }
198 
199  DeviceScope scope(device_);
200  auto stream = resources_->getDefaultStream(device_);
201 
202  // We guarantee that the searchImpl_ will be called with device-resident
203  // pointers.
204 
205  // The input vectors may be too large for the GPU, but we still
206  // assume that the output distances and labels are not.
207  // Go ahead and make space for output distances and labels on the
208  // GPU.
209  // If we reach a point where all inputs are too big, we can add
210  // another level of tiling.
211  auto outDistances =
212  toDevice<float, 2>(resources_, device_, distances, stream,
213  {(int) n, (int) k});
214 
215  auto outLabels =
216  toDevice<faiss::Index::idx_t, 2>(resources_, device_, labels, stream,
217  {(int) n, (int) k});
218 
219  bool usePaged = false;
220 
221  if (getDeviceForAddress(x) == -1) {
222  // It is possible that the user is querying for a vector set size
223  // `x` that won't fit on the GPU.
224  // In this case, we will have to handle paging of the data from CPU
225  // -> GPU.
226  // Currently, we don't handle the case where the output data won't
227  // fit on the GPU (e.g., n * k is too large for the GPU memory).
228  size_t dataSize = (size_t) n * this->d * sizeof(float);
229 
230  if (dataSize >= minPagedSize_) {
231  searchFromCpuPaged_(n, x, k,
232  outDistances.data(),
233  outLabels.data());
234  usePaged = true;
235  }
236  }
237 
238  if (!usePaged) {
239  searchNonPaged_(n, x, k,
240  outDistances.data(),
241  outLabels.data());
242  }
243 
244  // Copy back if necessary
245  fromDevice<float, 2>(outDistances, distances, stream);
246  fromDevice<faiss::Index::idx_t, 2>(outLabels, labels, stream);
247 }
248 
249 void
250 GpuIndex::searchNonPaged_(int n,
251  const float* x,
252  int k,
253  float* outDistancesData,
254  Index::idx_t* outIndicesData) const {
255  auto stream = resources_->getDefaultStream(device_);
256 
257  // Make sure arguments are on the device we desire; use temporary
258  // memory allocations to move them if necessary
259  auto vecs = toDevice<float, 2>(resources_,
260  device_,
261  const_cast<float*>(x),
262  stream,
263  {n, (int) this->d});
264 
265  searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData);
266 }
267 
268 void
269 GpuIndex::searchFromCpuPaged_(int n,
270  const float* x,
271  int k,
272  float* outDistancesData,
273  Index::idx_t* outIndicesData) const {
274  Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
275  Tensor<Index::idx_t, 2, true> outIndices(outIndicesData, {n, k});
276 
277  // Is pinned memory available?
278  auto pinnedAlloc = resources_->getPinnedMemory();
279  int pageSizeInVecs =
280  (int) ((pinnedAlloc.second / 2) / (sizeof(float) * this->d));
281 
282  if (!pinnedAlloc.first || pageSizeInVecs < 1) {
283  // Just page without overlapping copy with compute
284  int batchSize = utils::nextHighestPowerOf2(
285  (int) ((size_t) kNonPinnedPageSize /
286  (sizeof(float) * this->d)));
287 
288  for (int cur = 0; cur < n; cur += batchSize) {
289  int num = std::min(batchSize, n - cur);
290 
291  auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
292  auto outIndicesSlice = outIndices.narrowOutermost(cur, num);
293 
294  searchNonPaged_(num,
295  x + (size_t) cur * this->d,
296  k,
297  outDistancesSlice.data(),
298  outIndicesSlice.data());
299  }
300 
301  return;
302  }
303 
304  //
305  // Pinned memory is available, so we can overlap copy with compute.
306  // We use two pinned memory buffers, and triple-buffer the
307  // procedure:
308  //
309  // 1 CPU copy -> pinned
310  // 2 pinned copy -> GPU
311  // 3 GPU compute
312  //
313  // 1 2 3 1 2 3 ... (pinned buf A)
314  // 1 2 3 1 2 ... (pinned buf B)
315  // 1 2 3 1 ... (pinned buf A)
316  // time ->
317  //
318  auto defaultStream = resources_->getDefaultStream(device_);
319  auto copyStream = resources_->getAsyncCopyStream(device_);
320 
321  FAISS_ASSERT((size_t) pageSizeInVecs * this->d <=
322  (size_t) std::numeric_limits<int>::max());
323 
324  float* bufPinnedA = (float*) pinnedAlloc.first;
325  float* bufPinnedB = bufPinnedA + (size_t) pageSizeInVecs * this->d;
326  float* bufPinned[2] = {bufPinnedA, bufPinnedB};
327 
328  // Reserve space on the GPU for the destination of the pinned buffer
329  // copy
330  DeviceTensor<float, 2, true> bufGpuA(
331  resources_->getMemoryManagerCurrentDevice(),
332  {(int) pageSizeInVecs, (int) this->d},
333  defaultStream);
334  DeviceTensor<float, 2, true> bufGpuB(
335  resources_->getMemoryManagerCurrentDevice(),
336  {(int) pageSizeInVecs, (int) this->d},
337  defaultStream);
338  DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};
339 
340  // Copy completion events for the pinned buffers
341  std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];
342 
343  // Execute completion events for the GPU buffers
344  std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];
345 
346  // All offsets are in terms of number of vectors; they remain within
347  // int bounds (as this function only handles at most max-int vectors)
348 
349  // Current start offset for buffer 1
350  int cur1 = 0;
351  int cur1BufIndex = 0;
352 
353  // Current start offset for buffer 2
354  int cur2 = -1;
355  int cur2BufIndex = 0;
356 
357  // Current start offset for buffer 3
358  int cur3 = -1;
359  int cur3BufIndex = 0;
360 
361  while (cur3 < n) {
362  // Start async pinned -> GPU copy first (buf 2)
363  if (cur2 != -1 && cur2 < n) {
364  // Copy pinned to GPU
365  int numToCopy = std::min(pageSizeInVecs, n - cur2);
366 
367  // Make sure any previous execution has completed before continuing
368  auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
369  if (eventPrev.get()) {
370  eventPrev->streamWaitOnEvent(copyStream);
371  }
372 
373  CUDA_VERIFY(cudaMemcpyAsync(bufGpus[cur2BufIndex]->data(),
374  bufPinned[cur2BufIndex],
375  (size_t) numToCopy * this->d * sizeof(float),
376  cudaMemcpyHostToDevice,
377  copyStream));
378 
379  // Mark a completion event in this stream
380  eventPinnedCopyDone[cur2BufIndex] =
381  std::move(std::unique_ptr<CudaEvent>(new CudaEvent(copyStream)));
382 
383  // We pick up from here
384  cur3 = cur2;
385  cur2 += numToCopy;
386  cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
387  }
388 
389  if (cur3 != -1 && cur3 < n) {
390  // Process on GPU
391  int numToProcess = std::min(pageSizeInVecs, n - cur3);
392 
393  // Make sure the previous copy has completed before continuing
394  auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
395  FAISS_ASSERT(eventPrev.get());
396 
397  eventPrev->streamWaitOnEvent(defaultStream);
398 
399  // Create tensor wrappers
400  // DeviceTensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
401  // {numToProcess, this->d});
402  auto outDistancesSlice = outDistances.narrowOutermost(cur3, numToProcess);
403  auto outIndicesSlice = outIndices.narrowOutermost(cur3, numToProcess);
404 
405  searchImpl_(numToProcess,
406  bufGpus[cur3BufIndex]->data(),
407  k,
408  outDistancesSlice.data(),
409  outIndicesSlice.data());
410 
411  // Create completion event
412  eventGpuExecuteDone[cur3BufIndex] =
413  std::move(std::unique_ptr<CudaEvent>(new CudaEvent(defaultStream)));
414 
415  // We pick up from here
416  cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
417  cur3 += numToProcess;
418  }
419 
420  if (cur1 < n) {
421  // Copy CPU mem to CPU pinned
422  int numToCopy = std::min(pageSizeInVecs, n - cur1);
423 
424  // Make sure any previous copy has completed before continuing
425  auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
426  if (eventPrev.get()) {
427  eventPrev->cpuWaitOnEvent();
428  }
429 
430  memcpy(bufPinned[cur1BufIndex],
431  x + (size_t) cur1 * this->d,
432  (size_t) numToCopy * this->d * sizeof(float));
433 
434  // We pick up from here
435  cur2 = cur1;
436  cur1 += numToCopy;
437  cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
438  }
439  }
440 }
441 
442 } } // namespace
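
A minimal usage sketch of the entry points defined above, assuming the GpuIndexFlatL2 subclass, GpuIndexFlatConfig, and StandardGpuResources from the Faiss GPU API (header paths follow the installed <faiss/gpu/...> layout); the sizes and thresholds are illustrative only:

#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>

int main() {
  const int d = 128;                     // vector dimension
  faiss::gpu::StandardGpuResources res;  // manages streams and scratch memory

  faiss::gpu::GpuIndexFlatConfig config;
  config.device = 0;                     // validated in the GpuIndex constructor

  faiss::gpu::GpuIndexFlatL2 index(&res, d, config);

  // Lower the paging threshold so CPU-resident query sets larger than
  // 32 MiB take the paged search path (the default is kMinPageSize, 256 MiB).
  index.setMinPagingSize((size_t) 32 * 1024 * 1024);

  // add() forwards to add_with_ids(), which pages through addPaged_()
  // when the data exceeds kAddPageSize or kAddVecSize.
  std::vector<float> xb((size_t) 100000 * d, 0.5f);
  index.add(100000, xb.data());

  // search() copies (or pages) the queries to the GPU and copies the
  // distances and labels back to the host buffers.
  const int nq = 16, k = 10;
  std::vector<float> xq((size_t) nq * d, 0.5f);
  std::vector<float> distances((size_t) nq * k);
  std::vector<faiss::Index::idx_t> labels((size_t) nq * k);
  index.search(nq, xq.data(), k, distances.data(), labels.data());

  return 0;
}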