docs/html/StandardGpuResources_8cpp_source.html

 /**

  * Copyright (c) 2015-present, Facebook, Inc.

  * All rights reserved.

  *

  * This source code is licensed under the BSD+Patents license found in the

  * LICENSE file in the root directory of this source tree.

  */


 #include "StandardGpuResources.h"

 #include "../FaissAssert.h"


 namespace faiss { namespace gpu {


 namespace {

 /// Number of alternate streams created for each device
 constexpr int kNumStreams = 2;

 /// Fraction of total GPU memory reserved for temporary space by
 /// default (18%)
 constexpr float kDefaultTempMemFraction = 0.18f;

 /// Default size, in bytes, of the pinned host memory allocation (256 MiB)
 constexpr size_t kDefaultPinnedMemoryAllocation =
     static_cast<size_t>(256) * 1024 * 1024;

 } // namespace


 StandardGpuResources::StandardGpuResources() :

     pinnedMemAlloc_(nullptr),

     pinnedMemAllocSize_(0),

     tempMemFraction_(kDefaultTempMemFraction),

     tempMemSize_(0),

     useFraction_(true),

     pinnedMemSize_(kDefaultPinnedMemoryAllocation),

     cudaMallocWarning_(true) {

 }


 /// Tears down the per-device default, alternate and async copy streams,
 /// the cuBLAS handles, and finally the pinned host buffer. Default streams
 /// that were supplied by the user are left untouched.
 StandardGpuResources::~StandardGpuResources() {
   for (auto& deviceStream : defaultStreams_) {
     DeviceScope scope(deviceStream.first);

     // A stream registered in userDefaultStreams_ belongs to the user;
     // only streams that we created ourselves are destroyed here
     const bool suppliedByUser =
         userDefaultStreams_.find(deviceStream.first) !=
         userDefaultStreams_.end();
     if (suppliedByUser) {
       continue;
     }

     CUDA_VERIFY(cudaStreamDestroy(deviceStream.second));
   }

   for (auto& deviceStreams : alternateStreams_) {
     DeviceScope scope(deviceStreams.first);

     for (auto altStream : deviceStreams.second) {
       CUDA_VERIFY(cudaStreamDestroy(altStream));
     }
   }

   for (auto& deviceCopyStream : asyncCopyStreams_) {
     DeviceScope scope(deviceCopyStream.first);
     CUDA_VERIFY(cudaStreamDestroy(deviceCopyStream.second));
   }

   for (auto& deviceHandle : blasHandles_) {
     DeviceScope scope(deviceHandle.first);

     auto blasStatus = cublasDestroy(deviceHandle.second);
     FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
   }

   // The pinned buffer only exists once a device has been initialized
   if (pinnedMemAlloc_ != nullptr) {
     CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
   }
 }


 /// Configures this object to reserve no temporary memory: sets the
 /// temporary memory size to zero and turns the cudaMalloc warning flag off.
 void StandardGpuResources::noTempMemory() {
   const size_t noTempBytes = 0;
   setTempMemory(noTempBytes);

   const bool warn = false;
   setCudaMallocWarning(warn);
 }


 /// Requests a fixed temporary memory size of `size` (passed as-is to the
 /// per-device temporary memory stack on initialization) instead of a
 /// fraction of device memory.
 void StandardGpuResources::setTempMemory(size_t size) {
   tempMemSize_ = size;
   // An explicit size overrides any fraction previously configured
   useFraction_ = false;
 }


 /// Requests that temporary memory be sized as `fraction` of the total
 /// device memory. The fraction must lie in [0, 0.5].
 void StandardGpuResources::setTempMemoryFraction(float fraction) {
   FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);

   tempMemFraction_ = fraction;
   // A fraction overrides any explicit size previously configured
   useFraction_ = true;
 }


 /// Sets the size, in bytes, of the pinned host memory buffer that will be
 /// allocated when the first device is initialized. Must be called before
 /// any device has been initialized.
 void StandardGpuResources::setPinnedMemory(size_t size) {
   // Once a device is initialized the pinned buffer may already exist, so
   // changing its size afterwards is not allowed
   FAISS_ASSERT(defaultStreams_.size() == 0);
   FAISS_ASSERT(!pinnedMemAlloc_);

   pinnedMemSize_ = size;
 }


 void

 StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {

   auto it = defaultStreams_.find(device);

   if (it != defaultStreams_.end()) {

     // Replace this stream with the user stream

     CUDA_VERIFY(cudaStreamDestroy(it->second));

     it->second = stream;

   }


   userDefaultStreams_[device] = stream;

 }


 void

 StandardGpuResources::setDefaultNullStreamAllDevices() {

   for (int dev = 0; dev < getNumDevices(); ++dev) {

     setDefaultStream(dev, nullptr);

   }

 }


 /// Stores the cudaMalloc warning flag and forwards it to the memory manager
 /// of every device that has already been initialized; devices initialized
 /// later pick up the stored value.
 void StandardGpuResources::setCudaMallocWarning(bool b) {
   cudaMallocWarning_ = b;

   for (auto it = memory_.begin(); it != memory_.end(); ++it) {
     it->second->setCudaMallocWarning(b);
   }
 }


 void

 StandardGpuResources::initializeForDevice(int device) {

   // Use default streams as a marker for whether or not a certain

   // device has been initialized

   if (defaultStreams_.count(device) != 0) {

     return;

   }


   // If this is the first device that we're initializing, create our

   // pinned memory allocation

   if (defaultStreams_.empty() && pinnedMemSize_ > 0) {

     CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,

                               pinnedMemSize_,

                               cudaHostAllocDefault));

     pinnedMemAllocSize_ = pinnedMemSize_;

   }


   FAISS_ASSERT(device < getNumDevices());

   DeviceScope scope(device);


   // Make sure that device properties for all devices are cached

   auto& prop = getDeviceProperties(device);


   // Also check to make sure we meet our minimum compute capability (3.0)

   FAISS_ASSERT_FMT(prop.major >= 3,

                    "Device id %d with CC %d.%d not supported, "

                    "need 3.0+ compute capability",

                    device, prop.major, prop.minor);


   // Create streams

   cudaStream_t defaultStream = 0;

   auto it = userDefaultStreams_.find(device);

   if (it != userDefaultStreams_.end()) {

     // We already have a stream provided by the user

     defaultStream = it->second;

   } else {

     CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,

                                           cudaStreamNonBlocking));

   }


   defaultStreams_[device] = defaultStream;


   cudaStream_t asyncCopyStream = 0;

   CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,

                                         cudaStreamNonBlocking));


   asyncCopyStreams_[device] = asyncCopyStream;


   std::vector<cudaStream_t> deviceStreams;

   for (int j = 0; j < kNumStreams; ++j) {

     cudaStream_t stream = 0;

     CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,

                                           cudaStreamNonBlocking));


     deviceStreams.push_back(stream);

   }


   alternateStreams_[device] = std::move(deviceStreams);


   // Create cuBLAS handle

   cublasHandle_t blasHandle = 0;

   auto blasStatus = cublasCreate(&blasHandle);

   FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);

   blasHandles_[device] = blasHandle;


   size_t toAlloc = 0;

   if (useFraction_) {

     size_t devFree = 0;

     size_t devTotal = 0;


     CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));


     toAlloc = (size_t) (tempMemFraction_ * devTotal);

   } else {

     toAlloc = tempMemSize_;

   }


   FAISS_ASSERT(memory_.count(device) == 0);


   auto mem = std::unique_ptr<StackDeviceMemory>(

     new StackDeviceMemory(device, toAlloc));

   mem->setCudaMallocWarning(cudaMallocWarning_);


   memory_.emplace(device, std::move(mem));

 }


 /// Returns the cuBLAS handle that we use for the given device,
 /// initializing the device first if that has not happened yet.
 cublasHandle_t StandardGpuResources::getBlasHandle(int device) {
   initializeForDevice(device);

   auto& handle = blasHandles_[device];
   return handle;
 }


 /// Returns the default stream for the given device, initializing the
 /// device first if that has not happened yet.
 cudaStream_t StandardGpuResources::getDefaultStream(int device) {
   initializeForDevice(device);

   auto& stream = defaultStreams_[device];
   return stream;
 }


 /// Returns the set of alternative streams that we use for the given
 /// device (as a copy), initializing the device first if needed.
 std::vector<cudaStream_t> StandardGpuResources::getAlternateStreams(
     int device) {
   initializeForDevice(device);

   const auto& streams = alternateStreams_[device];
   return streams;
 }


 /// Returns the temporary memory manager for the given device,
 /// initializing the device first if that has not happened yet.
 DeviceMemory&
 StandardGpuResources::getMemoryManager(int device) {
   initializeForDevice(device);

   auto& manager = memory_[device];
   return *manager;
 }


 /// Returns the available CPU pinned memory buffer as (pointer, size).
 /// No device is initialized by this call, so the pointer is null and the
 /// size zero until some device has been initialized.
 std::pair<void*, size_t> StandardGpuResources::getPinnedMemory() {
   return std::pair<void*, size_t>(pinnedMemAlloc_, pinnedMemAllocSize_);
 }


 /// Returns the stream on which we perform async CPU <-> GPU copies,
 /// initializing the device first if that has not happened yet.
 cudaStream_t StandardGpuResources::getAsyncCopyStream(int device) {
   initializeForDevice(device);

   auto& copyStream = asyncCopyStreams_[device];
   return copyStream;
 }


 } } // namespace

faiss::gpu::StandardGpuResources::setDefaultStream
void setDefaultStream(int device, cudaStream_t stream)
Called to change the stream for work ordering.
Definition: StandardGpuResources.cpp:104

faiss::gpu::StandardGpuResources::getBlasHandle
cublasHandle_t getBlasHandle(int device) override
Returns the cuBLAS handle that we use for the given device.
Definition: StandardGpuResources.cpp:218

faiss::gpu::StandardGpuResources::setTempMemoryFraction
void setTempMemoryFraction(float fraction)
Definition: StandardGpuResources.cpp:88

faiss::gpu::StandardGpuResources::noTempMemory
void noTempMemory()
Definition: StandardGpuResources.cpp:76

faiss::gpu::StandardGpuResources::initializeForDevice
void initializeForDevice(int device) override
Internal system calls.
Definition: StandardGpuResources.cpp:132

faiss::gpu::StandardGpuResources::getAsyncCopyStream
cudaStream_t getAsyncCopyStream(int device) override
Returns the stream on which we perform async CPU <-> GPU copies.
Definition: StandardGpuResources.cpp:246

faiss::gpu::StandardGpuResources::getMemoryManager
DeviceMemory & getMemoryManager(int device) override
Returns the temporary memory manager for the given device.
Definition: StandardGpuResources.cpp:235

faiss::gpu::StandardGpuResources::setCudaMallocWarning
void setCudaMallocWarning(bool b)
Definition: StandardGpuResources.cpp:123

faiss::gpu::StandardGpuResources::setTempMemory
void setTempMemory(size_t size)
Definition: StandardGpuResources.cpp:82

faiss::gpu::StandardGpuResources::setPinnedMemory
void setPinnedMemory(size_t size)
Definition: StandardGpuResources.cpp:95

faiss::gpu::StandardGpuResources::getDefaultStream
cudaStream_t getDefaultStream(int device) override
Definition: StandardGpuResources.cpp:224

faiss::gpu::StandardGpuResources::setDefaultNullStreamAllDevices
void setDefaultNullStreamAllDevices()
Definition: StandardGpuResources.cpp:116

faiss::gpu::DeviceScope
Definition: DeviceUtils.h:64

faiss::gpu::StackDeviceMemory
Definition: StackDeviceMemory.h:21

faiss::gpu::DeviceMemory
Manages temporary memory allocations on a GPU device.
Definition: DeviceMemory.h:45

faiss::gpu::StandardGpuResources::getPinnedMemory
std::pair< void *, size_t > getPinnedMemory() override
Returns the available CPU pinned memory buffer.
Definition: StandardGpuResources.cpp:241

faiss::gpu::StandardGpuResources::getAlternateStreams
std::vector< cudaStream_t > getAlternateStreams(int device) override
Returns the set of alternative streams that we use for the given device.
Definition: StandardGpuResources.cpp:230