10 #include "StandardGpuResources.h"
11 #include "../FaissAssert.h"
13 namespace faiss {
namespace gpu {
/// Number of additional streams created per device; the device setup code
/// pushes this many streams into that device's alternateStreams_ entry.
constexpr int kNumStreams = 2;

/// Default value of tempMemFraction_ (0.18 == 18%). The device setup code
/// multiplies it by the total memory reported by cudaMemGetInfo to size the
/// temporary allocation.
constexpr float kDefaultTempMemFraction = 0.18f;

/// Default value of pinnedMemSize_, in bytes: 256 MiB. The first factor is
/// converted to size_t before multiplying so the product is evaluated in
/// size_t rather than int.
constexpr size_t kDefaultPinnedMemoryAllocation =
    static_cast<size_t>(256) * 1024 * 1024;
// Default constructor. The initializers visible in this listing start with no
// pinned host buffer (null pointer, zero recorded size), take the
// temporary-memory fraction and the requested pinned-memory size from the
// file-level defaults, and set cudaMallocWarning_ to true.
// NOTE(review): the embedded listing numbers jump from 30 to 33 and no closing
// brace is shown, so further initializers may be missing from this listing --
// confirm against the real source file.
27 StandardGpuResources::StandardGpuResources() :
28 pinnedMemAlloc_(nullptr),
29 pinnedMemAllocSize_(0),
30 tempMemFraction_(kDefaultTempMemFraction),
33 pinnedMemSize_(kDefaultPinnedMemoryAllocation),
34 cudaMallocWarning_(true) {
// Destructor: destroys the CUDA streams and cuBLAS handles stored in this
// object's per-device maps, then frees the pinned host buffer if one exists.
// A DeviceScope is constructed from each map key (the device id) before that
// entry's resources are destroyed -- presumably to make that device current;
// confirm against DeviceScope's definition.
// NOTE(review): closing braces are not shown in this listing.
37 StandardGpuResources::~StandardGpuResources() {
// Default streams: destroy a stream only when userDefaultStreams_ has no entry
// for the same device.
38 for (
auto& entry : defaultStreams_) {
39 DeviceScope scope(entry.first);
41 auto it = userDefaultStreams_.find(entry.first);
42 if (it == userDefaultStreams_.end()) {
45 CUDA_VERIFY(cudaStreamDestroy(entry.second));
// Alternate streams: each map value is a collection of streams; destroy all.
49 for (
auto& entry : alternateStreams_) {
50 DeviceScope scope(entry.first);
52 for (
auto stream : entry.second) {
53 CUDA_VERIFY(cudaStreamDestroy(stream));
// Async-copy streams: one stream per device.
57 for (
auto& entry : asyncCopyStreams_) {
58 DeviceScope scope(entry.first);
60 CUDA_VERIFY(cudaStreamDestroy(entry.second));
// cuBLAS handles: destruction must report CUBLAS_STATUS_SUCCESS.
63 for (
auto& entry : blasHandles_) {
64 DeviceScope scope(entry.first);
66 auto blasStatus = cublasDestroy(entry.second);
67 FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
// Pinned host buffer: freed only if the pointer is non-null.
70 if (pinnedMemAlloc_) {
71 CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
// NOTE(review): the signature line is absent from this listing; these
// statements presumably form the body of setTempMemoryFraction(float fraction).
// The fraction must lie in [0.0, 0.5]; it is then stored in tempMemFraction_.
// The embedded numbering shows gaps around these lines, so other statements
// may be missing -- confirm against the real source file.
89 FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);
91 tempMemFraction_ = fraction;
// NOTE(review): the signature line is absent from this listing; these
// statements presumably form the body of setPinnedMemory(size_t size).
// Asserts that defaultStreams_ is still empty and that no pinned buffer has
// been allocated, then records the requested size in pinnedMemSize_.
// Presumably this means the size can only be changed before any device has
// been set up -- confirm against the callers.
97 FAISS_ASSERT(defaultStreams_.size() == 0);
98 FAISS_ASSERT(!pinnedMemAlloc_);
100 pinnedMemSize_ = size;
// NOTE(review): the signature line is absent from this listing; these
// statements presumably form the body of
// setDefaultStream(int device, cudaStream_t stream).
// If a default stream is already stored for this device it is destroyed, and
// the caller's stream is recorded in userDefaultStreams_.
// NOTE(review): the stored stream is destroyed here without checking whether
// it was itself registered in userDefaultStreams_ (the destructor does make
// that check) -- confirm this is intended. The embedded numbers jump from 108
// to 112, so statements may be missing from this listing.
105 auto it = defaultStreams_.find(device);
106 if (it != defaultStreams_.end()) {
108 CUDA_VERIFY(cudaStreamDestroy(it->second));
112 userDefaultStreams_[device] = stream;
// NOTE(review): only this loop header survives in the listing; it presumably
// belongs to setDefaultNullStreamAllDevices(). It iterates over every device
// index from 0 up to (but excluding) getNumDevices(); the loop body is not
// shown.
117 for (
int dev = 0; dev < getNumDevices(); ++dev) {
// NOTE(review): the signature line is absent from this listing; these
// statements presumably form the body of setCudaMallocWarning(bool b).
// Stores the flag in cudaMallocWarning_ and forwards it to every entry
// currently held in memory_.
124 cudaMallocWarning_ = b;
126 for (
auto& v : memory_) {
127 v.second->setCudaMallocWarning(b);
// NOTE(review): the signature line is absent from this listing; these
// statements presumably form the body of initializeForDevice(int device).
// They create and record the per-device resources: default stream, async-copy
// stream, alternate streams, cuBLAS handle and temporary-memory manager.
// Several lines are missing (gaps in the embedded numbering), so read the
// notes below as describing only what is visible.

// Entered when defaultStreams_ already has an entry for this device. The body
// is not shown; presumably an early return -- confirm.
135 if (defaultStreams_.count(device) != 0) {
// When no device has a default stream yet and a non-zero pinned size was
// requested, allocate the pinned host buffer and record its size. The size
// argument line of cudaHostAlloc (listing line 143) is not shown.
141 if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
142 CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,
144 cudaHostAllocDefault));
145 pinnedMemAllocSize_ = pinnedMemSize_;
// The device index must be below the reported device count.
148 FAISS_ASSERT(device < getNumDevices());
// Require compute capability major version 3 or higher.
152 auto& prop = getDeviceProperties(device);
155 FAISS_ASSERT_FMT(prop.major >= 3,
156 "Device id %d with CC %d.%d not supported, "
157 "need 3.0+ compute capability",
158 device, prop.major, prop.minor);
// Default stream: use the stream registered in userDefaultStreams_ for this
// device if there is one; a non-blocking stream is created below (the branch
// line between listing lines 165 and 167 is not shown, presumably an else).
161 cudaStream_t defaultStream = 0;
162 auto it = userDefaultStreams_.find(device);
163 if (it != userDefaultStreams_.end()) {
165 defaultStream = it->second;
167 CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
168 cudaStreamNonBlocking));
171 defaultStreams_[device] = defaultStream;
// Async-copy stream: one new non-blocking stream per device.
173 cudaStream_t asyncCopyStream = 0;
174 CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
175 cudaStreamNonBlocking));
177 asyncCopyStreams_[device] = asyncCopyStream;
// Alternate streams: kNumStreams new non-blocking streams per device.
179 std::vector<cudaStream_t> deviceStreams;
180 for (
int j = 0; j < kNumStreams; ++j) {
181 cudaStream_t stream = 0;
182 CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
183 cudaStreamNonBlocking));
185 deviceStreams.push_back(stream);
188 alternateStreams_[device] = std::move(deviceStreams);
// cuBLAS handle: creation must report CUBLAS_STATUS_SUCCESS.
191 cublasHandle_t blasHandle = 0;
192 auto blasStatus = cublasCreate(&blasHandle);
193 FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
194 blasHandles_[device] = blasHandle;
// Temporary-memory size: either tempMemFraction_ of the total device memory
// reported by cudaMemGetInfo, or the fixed tempMemSize_. The declarations of
// devFree/devTotal/toAlloc and the condition choosing between the two
// assignments are not shown in this listing.
201 CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));
203 toAlloc = (size_t) (tempMemFraction_ * devTotal);
205 toAlloc = tempMemSize_;
// There must be no memory manager for this device yet. Create a
// StackDeviceMemory (constructor arguments on listing line 211 are not shown),
// apply the current cudaMalloc warning flag and store it in memory_.
208 FAISS_ASSERT(memory_.count(device) == 0);
210 auto mem = std::unique_ptr<StackDeviceMemory>(
212 mem->setCudaMallocWarning(cudaMallocWarning_);
214 memory_.emplace(device, std::move(mem));
// NOTE(review): the lines below are the surviving return statements (and two
// return-type lines) of the per-device accessors; their signature lines and
// any preceding statements are absent from this listing. Function names are
// inferred from the member each statement returns -- confirm against the real
// source file.

// Presumably getBlasHandle(int device): returns this device's cuBLAS handle.
220 return blasHandles_[device];
// Presumably getDefaultStream(int device): returns this device's default stream.
226 return defaultStreams_[device];
// Return type line, presumably of getAlternateStreams(int device); the vector
// is returned by value.
229 std::vector<cudaStream_t>
232 return alternateStreams_[device];
// Presumably getMemoryManager(int device): dereferences the pointer stored in
// memory_ for this device.
237 return *memory_[device];
// Return type line, presumably of getPinnedMemory(): the pinned buffer pointer
// paired with its recorded size.
240 std::pair<void*, size_t>
242 return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
// Presumably getAsyncCopyStream(int device): returns this device's async-copy
// stream.
248 return asyncCopyStreams_[device];
void setDefaultStream(int device, cudaStream_t stream)
Called to change the stream for work ordering.
cublasHandle_t getBlasHandle(int device) override
Returns the cuBLAS handle that we use for the given device.
void setTempMemoryFraction(float fraction)
void initializeForDevice(int device) override
Internal system calls.
cudaStream_t getAsyncCopyStream(int device) override
Returns the stream on which we perform async CPU <-> GPU copies.
DeviceMemory & getMemoryManager(int device) override
Returns the temporary memory manager for the given device.
void setCudaMallocWarning(bool b)
void setTempMemory(size_t size)
void setPinnedMemory(size_t size)
cudaStream_t getDefaultStream(int device) override
void setDefaultNullStreamAllDevices()
Manages temporary memory allocations on a GPU device.
std::pair< void *, size_t > getPinnedMemory() override
Returns the available CPU pinned memory buffer.
std::vector< cudaStream_t > getAlternateStreams(int device) override
Returns the set of alternative streams that we use for the given device.