11 #include "StandardGpuResources.h"
12 #include "../FaissAssert.h"
14 namespace faiss {
namespace gpu {
// Number of additional streams created per device; per-device initialization
// loops this many times and stores the results in alternateStreams_.
18 constexpr
int kNumStreams = 2;
// Default value for tempMemFraction_, which is later multiplied by a
// device's total memory to compute a temporary allocation size.
21 constexpr
float kDefaultTempMemFraction = 0.18f;
// Default value for pinnedMemSize_, in bytes (256 * 1024 * 1024 = 256 MiB).
// The size_t cast is applied to the first factor so the product is computed
// in size_t rather than int.
24 constexpr
size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
// Default constructor. Starts with no pinned host buffer (null pointer, zero
// recorded size) and seeds the temporary-memory fraction and the requested
// pinned-memory size from the file-level defaults. The visible initializer
// list makes no CUDA calls.
// NOTE(review): the embedded numbering jumps from 31 to 34, so additional
// member initializers may be missing from this extract -- confirm against
// the full file.
28 StandardGpuResources::StandardGpuResources() :
29 pinnedMemAlloc_(nullptr),
30 pinnedMemAllocSize_(0),
31 tempMemFraction_(kDefaultTempMemFraction),
34 pinnedMemSize_(kDefaultPinnedMemoryAllocation) {
// Destructor: releases the per-device streams and cuBLAS handles recorded in
// the member maps, then frees the pinned host buffer if one was allocated.
// Each loop constructs a DeviceScope keyed on the map entry's device id
// before touching that device's resources (presumably this makes it the
// current CUDA device for the scope -- DeviceScope is defined elsewhere).
37 StandardGpuResources::~StandardGpuResources() {
// Default streams: destroyed only when the device has no entry in
// userDefaultStreams_, i.e. the stream was not supplied by the caller.
38 for (
auto& entry : defaultStreams_) {
39 DeviceScope scope(entry.first);
41 auto it = userDefaultStreams_.find(entry.first);
42 if (it == userDefaultStreams_.end()) {
45 CUDA_VERIFY(cudaStreamDestroy(entry.second));
// Alternate streams: every stream in each device's list is destroyed.
49 for (
auto& entry : alternateStreams_) {
50 DeviceScope scope(entry.first);
52 for (
auto stream : entry.second) {
53 CUDA_VERIFY(cudaStreamDestroy(stream));
// Async copy streams: one per device, always destroyed.
57 for (
auto& entry : asyncCopyStreams_) {
58 DeviceScope scope(entry.first);
60 CUDA_VERIFY(cudaStreamDestroy(entry.second));
// cuBLAS handles: destroyed and the returned status asserted.
63 for (
auto& entry : blasHandles_) {
64 DeviceScope scope(entry.first);
66 auto blasStatus = cublasDestroy(entry.second);
67 FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
// Pinned host buffer: freed with cudaFreeHost only if the pointer is set.
70 if (pinnedMemAlloc_) {
71 CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
// NOTE(review): the enclosing signature is not visible in this extract;
// presumably this is the body of setTempMemoryFraction(float fraction).
// Asserts that the requested fraction lies in [0, 0.5], then stores it in
// tempMemFraction_.
88 FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);
90 tempMemFraction_ = fraction;
// NOTE(review): the enclosing signature is not visible in this extract;
// presumably this is the body of setPinnedMemory(size_t size).
// Asserts that no default stream has been recorded for any device and that
// no pinned buffer exists yet, then stores the requested size. The pinned
// buffer is allocated elsewhere in this file only while defaultStreams_ is
// still empty, so a size set after that point would not be picked up.
96 FAISS_ASSERT(defaultStreams_.size() == 0);
97 FAISS_ASSERT(!pinnedMemAlloc_);
99 pinnedMemSize_ = size;
// NOTE(review): the enclosing signature is not visible in this extract;
// presumably this is the body of setDefaultStream(int device,
// cudaStream_t stream).
// If a default stream is already recorded for this device, destroy it.
// NOTE(review): the visible code destroys whatever defaultStreams_ holds,
// and per-device initialization copies a caller-provided stream from
// userDefaultStreams_ into defaultStreams_. A second call for the same
// device could therefore destroy a stream the caller owns -- confirm against
// the full file (embedded lines 106 and 108 are missing here).
104 auto it = defaultStreams_.find(device);
105 if (it != defaultStreams_.end()) {
107 CUDA_VERIFY(cudaStreamDestroy(it->second));
// Record the caller's stream as the user-provided default for this device.
111 userDefaultStreams_[device] = stream;
// NOTE(review): the enclosing signature and the loop body are not visible in
// this extract; presumably this belongs to setDefaultNullStreamAllDevices().
// Visits every device index from 0 up to (but excluding) getNumDevices().
116 for (
int dev = 0; dev < getNumDevices(); ++dev) {
// NOTE(review): the enclosing signature is not visible in this extract;
// presumably this is the body of initializeForDevice(int device). Several
// lines (closing braces, else branches, some declarations) are missing, so
// the comments below describe only the statements that are shown.
//
// Check whether this device already has a default stream recorded; the
// branch body is not visible (presumably an early return -- confirm).
125 if (defaultStreams_.count(device) != 0) {
// While no device has been set up yet and a non-zero pinned size was
// requested, allocate the pinned host buffer with cudaHostAlloc and record
// its size.
131 if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
132 CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,
134 cudaHostAllocDefault));
135 pinnedMemAllocSize_ = pinnedMemSize_;
// Only the upper bound of the device index is asserted here.
138 FAISS_ASSERT(device < getNumDevices());
// Require compute capability 3.0 or newer on this device.
142 auto& prop = getDeviceProperties(device);
145 FAISS_ASSERT_FMT(prop.major >= 3,
146 "Device id %d with CC %d.%d not supported, "
147 "need 3.0+ compute capability",
148 device, prop.major, prop.minor);
// Default stream: take the caller-provided stream when one is registered
// for this device; a non-blocking stream is created as well (presumably in
// the else branch, which is not visible here -- confirm).
151 cudaStream_t defaultStream = 0;
152 auto it = userDefaultStreams_.find(device);
153 if (it != userDefaultStreams_.end()) {
155 defaultStream = it->second;
157 CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
158 cudaStreamNonBlocking));
161 defaultStreams_[device] = defaultStream;
// Dedicated non-blocking stream stored in asyncCopyStreams_.
163 cudaStream_t asyncCopyStream = 0;
164 CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
165 cudaStreamNonBlocking));
167 asyncCopyStreams_[device] = asyncCopyStream;
// Create kNumStreams non-blocking streams and store them as this device's
// alternate streams.
169 std::vector<cudaStream_t> deviceStreams;
170 for (
int j = 0; j < kNumStreams; ++j) {
171 cudaStream_t stream = 0;
172 CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
173 cudaStreamNonBlocking));
175 deviceStreams.push_back(stream);
178 alternateStreams_[device] = std::move(deviceStreams);
// Create the cuBLAS handle for this device and assert that creation
// succeeded.
181 cublasHandle_t blasHandle = 0;
182 auto blasStatus = cublasCreate(&blasHandle);
183 FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
184 blasHandles_[device] = blasHandle;
// Size the temporary allocation: either a fraction of the device's total
// memory or the explicit tempMemSize_. The condition selecting between the
// two assignments is not visible in this extract.
191 CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));
193 toAlloc = (size_t) (tempMemFraction_ * devTotal);
195 toAlloc = tempMemSize_;
// Assert that no memory manager is registered for this device yet, then
// insert a new StackDeviceMemory (constructor arguments not visible).
198 FAISS_ASSERT(memory_.count(device) == 0);
199 memory_.emplace(device,
200 std::unique_ptr<StackDeviceMemory>(
// NOTE(review): the lines below are the visible tails of several accessor
// methods whose signatures are missing from this extract. Names are
// inferred from the members returned -- confirm against the full file.
// NOTE(review): every lookup uses operator[]; if these members are standard
// associative containers, an unknown device id would insert a default entry
// rather than fail. Presumably each accessor initializes the device first
// (that call is not visible here).
//
// Presumably getBlasHandle(int device): the cuBLAS handle for the device.
207 return blasHandles_[device];
// Presumably getDefaultStream(int device): the default stream for the device.
213 return defaultStreams_[device];
// Presumably getAlternateStreams(int device): returns the device's list of
// alternate streams by value.
216 std::vector<cudaStream_t>
219 return alternateStreams_[device];
// Presumably getMemoryManager(int device): dereferences the stored pointer
// for the device's memory manager.
224 return *memory_[device];
// Presumably getPinnedMemory(): the pinned host buffer pointer paired with
// its recorded size (nullptr and 0 when nothing has been allocated).
227 std::pair<void*, size_t>
229 return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
// Presumably getAsyncCopyStream(int device): the async copy stream for the
// device.
235 return asyncCopyStreams_[device];
void setDefaultStream(int device, cudaStream_t stream)
Called to change the stream for work ordering.
cublasHandle_t getBlasHandle(int device) override
Returns the cuBLAS handle that we use for the given device.
void setTempMemoryFraction(float fraction)
void initializeForDevice(int device) override
Internal system calls.
cudaStream_t getAsyncCopyStream(int device) override
Returns the stream on which we perform async CPU <-> GPU copies.
DeviceMemory & getMemoryManager(int device) override
Returns the temporary memory manager for the given device.
void setTempMemory(size_t size)
void setPinnedMemory(size_t size)
cudaStream_t getDefaultStream(int device) override
void setDefaultNullStreamAllDevices()
Manages temporary memory allocations on a GPU device.
std::pair< void *, size_t > getPinnedMemory() override
Returns the available CPU pinned memory buffer.
std::vector< cudaStream_t > getAlternateStreams(int device) override
Returns the set of alternative streams that we use for the given device.