9 #include "StandardGpuResources.h"
10 #include "utils/MemorySpace.h"
11 #include "../FaissAssert.h"
14 namespace faiss {
namespace gpu {
// Number of alternate streams allocated per device, used to expose
// concurrency between kernels
constexpr int kNumStreams = 2;

// Amount of pinned host memory allocated by default for async CPU <-> GPU
// copies (256 MiB)
constexpr size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;

// Default temporary memory allocation for GPUs with <= 4 GiB of memory (512 MiB)
constexpr size_t k4GiBTempMem = (size_t) 512 * 1024 * 1024;

// Default temporary memory allocation for GPUs with <= 8 GiB of memory (1 GiB)
constexpr size_t k8GiBTempMem = (size_t) 1024 * 1024 * 1024;

// Maximum default temporary memory allocation for any GPU (1.5 GiB)
constexpr size_t kMaxTempMem = (size_t) 1536 * 1024 * 1024;
StandardGpuResources::StandardGpuResources() :
    pinnedMemAlloc_(nullptr),
    pinnedMemAllocSize_(0),
    // let getDefaultTempMemForGPU pick the default by passing in a huge
    // requested value, which is then clamped to the per-GPU limit
    tempMemSize_(getDefaultTempMemForGPU(-1,
                                         std::numeric_limits<size_t>::max())),
    pinnedMemSize_(kDefaultPinnedMemoryAllocation),
    cudaMallocWarning_(true) {
}
StandardGpuResources::~StandardGpuResources() {
  for (auto& entry : defaultStreams_) {
    DeviceScope scope(entry.first);

    // We only destroy streams that we created ourselves; streams registered
    // by the user via setDefaultStream are owned by the caller
    auto it = userDefaultStreams_.find(entry.first);
    if (it == userDefaultStreams_.end()) {
      CUDA_VERIFY(cudaStreamDestroy(entry.second));
    }
  }

  for (auto& entry : alternateStreams_) {
    DeviceScope scope(entry.first);

    for (auto stream : entry.second) {
      CUDA_VERIFY(cudaStreamDestroy(stream));
    }
  }

  for (auto& entry : asyncCopyStreams_) {
    DeviceScope scope(entry.first);

    CUDA_VERIFY(cudaStreamDestroy(entry.second));
  }

  for (auto& entry : blasHandles_) {
    DeviceScope scope(entry.first);

    auto blasStatus = cublasDestroy(entry.second);
    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
  }

  if (pinnedMemAlloc_) {
    freeMemorySpace(MemorySpace::HostPinned, pinnedMemAlloc_);
  }
}
size_t
StandardGpuResources::getDefaultTempMemForGPU(int device,
                                              size_t requested) {
  auto totalMem = device != -1 ?
    getDeviceProperties(device).totalGlobalMem :
    std::numeric_limits<size_t>::max();

  if (totalMem <= (size_t) 4 * 1024 * 1024 * 1024) {
    // If the GPU has <= 4 GiB of memory, cap temporary memory at 512 MiB
    if (requested > k4GiBTempMem) {
      return k4GiBTempMem;
    }
  } else if (totalMem <= (size_t) 8 * 1024 * 1024 * 1024) {
    // If the GPU has <= 8 GiB of memory, cap temporary memory at 1 GiB
    if (requested > k8GiBTempMem) {
      return k8GiBTempMem;
    }
  } else {
    // Larger GPUs never reserve more than 1.5 GiB by default
    if (requested > kMaxTempMem) {
      return kMaxTempMem;
    }
  }

  // Otherwise, use exactly what the caller requested
  return requested;
}
void
StandardGpuResources::setTempMemory(size_t size) {
  if (tempMemSize_ != size) {
    // adjust based on the general per-GPU limits
    tempMemSize_ = getDefaultTempMemForGPU(-1, size);

    // Re-create the temporary memory stack for every device that has already
    // been initialized
    for (auto& p : memory_) {
      int device = p.first;

      // Free the existing allocation before making a new one
      p.second.reset();
      p.second = std::unique_ptr<StackDeviceMemory>(
        new StackDeviceMemory(device,
                              // adjust for this specific device
                              getDefaultTempMemForGPU(device, tempMemSize_)));
    }
  }
}
void
StandardGpuResources::setPinnedMemory(size_t size) {
  // Should not be called after any device has been initialized
  FAISS_ASSERT(defaultStreams_.size() == 0);
  FAISS_ASSERT(!pinnedMemAlloc_);

  pinnedMemSize_ = size;
}
void
StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
  auto it = defaultStreams_.find(device);
  if (it != defaultStreams_.end()) {
    // Replace the stream we created with the user-provided stream
    CUDA_VERIFY(cudaStreamDestroy(it->second));
    it->second = stream;
  }

  userDefaultStreams_[device] = stream;
}
void
StandardGpuResources::setDefaultNullStreamAllDevices() {
  for (int dev = 0; dev < getNumDevices(); ++dev) {
    setDefaultStream(dev, nullptr);
  }
}
void
StandardGpuResources::setCudaMallocWarning(bool b) {
  cudaMallocWarning_ = b;

  for (auto& v : memory_) {
    v.second->setCudaMallocWarning(b);
  }
}
bool
StandardGpuResources::isInitialized(int device) const {
  // The default streams serve as a marker for whether this device has been
  // initialized
  return defaultStreams_.count(device) != 0;
}
void
StandardGpuResources::initializeForDevice(int device) {
  if (isInitialized(device)) {
    return;
  }

  // If this is the first device being initialized, create our pinned memory
  // allocation for async CPU <-> GPU copies
  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
    allocMemorySpace(MemorySpace::HostPinned, &pinnedMemAlloc_, pinnedMemSize_);
    pinnedMemAllocSize_ = pinnedMemSize_;
  }

  FAISS_ASSERT(device < getNumDevices());
  DeviceScope scope(device);

  // Make sure that device properties for this device are cached
  auto& prop = getDeviceProperties(device);

  // Check that we meet our minimum compute capability
  FAISS_ASSERT_FMT(prop.major >= 3,
                   "Device id %d with CC %d.%d not supported, "
                   "need 3.0+ compute capability",
                   device, prop.major, prop.minor);

  // Create the default stream for work ordering, unless the user has already
  // provided one via setDefaultStream
  cudaStream_t defaultStream = 0;
  auto it = userDefaultStreams_.find(device);
  if (it != userDefaultStreams_.end()) {
    defaultStream = it->second;
  } else {
    CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
                                          cudaStreamNonBlocking));
  }

  defaultStreams_[device] = defaultStream;

  // Stream on which async CPU <-> GPU copies are performed
  cudaStream_t asyncCopyStream = 0;
  CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
                                        cudaStreamNonBlocking));

  asyncCopyStreams_[device] = asyncCopyStream;

  // Alternate streams for concurrent work
  std::vector<cudaStream_t> deviceStreams;
  for (int j = 0; j < kNumStreams; ++j) {
    cudaStream_t stream = 0;
    CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
                                          cudaStreamNonBlocking));

    deviceStreams.push_back(stream);
  }

  alternateStreams_[device] = std::move(deviceStreams);

  // cuBLAS handle for this device
  cublasHandle_t blasHandle = 0;
  auto blasStatus = cublasCreate(&blasHandle);
  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
  blasHandles_[device] = blasHandle;

  FAISS_ASSERT(memory_.count(device) == 0);

  // Temporary memory stack for this device
  auto mem = std::unique_ptr<StackDeviceMemory>(
    new StackDeviceMemory(device,
                          // adjust for this specific device
                          getDefaultTempMemForGPU(device, tempMemSize_)));
  mem->setCudaMallocWarning(cudaMallocWarning_);

  memory_.emplace(device, std::move(mem));
}
cublasHandle_t
StandardGpuResources::getBlasHandle(int device) {
  initializeForDevice(device);
  return blasHandles_[device];
}

cudaStream_t
StandardGpuResources::getDefaultStream(int device) {
  initializeForDevice(device);
  return defaultStreams_[device];
}

std::vector<cudaStream_t>
StandardGpuResources::getAlternateStreams(int device) {
  initializeForDevice(device);
  return alternateStreams_[device];
}

DeviceMemory&
StandardGpuResources::getMemoryManager(int device) {
  initializeForDevice(device);
  return *memory_[device];
}

std::pair<void*, size_t>
StandardGpuResources::getPinnedMemory() {
  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
}

cudaStream_t
StandardGpuResources::getAsyncCopyStream(int device) {
  initializeForDevice(device);
  return asyncCopyStreams_[device];
}

} } // namespace faiss::gpu
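As the destructor above shows, streams registered through setDefaultStream are
owned by the caller and are never destroyed by StandardGpuResources. A minimal
sketch of supplying such a stream, assuming device 0 and omitting error
handling:

#include <cuda_runtime.h>
#include <faiss/gpu/StandardGpuResources.h>  // include path depends on install layout

int main() {
  cudaStream_t myStream = nullptr;
  cudaStreamCreate(&myStream);               // stream owned by the caller

  {
    faiss::gpu::StandardGpuResources res;
    // Order all faiss work on device 0 on our stream rather than one that
    // StandardGpuResources would otherwise create for itself
    res.setDefaultStream(0, myStream);

    // ... build and query GPU indexes with &res here ...
  } // res is destroyed here; myStream is untouched because we provided it

  cudaStreamDestroy(myStream);               // caller cleans up its own stream
  return 0;
}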
The public interface of StandardGpuResources, as declared in StandardGpuResources.h:

void setDefaultStream(int device, cudaStream_t stream)
    Called to change the stream for work ordering on the given device.

cublasHandle_t getBlasHandle(int device) override
    Returns the cuBLAS handle that we use for the given device.

void initializeForDevice(int device) override
    Pre-allocates streams, the cuBLAS handle, and temporary memory for a
    device; the accessors below call it lazily on first use.

cudaStream_t getAsyncCopyStream(int device) override
    Returns the stream on which we perform async CPU <-> GPU copies.

DeviceMemory& getMemoryManager(int device) override
    Returns the temporary memory manager for the given device; a DeviceMemory
    manages temporary memory allocations on a GPU device.

void setCudaMallocWarning(bool b)
    Enables or disables the warning printed when temporary memory is exhausted
    and cudaMalloc must be called at the point of use.

void setTempMemory(size_t size)
    Requests a fixed amount of temporary memory on all devices, subject to the
    per-GPU caps applied by getDefaultTempMemForGPU.

void setPinnedMemory(size_t size)
    Sets the amount of pinned host memory to allocate for async CPU <-> GPU
    transfers; must be called before any device is initialized.

cudaStream_t getDefaultStream(int device) override
    Returns the stream on which all work for the given device is ordered.

void setDefaultNullStreamAllDevices()
    Changes the work-ordering stream to the null stream for all devices.

std::pair<void*, size_t> getPinnedMemory() override
    Returns the available CPU pinned memory buffer.

std::vector<cudaStream_t> getAlternateStreams(int device) override
    Returns the set of alternative streams that we use for the given device.
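A minimal end-to-end sketch of using this interface, assuming a GPU build of
faiss with the <faiss/gpu/...> header layout; the dimensions and data below are
made up for illustration:

#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h>

#include <vector>

int main() {
  faiss::gpu::StandardGpuResources res;

  // Optional tuning, done before any device is initialized: cap temporary
  // memory at 256 MiB and skip the pinned-memory allocation entirely
  res.setTempMemory((size_t) 256 * 1024 * 1024);
  res.setPinnedMemory(0);

  int d = 64;                                  // vector dimension (illustrative)
  faiss::gpu::GpuIndexFlatL2 index(&res, d);   // resources for device 0 are set up here

  std::vector<float> xb(1000 * d, 0.5f);       // made-up database vectors
  index.add(1000, xb.data());

  std::vector<float> distances(5);
  std::vector<faiss::Index::idx_t> labels(5);
  index.search(1, xb.data(), 5, distances.data(), labels.data());

  return 0;
}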