12 #include "StandardGpuResources.h"
13 #include "../FaissAssert.h"
15 namespace faiss {
namespace gpu {
/// Number of alternate streams created for each device that gets initialized.
constexpr int kNumStreams = 2;

/// Default fraction (18%) of a device's total memory used when the temporary
/// allocation size is derived from a fraction rather than a fixed size.
constexpr float kDefaultTempMemFraction = 0.18f;

/// Default pinned-memory size in bytes (256 MiB). The first factor is widened
/// to size_t before multiplying so the product is computed in size_t, not int.
constexpr size_t kDefaultPinnedMemoryAllocation =
    static_cast<size_t>(256) * 1024 * 1024;
// Default constructor. Starts with no pinned host buffer (null pointer and a
// recorded size of zero) and seeds the temporary-memory fraction and the
// requested pinned-memory size from the file-level defaults.
// NOTE(review): the initializer list visible here may not be exhaustive --
// confirm against the member list in the class declaration.
29 StandardGpuResources::StandardGpuResources() :
30 pinnedMemAlloc_(nullptr),
31 pinnedMemAllocSize_(0),
32 tempMemFraction_(kDefaultTempMemFraction),
35 pinnedMemSize_(kDefaultPinnedMemoryAllocation) {
38 StandardGpuResources::~StandardGpuResources() {
39 for (
auto& entry : defaultStreams_) {
40 DeviceScope scope(entry.first);
42 CUDA_VERIFY(cudaStreamDestroy(entry.second));
45 for (
auto& entry : alternateStreams_) {
46 DeviceScope scope(entry.first);
48 for (
auto stream : entry.second) {
49 CUDA_VERIFY(cudaStreamDestroy(stream));
53 for (
auto& entry : asyncCopyStreams_) {
54 DeviceScope scope(entry.first);
56 CUDA_VERIFY(cudaStreamDestroy(entry.second));
59 for (
auto& entry : blasHandles_) {
60 DeviceScope scope(entry.first);
62 auto blasStatus = cublasDestroy(entry.second);
63 FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
66 if (pinnedMemAlloc_) {
67 CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
// Reject fractions outside the closed range [0, 0.5] before storing.
84 FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);
// Record the fraction; it is later multiplied by the device's total memory
// when the temporary allocation size is computed from a fraction.
86 tempMemFraction_ = fraction;
// Preconditions: no default stream has been recorded for any device yet and
// no pinned buffer has been allocated.
92 FAISS_ASSERT(defaultStreams_.size() == 0);
93 FAISS_ASSERT(!pinnedMemAlloc_);
// Only records the requested size; this assignment allocates nothing.
95 pinnedMemSize_ = size;
// Tests whether a default stream is already recorded for this device.
// NOTE(review): the body of this branch is not visible in this excerpt --
// presumably an early return for an already-initialized device; confirm.
102 if (defaultStreams_.count(device) != 0) {
// Pinned host memory is allocated only while no device has a default stream
// yet and a non-zero pinned size has been requested.
108 if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
109 CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,
111 cudaHostAllocDefault));
// Remember how much was requested so getPinnedMemory() can report it.
112 pinnedMemAllocSize_ = pinnedMemSize_;
// The device index must be below the value returned by getNumDevices().
115 FAISS_ASSERT(device < getNumDevices());
119 auto& prop = getDeviceProperties(device);
// Requires major.minor >= 3.5 (presumably the CUDA compute capability --
// confirm against the type returned by getDeviceProperties).
122 FAISS_ASSERT(prop.major > 3 || (prop.major == 3 && prop.minor >= 5));
// Create the device's default stream with the non-blocking flag and record it.
125 cudaStream_t defaultStream = 0;
126 CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
127 cudaStreamNonBlocking));
129 defaultStreams_[device] = defaultStream;
// Create and record a separate non-blocking stream for async copies.
131 cudaStream_t asyncCopyStream = 0;
132 CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
133 cudaStreamNonBlocking));
135 asyncCopyStreams_[device] = asyncCopyStream;
// Create kNumStreams further non-blocking streams and store them together as
// this device's alternate streams.
137 std::vector<cudaStream_t> deviceStreams;
138 for (
int j = 0; j < kNumStreams; ++j) {
139 cudaStream_t stream = 0;
140 CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
141 cudaStreamNonBlocking));
143 deviceStreams.push_back(stream);
// The local vector is moved into the map rather than copied.
146 alternateStreams_[device] = std::move(deviceStreams);
// Create the cuBLAS handle for this device; creation must succeed.
149 cublasHandle_t blasHandle = 0;
150 auto blasStatus = cublasCreate(&blasHandle);
151 FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
152 blasHandles_[device] = blasHandle;
// Query free and total device memory; only the total is used below.
159 CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));
// toAlloc is either a fraction of total device memory or the fixed
// tempMemSize_. NOTE(review): the condition choosing between these two
// assignments is not visible in this excerpt.
161 toAlloc = (size_t) (tempMemFraction_ * devTotal);
163 toAlloc = tempMemSize_;
// A device must not already have a memory manager registered.
166 FAISS_ASSERT(memory_.count(device) == 0);
// Register a StackDeviceMemory for the device (constructor arguments are not
// visible in this excerpt).
167 memory_.emplace(device,
168 std::unique_ptr<StackDeviceMemory>(
// Returns the cuBLAS handle stored in blasHandles_ for `device`.
// NOTE(review): the lookup uses operator[]; if blasHandles_ is a std::map-like
// container, an unseen device would get a default-inserted entry instead of
// an error -- confirm callers initialize the device first.
173 StandardGpuResources::getBlasHandle(
int device) {
175 return blasHandles_[device];
// Returns the default stream stored in defaultStreams_ for `device`.
// NOTE(review): the lookup uses operator[]; if defaultStreams_ is a
// std::map-like container, an unseen device would get a default-inserted
// entry -- confirm callers initialize the device first.
179 StandardGpuResources::getDefaultStream(
int device) {
181 return defaultStreams_[device];
184 std::vector<cudaStream_t>
185 StandardGpuResources::getAlternateStreams(
int device) {
187 return alternateStreams_[device];
190 DeviceMemory& StandardGpuResources::getMemoryManager(
int device) {
192 return *memory_[device];
195 std::pair<void*, size_t>
196 StandardGpuResources::getPinnedMemory() {
197 return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
// Returns the async-copy stream stored in asyncCopyStreams_ for `device`.
// NOTE(review): the lookup uses operator[]; if asyncCopyStreams_ is a
// std::map-like container, an unseen device would get a default-inserted
// entry -- confirm callers initialize the device first.
201 StandardGpuResources::getAsyncCopyStream(
int device) {
203 return asyncCopyStreams_[device];
void setTempMemoryFraction(float fraction)
void initializeForDevice(int device) override
Internal system calls.
void setTempMemory(size_t size)
void setPinnedMemory(size_t size)