12 #include "StandardGpuResources.h"
13 #include "../FaissAssert.h"
15 namespace faiss {
namespace gpu {
// Number of alternate streams created for each device; used as the loop bound
// where the per-device alternate stream vector is filled in this file.
19 constexpr
int kNumStreams = 2;
// Default value for tempMemFraction_. Where that member is used in this file it
// is multiplied by the device's total memory to obtain an allocation size, so
// 0.18f corresponds to 18% of total device memory.
22 constexpr
float kDefaultTempMemFraction = 0.18f;
// Default value for pinnedMemSize_, in bytes: 256 * 1024 * 1024 (256 MiB).
// The cast applies to the first operand, so the multiplication is carried out
// in size_t rather than int.
25 constexpr
size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
// Default constructor. Starts with no pinned host allocation (null pointer,
// size 0) and seeds the temporary-memory fraction and the requested pinned
// memory size from the file-level defaults.
// NOTE(review): this excerpt skips some original lines of the initializer list
// and the constructor body, so other members may be initialized here as well.
29 StandardGpuResources::StandardGpuResources() :
30 pinnedMemAlloc_(nullptr),
31 pinnedMemAllocSize_(0),
32 tempMemFraction_(kDefaultTempMemFraction),
35 pinnedMemSize_(kDefaultPinnedMemoryAllocation) {
// Destructor. Walks each per-device map and destroys the CUDA streams and
// cuBLAS handles stored there, then frees the pinned host allocation if one
// was made. Every loop constructs a DeviceScope from the map key (the device
// id) before destroying that device's objects.
// NOTE(review): DeviceScope presumably makes that device current for the
// duration of the scope -- confirm against its definition.
// NOTE(review): closing braces are not visible in this excerpt.
38 StandardGpuResources::~StandardGpuResources() {
// One default stream per device.
39 for (
auto& entry : defaultStreams_) {
40 DeviceScope scope(entry.first);
42 CUDA_VERIFY(cudaStreamDestroy(entry.second));
// A vector of alternate streams per device; each stream is destroyed in turn.
45 for (
auto& entry : alternateStreams_) {
46 DeviceScope scope(entry.first);
48 for (
auto stream : entry.second) {
49 CUDA_VERIFY(cudaStreamDestroy(stream));
// One asynchronous-copy stream per device.
53 for (
auto& entry : asyncCopyStreams_) {
54 DeviceScope scope(entry.first);
56 CUDA_VERIFY(cudaStreamDestroy(entry.second));
// One cuBLAS handle per device; destruction must report success.
59 for (
auto& entry : blasHandles_) {
60 DeviceScope scope(entry.first);
62 auto blasStatus = cublasDestroy(entry.second);
63 FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
// The pinned host buffer is freed only if it was actually allocated.
66 if (pinnedMemAlloc_) {
67 CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
// NOTE(review): the enclosing function signature is not visible in this
// excerpt; presumably this is the body of setTempMemoryFraction(float fraction).
// The fraction must lie in the closed range [0.0, 0.5]; it is then stored in
// tempMemFraction_.
84 FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);
86 tempMemFraction_ = fraction;
// NOTE(review): the enclosing function signature is not visible in this
// excerpt; presumably this is the body of setPinnedMemory(size_t size).
// Asserts that no device has a default stream recorded yet and that no pinned
// buffer has been allocated, then records the requested size. The size is only
// stored here; no allocation happens in these lines.
92 FAISS_ASSERT(defaultStreams_.size() == 0);
93 FAISS_ASSERT(!pinnedMemAlloc_);
95 pinnedMemSize_ = size;
// NOTE(review): the enclosing function signature is not visible in this
// excerpt; presumably this is the body of initializeForDevice(int device).
// Several original lines (branch bodies, closing braces, some call arguments)
// are also absent, so the comments below describe only what is shown.
//
// Checks whether this device already has a default stream recorded. The body
// of the branch is not visible; presumably it is an early exit -- confirm.
102 if (defaultStreams_.count(device) != 0) {
// Pinned host memory is allocated only when no device has a default stream
// recorded yet and a non-zero size was requested. The size argument to
// cudaHostAlloc is on a line missing from this excerpt.
108 if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
109 CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,
111 cudaHostAllocDefault));
112 pinnedMemAllocSize_ = pinnedMemSize_;
// Upper bound check on the device id only; a lower bound is not checked here.
115 FAISS_ASSERT(device < getNumDevices());
119 auto& prop = getDeviceProperties(device);
// Requires compute capability 3.5 or newer. The trailing `!"..."` operand is
// always false (a string literal is a non-null pointer), so it does not change
// the result; it only places the message text inside the asserted expression.
122 FAISS_ASSERT(prop.major > 3 || (prop.major == 3 && prop.minor >= 5) ||
123 !
"Device not supported, need 3.5+ compute capability");
// Create the device's default stream with the non-blocking flag and record it.
126 cudaStream_t defaultStream = 0;
127 CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
128 cudaStreamNonBlocking));
130 defaultStreams_[device] = defaultStream;
// Create the device's asynchronous-copy stream the same way and record it.
132 cudaStream_t asyncCopyStream = 0;
133 CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
134 cudaStreamNonBlocking));
136 asyncCopyStreams_[device] = asyncCopyStream;
// Create kNumStreams non-blocking alternate streams and move the vector into
// the per-device map.
138 std::vector<cudaStream_t> deviceStreams;
139 for (
int j = 0; j < kNumStreams; ++j) {
140 cudaStream_t stream = 0;
141 CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
142 cudaStreamNonBlocking));
144 deviceStreams.push_back(stream);
147 alternateStreams_[device] = std::move(deviceStreams);
// Create the cuBLAS handle for this device; creation must report success.
150 cublasHandle_t blasHandle = 0;
151 auto blasStatus = cublasCreate(&blasHandle);
152 FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
153 blasHandles_[device] = blasHandle;
// Query free and total device memory, then choose the temporary allocation
// size: either a fraction of total memory or the fixed tempMemSize_. The
// condition selecting between the two assignments is not visible here.
160 CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));
162 toAlloc = (size_t) (tempMemFraction_ * devTotal);
164 toAlloc = tempMemSize_;
// A memory manager must not already exist for this device. A StackDeviceMemory
// owned by unique_ptr is then inserted; its constructor arguments are on lines
// missing from this excerpt.
167 FAISS_ASSERT(memory_.count(device) == 0);
168 memory_.emplace(device,
169 std::unique_ptr<StackDeviceMemory>(
// Returns the cuBLAS handle stored for `device` in blasHandles_.
// NOTE(review): the return-type line and at least one line between the
// signature and the return are missing from this excerpt.
// NOTE(review): the lookup uses operator[]; if blasHandles_ is a standard map
// type this inserts a default entry for an unknown device -- confirm.
174 StandardGpuResources::getBlasHandle(
int device) {
176 return blasHandles_[device];
// Returns the default stream stored for `device` in defaultStreams_.
// NOTE(review): the return-type line and at least one line between the
// signature and the return are missing from this excerpt.
180 StandardGpuResources::getDefaultStream(
int device) {
182 return defaultStreams_[device];
// Returns the alternate streams stored for `device`. The return type is a
// vector by value, so the caller receives a copy of the stored vector.
// NOTE(review): at least one line between the signature and the return is
// missing from this excerpt.
185 std::vector<cudaStream_t>
186 StandardGpuResources::getAlternateStreams(
int device) {
188 return alternateStreams_[device];
// Returns a reference to the memory manager stored for `device`, obtained by
// dereferencing the entry in memory_.
// NOTE(review): the entry is dereferenced without a visible null check; at
// least one line before the return is missing from this excerpt, so a prior
// initialization step may exist -- confirm.
191 DeviceMemory& StandardGpuResources::getMemoryManager(
int device) {
193 return *memory_[device];
// Returns the pinned host buffer as a (pointer, size-in-bytes) pair. The
// constructor sets these members to nullptr and 0, so that is what is returned
// if no allocation has been made.
196 std::pair<void*, size_t>
197 StandardGpuResources::getPinnedMemory() {
198 return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
// Returns the asynchronous-copy stream stored for `device` in
// asyncCopyStreams_.
// NOTE(review): the return-type line and at least one line between the
// signature and the return are missing from this excerpt.
202 StandardGpuResources::getAsyncCopyStream(
int device) {
204 return asyncCopyStreams_[device];
void setTempMemoryFraction(float fraction)
void initializeForDevice(int device) override
Internal system calls.
void setTempMemory(size_t size)
void setPinnedMemory(size_t size)