11 #include "StandardGpuResources.h"
12 #include "../FaissAssert.h"
14 namespace faiss {
namespace gpu {
// Number of additional streams created per device: the device-initialization
// code loops this many times, creating one stream per iteration, and stores
// the resulting vector in alternateStreams_[device].
18 constexpr
int kNumStreams = 2;
// Default value of tempMemFraction_ (assigned in the constructor). During
// device initialization that fraction is multiplied by the device's total
// memory to obtain a temporary-memory size, so 0.18f corresponds to 18% of
// the total.
21 constexpr
float kDefaultTempMemFraction = 0.18f;
// Default value of pinnedMemSize_ (assigned in the constructor):
// 256 * 1024 * 1024, i.e. 256 MiB assuming the unit is bytes
// (NOTE(review): confirm against the cudaHostAlloc call that consumes it).
// The size_t cast on the first operand makes the whole product be evaluated
// in size_t rather than int.
24 constexpr
size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
// Constructs a resources object with nothing allocated yet.
28 StandardGpuResources::StandardGpuResources() :
// No pinned host buffer exists at construction: null pointer, recorded size 0.
29 pinnedMemAlloc_(nullptr),
30 pinnedMemAllocSize_(0),
// Temporary-memory fraction starts at the file-level default.
31 tempMemFraction_(kDefaultTempMemFraction),
// Requested pinned-buffer size starts at the file-level default; the buffer
// itself is only allocated later, during device initialization.
34 pinnedMemSize_(kDefaultPinnedMemoryAllocation) {
// Destroys every per-device resource held in the member maps and frees the
// pinned host buffer. Each map is keyed by device id (entry.first); a
// DeviceScope is constructed with that id before the resource is destroyed
// (presumably making that device current for the scope - confirm against
// DeviceScope's definition).
37 StandardGpuResources::~StandardGpuResources() {
// Default stream of each initialized device.
38 for (
auto& entry : defaultStreams_) {
39 DeviceScope scope(entry.first);
41 CUDA_VERIFY(cudaStreamDestroy(entry.second));
// Alternate streams: each map value is a collection of streams, all of which
// are destroyed.
44 for (
auto& entry : alternateStreams_) {
45 DeviceScope scope(entry.first);
47 for (
auto stream : entry.second) {
48 CUDA_VERIFY(cudaStreamDestroy(stream));
// Async-copy stream of each initialized device.
52 for (
auto& entry : asyncCopyStreams_) {
53 DeviceScope scope(entry.first);
55 CUDA_VERIFY(cudaStreamDestroy(entry.second));
// cuBLAS handle of each initialized device; destruction must report success.
58 for (
auto& entry : blasHandles_) {
59 DeviceScope scope(entry.first);
61 auto blasStatus = cublasDestroy(entry.second);
62 FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
// The pinned host buffer is freed only if it was actually allocated.
65 if (pinnedMemAlloc_) {
66 CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
// Only fractions in the closed range [0.0, 0.5] are accepted; anything else
// trips the assertion.
83 FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);
// Record the new fraction; it is read when a device is initialized.
85 tempMemFraction_ = fraction;
// The requested pinned size may only change while no device has a default
// stream recorded (defaultStreams_ is populated during device
// initialization) and while no pinned buffer has been allocated.
91 FAISS_ASSERT(defaultStreams_.size() == 0);
92 FAISS_ASSERT(!pinnedMemAlloc_);
// Record the request; the allocation itself happens during device
// initialization.
94 pinnedMemSize_ = size;
// Per-device setup: creates the streams, cuBLAS handle and memory manager for
// `device` and records them in the member maps.
// A default stream already recorded for this device means it has been set up
// before (NOTE(review): the branch body is not visible here; presumably an
// early return - confirm).
101 if (defaultStreams_.count(device) != 0) {
// Before any device has been recorded, and only if a non-zero pinned size was
// requested, allocate the pinned host buffer and remember its size.
107 if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
108 CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,
110 cudaHostAllocDefault));
111 pinnedMemAllocSize_ = pinnedMemSize_;
// The device id must be below the number of devices reported by
// getNumDevices().
114 FAISS_ASSERT(device < getNumDevices());
118 auto& prop = getDeviceProperties(device);
// Devices with a compute-capability major version below 3 are rejected.
121 FAISS_ASSERT_FMT(prop.major >= 3,
122 "Device id %d with CC %d.%d not supported, "
123 "need 3.0+ compute capability",
124 device, prop.major, prop.minor);
// Default stream for this device, created with the cudaStreamNonBlocking flag.
127 cudaStream_t defaultStream = 0;
128 CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
129 cudaStreamNonBlocking));
131 defaultStreams_[device] = defaultStream;
// Separate stream recorded in asyncCopyStreams_, created with the same flag.
133 cudaStream_t asyncCopyStream = 0;
134 CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
135 cudaStreamNonBlocking));
137 asyncCopyStreams_[device] = asyncCopyStream;
// kNumStreams alternate streams, collected in a vector that is then moved
// into alternateStreams_[device].
139 std::vector<cudaStream_t> deviceStreams;
140 for (
int j = 0; j < kNumStreams; ++j) {
141 cudaStream_t stream = 0;
142 CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
143 cudaStreamNonBlocking));
145 deviceStreams.push_back(stream);
148 alternateStreams_[device] = std::move(deviceStreams);
// cuBLAS handle for this device; creation must report success.
151 cublasHandle_t blasHandle = 0;
152 auto blasStatus = cublasCreate(&blasHandle);
153 FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
154 blasHandles_[device] = blasHandle;
// Query free and total device memory. The size to allocate is either
// tempMemFraction_ of the total or the explicit tempMemSize_
// (NOTE(review): the condition choosing between the two is not visible here).
161 CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));
163 toAlloc = (size_t) (tempMemFraction_ * devTotal);
165 toAlloc = tempMemSize_;
// There must be no memory manager for this device yet; a StackDeviceMemory
// owned by a unique_ptr is then inserted into memory_ under the device id.
168 FAISS_ASSERT(memory_.count(device) == 0);
169 memory_.emplace(device,
170 std::unique_ptr<StackDeviceMemory>(
// Returns the cuBLAS handle stored in blasHandles_ under the given device id.
175 StandardGpuResources::getBlasHandle(
int device) {
177 return blasHandles_[device];
// Returns the stream stored in defaultStreams_ under the given device id.
181 StandardGpuResources::getDefaultStream(
int device) {
183 return defaultStreams_[device];
// Returns the alternate streams stored for the given device id. The return
// type is a vector by value, so the caller receives a copy of the vector of
// stream handles.
186 std::vector<cudaStream_t>
187 StandardGpuResources::getAlternateStreams(
int device) {
189 return alternateStreams_[device];
// Returns a reference to the memory manager stored in memory_ under the given
// device id. The map holds owning pointers (a unique_ptr<StackDeviceMemory>
// is inserted during device initialization), so the entry is dereferenced.
192 DeviceMemory& StandardGpuResources::getMemoryManager(
int device) {
194 return *memory_[device];
// Returns the pinned host buffer as a (pointer, size) pair. Both members are
// initialized to nullptr / 0 in the constructor and only set when the buffer
// is allocated during device initialization.
197 std::pair<void*, size_t>
198 StandardGpuResources::getPinnedMemory() {
199 return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
// Returns the stream stored in asyncCopyStreams_ under the given device id.
203 StandardGpuResources::getAsyncCopyStream(
int device) {
205 return asyncCopyStreams_[device];
void setTempMemoryFraction(float fraction)
void initializeForDevice(int device) override
Internal system calls.
void setTempMemory(size_t size)
void setPinnedMemory(size_t size)