Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
StandardGpuResources.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 
10 #include "StandardGpuResources.h"
11 #include "../FaissAssert.h"
12 
13 namespace faiss { namespace gpu {
14 
namespace {

/// Number of alternate streams created for each device
constexpr int kNumStreams = 2;

/// Fraction of total GPU memory used for temporary space by default (18%)
constexpr float kDefaultTempMemFraction = 0.18f;

/// Default pinned memory allocation size, in bytes (256 MiB)
constexpr size_t kDefaultPinnedMemoryAllocation =
    static_cast<size_t>(256) << 20;

} // namespace
26 
27 StandardGpuResources::StandardGpuResources() :
28  pinnedMemAlloc_(nullptr),
29  pinnedMemAllocSize_(0),
30  tempMemFraction_(kDefaultTempMemFraction),
31  tempMemSize_(0),
32  useFraction_(true),
33  pinnedMemSize_(kDefaultPinnedMemoryAllocation),
34  cudaMallocWarning_(true) {
35 }
36 
37 StandardGpuResources::~StandardGpuResources() {
38  for (auto& entry : defaultStreams_) {
39  DeviceScope scope(entry.first);
40 
41  auto it = userDefaultStreams_.find(entry.first);
42  if (it == userDefaultStreams_.end()) {
43  // The user did not specify this stream, thus we are the ones
44  // who have created it
45  CUDA_VERIFY(cudaStreamDestroy(entry.second));
46  }
47  }
48 
49  for (auto& entry : alternateStreams_) {
50  DeviceScope scope(entry.first);
51 
52  for (auto stream : entry.second) {
53  CUDA_VERIFY(cudaStreamDestroy(stream));
54  }
55  }
56 
57  for (auto& entry : asyncCopyStreams_) {
58  DeviceScope scope(entry.first);
59 
60  CUDA_VERIFY(cudaStreamDestroy(entry.second));
61  }
62 
63  for (auto& entry : blasHandles_) {
64  DeviceScope scope(entry.first);
65 
66  auto blasStatus = cublasDestroy(entry.second);
67  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
68  }
69 
70  if (pinnedMemAlloc_) {
71  CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
72  }
73 }
74 
// NOTE(review): the declarator line of this definition (listing line 76) is
// missing from this copy of the file; only the return type and the body are
// visible. Restore it from StandardGpuResources.h before building.
//
// Body: requests a temporary memory size of 0 through setTempMemory and
// switches the cudaMalloc warning off through setCudaMallocWarning.
75 void
77  setTempMemory(0);
78  setCudaMallocWarning(false);
79 }
80 
// NOTE(review): the declarator line of this definition (listing line 82) is
// missing from this copy of the file; presumably this is the setTempMemory
// member called elsewhere in this file, taking `size` -- confirm against
// StandardGpuResources.h.
//
// Body: switches temporary-memory sizing to an explicit size. useFraction_
// is cleared and `size` is recorded in tempMemSize_, which is the value
// device initialization allocates when useFraction_ is false.
81 void
83  useFraction_ = false;
84  tempMemSize_ = size;
85 }
86 
// NOTE(review): the declarator line of this definition (listing line 88) is
// missing from this copy of the file; the body reads a parameter named
// `fraction`. Restore the declarator from StandardGpuResources.h.
//
// Body: switches temporary-memory sizing to a fraction of device memory.
// Asserts that `fraction` lies in [0, 0.5], sets useFraction_ and records
// the value in tempMemFraction_, which device initialization multiplies by
// the total device memory when useFraction_ is true.
87 void
89  FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);
90  useFraction_ = true;
91  tempMemFraction_ = fraction;
92 }
93 
// NOTE(review): the declarator line of this definition (listing line 95) is
// missing from this copy of the file; the body reads a parameter named
// `size`. Restore the declarator from StandardGpuResources.h.
//
// Body: records `size` in pinnedMemSize_, the amount of pinned host memory
// that is allocated when the first device is initialized. Asserts that no
// device has been initialized yet (defaultStreams_ is empty) and that no
// pinned buffer is currently held.
94 void
96  // Should not call this after devices have been initialized
97  FAISS_ASSERT(defaultStreams_.size() == 0);
98  FAISS_ASSERT(!pinnedMemAlloc_);
99 
100  pinnedMemSize_ = size;
101 }
102 
103 void
104 StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
105  auto it = defaultStreams_.find(device);
106  if (it != defaultStreams_.end()) {
107  // Replace this stream with the user stream
108  CUDA_VERIFY(cudaStreamDestroy(it->second));
109  it->second = stream;
110  }
111 
112  userDefaultStreams_[device] = stream;
113 }
114 
// NOTE(review): the declarator line of this definition (listing line 116) is
// missing from this copy of the file. Restore it from
// StandardGpuResources.h before building.
//
// Body: for every device index below getNumDevices(), calls
// setDefaultStream with a null stream handle.
115 void
117  for (int dev = 0; dev < getNumDevices(); ++dev) {
118  setDefaultStream(dev, nullptr);
119  }
120 }
121 
// NOTE(review): the declarator line of this definition (listing line 123) is
// missing from this copy of the file; presumably this is the
// setCudaMallocWarning member called elsewhere in this file, taking `b` --
// confirm against StandardGpuResources.h.
//
// Body: stores `b` in cudaMallocWarning_, the value handed to memory
// managers created later, and forwards it to every memory manager that
// already exists in memory_.
122 void
124  cudaMallocWarning_ = b;
125 
126  for (auto& v : memory_) {
127  v.second->setCudaMallocWarning(b);
128  }
129 }
130 
131 void
133  // Use default streams as a marker for whether or not a certain
134  // device has been initialized
135  if (defaultStreams_.count(device) != 0) {
136  return;
137  }
138 
139  // If this is the first device that we're initializing, create our
140  // pinned memory allocation
141  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
142  CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,
143  pinnedMemSize_,
144  cudaHostAllocDefault));
145  pinnedMemAllocSize_ = pinnedMemSize_;
146  }
147 
148  FAISS_ASSERT(device < getNumDevices());
149  DeviceScope scope(device);
150 
151  // Make sure that device properties for all devices are cached
152  auto& prop = getDeviceProperties(device);
153 
154  // Also check to make sure we meet our minimum compute capability (3.0)
155  FAISS_ASSERT_FMT(prop.major >= 3,
156  "Device id %d with CC %d.%d not supported, "
157  "need 3.0+ compute capability",
158  device, prop.major, prop.minor);
159 
160  // Create streams
161  cudaStream_t defaultStream = 0;
162  auto it = userDefaultStreams_.find(device);
163  if (it != userDefaultStreams_.end()) {
164  // We already have a stream provided by the user
165  defaultStream = it->second;
166  } else {
167  CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
168  cudaStreamNonBlocking));
169  }
170 
171  defaultStreams_[device] = defaultStream;
172 
173  cudaStream_t asyncCopyStream = 0;
174  CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
175  cudaStreamNonBlocking));
176 
177  asyncCopyStreams_[device] = asyncCopyStream;
178 
179  std::vector<cudaStream_t> deviceStreams;
180  for (int j = 0; j < kNumStreams; ++j) {
181  cudaStream_t stream = 0;
182  CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
183  cudaStreamNonBlocking));
184 
185  deviceStreams.push_back(stream);
186  }
187 
188  alternateStreams_[device] = std::move(deviceStreams);
189 
190  // Create cuBLAS handle
191  cublasHandle_t blasHandle = 0;
192  auto blasStatus = cublasCreate(&blasHandle);
193  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
194  blasHandles_[device] = blasHandle;
195 
196  size_t toAlloc = 0;
197  if (useFraction_) {
198  size_t devFree = 0;
199  size_t devTotal = 0;
200 
201  CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));
202 
203  toAlloc = (size_t) (tempMemFraction_ * devTotal);
204  } else {
205  toAlloc = tempMemSize_;
206  }
207 
208  FAISS_ASSERT(memory_.count(device) == 0);
209 
210  auto mem = std::unique_ptr<StackDeviceMemory>(
211  new StackDeviceMemory(device, toAlloc));
212  mem->setCudaMallocWarning(cudaMallocWarning_);
213 
214  memory_.emplace(device, std::move(mem));
215 }
216 
217 cublasHandle_t
219  initializeForDevice(device);
220  return blasHandles_[device];
221 }
222 
223 cudaStream_t
225  initializeForDevice(device);
226  return defaultStreams_[device];
227 }
228 
229 std::vector<cudaStream_t>
231  initializeForDevice(device);
232  return alternateStreams_[device];
233 }
234 
236  initializeForDevice(device);
237  return *memory_[device];
238 }
239 
240 std::pair<void*, size_t>
242  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
243 }
244 
245 cudaStream_t
247  initializeForDevice(device);
248  return asyncCopyStreams_[device];
249 }
250 
251 } } // namespace
void setDefaultStream(int device, cudaStream_t stream)
Called to change the stream for work ordering.
cublasHandle_t getBlasHandle(int device) override
Returns the cuBLAS handle that we use for the given device.
void initializeForDevice(int device) override
Internal system calls.
cudaStream_t getAsyncCopyStream(int device) override
Returns the stream on which we perform async CPU <-> GPU copies.
DeviceMemory & getMemoryManager(int device) override
Returns the temporary memory manager for the given device.
cudaStream_t getDefaultStream(int device) override
Manages temporary memory allocations on a GPU device.
Definition: DeviceMemory.h:45
std::pair< void *, size_t > getPinnedMemory() override
Returns the available CPU pinned memory buffer.
std::vector< cudaStream_t > getAlternateStreams(int device) override
Returns the set of alternative streams that we use for the given device.