Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
StandardGpuResources.cpp
1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 
9 #include "StandardGpuResources.h"
10 #include "utils/MemorySpace.h"
11 #include "../FaissAssert.h"
12 #include <limits>
13 
14 namespace faiss { namespace gpu {
15 
namespace {

// Number of compute streams we create per device for multi-stream use
constexpr int kNumStreams = 2;

// Pinned host memory reserved by default for async CPU <-> GPU copies
// (256 MiB)
constexpr size_t kDefaultPinnedMemoryAllocation = size_t(256) << 20;

// Temporary-memory cap for devices with <= 4 GiB of memory (512 MiB)
constexpr size_t k4GiBTempMem = size_t(512) << 20;

// Temporary-memory cap for devices with <= 8 GiB of memory (1 GiB)
constexpr size_t k8GiBTempMem = size_t(1) << 30;

// Upper bound on temporary memory for any device (1.5 GiB)
constexpr size_t kMaxTempMem = size_t(1536) << 20;

} // namespace
34 
/// Default-constructs with no pinned buffer yet allocated (allocation is
/// deferred to the first device initialization), the default 256 MiB pinned
/// memory request, and cudaMalloc warnings enabled.
StandardGpuResources::StandardGpuResources() :
    pinnedMemAlloc_(nullptr),
    pinnedMemAllocSize_(0),
    // let the adjustment function determine the memory size for us by passing
    // in a huge value that will then be adjusted
    tempMemSize_(getDefaultTempMemForGPU(-1,
                                         std::numeric_limits<size_t>::max())),
    pinnedMemSize_(kDefaultPinnedMemoryAllocation),
    cudaMallocWarning_(true) {
}
45 
/// Tears down all per-device CUDA state we created: default streams (only
/// those not supplied by the user), alternate streams, async copy streams,
/// cuBLAS handles, and finally the pinned host buffer.
StandardGpuResources::~StandardGpuResources() {
  // Destroy the per-device default streams, but only the ones we created;
  // streams registered via setDefaultStream are owned by the caller.
  for (auto& entry : defaultStreams_) {
    DeviceScope scope(entry.first);

    auto it = userDefaultStreams_.find(entry.first);
    if (it == userDefaultStreams_.end()) {
      // The user did not specify this stream, thus we are the ones
      // who have created it
      CUDA_VERIFY(cudaStreamDestroy(entry.second));
    }
  }

  // Alternate (multi-stream) streams are always created by us
  for (auto& entry : alternateStreams_) {
    DeviceScope scope(entry.first);

    for (auto stream : entry.second) {
      CUDA_VERIFY(cudaStreamDestroy(stream));
    }
  }

  // Async copy streams are always created by us
  for (auto& entry : asyncCopyStreams_) {
    DeviceScope scope(entry.first);

    CUDA_VERIFY(cudaStreamDestroy(entry.second));
  }

  for (auto& entry : blasHandles_) {
    DeviceScope scope(entry.first);

    auto blasStatus = cublasDestroy(entry.second);
    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
  }

  // Release the pinned host buffer (if the lazy allocation ever happened)
  if (pinnedMemAlloc_) {
    freeMemorySpace(MemorySpace::HostPinned, pinnedMemAlloc_);
  }
}
83 
84 size_t
85 StandardGpuResources::getDefaultTempMemForGPU(int device,
86  size_t requested) {
87  auto totalMem = device != -1 ?
88  getDeviceProperties(device).totalGlobalMem :
89  std::numeric_limits<size_t>::max();
90 
91  if (totalMem <= (size_t) 4 * 1024 * 1024 * 1024) {
92  // If the GPU has <= 4 GiB of memory, reserve 512 MiB
93 
94  if (requested > k4GiBTempMem) {
95  return k4GiBTempMem;
96  }
97  } else if (totalMem <= (size_t) 8 * 1024 * 1024 * 1024) {
98  // If the GPU has <= 8 GiB of memory, reserve 1 GiB
99 
100  if (requested > k8GiBTempMem) {
101  return k8GiBTempMem;
102  }
103  } else {
104  // Never use more than 1.5 GiB
105  if (requested > kMaxTempMem) {
106  return kMaxTempMem;
107  }
108  }
109 
110  // use whatever lower limit the user requested
111  return requested;
112 }
113 
114 void
116  setTempMemory(0);
117  setCudaMallocWarning(false);
118 }
119 
120 void
122  if (tempMemSize_ != size) {
123  // adjust based on general limits
124  tempMemSize_ = getDefaultTempMemForGPU(-1, size);
125 
126  // We need to re-initialize memory resources for all current devices that
127  // have been initialized.
128  // This should be safe to do, even if we are currently running work, because
129  // the cudaFree call that this implies will force-synchronize all GPUs with
130  // the CPU
131  for (auto& p : memory_) {
132  int device = p.first;
133  // Free the existing memory first
134  p.second.reset();
135 
136  // Allocate new
137  p.second = std::unique_ptr<StackDeviceMemory>(
138  new StackDeviceMemory(p.first,
139  // adjust for this specific device
140  getDefaultTempMemForGPU(device, tempMemSize_)));
141  }
142  }
143 }
144 
145 void
147  // Should not call this after devices have been initialized
148  FAISS_ASSERT(defaultStreams_.size() == 0);
149  FAISS_ASSERT(!pinnedMemAlloc_);
150 
151  pinnedMemSize_ = size;
152 }
153 
154 void
155 StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
156  auto it = defaultStreams_.find(device);
157  if (it != defaultStreams_.end()) {
158  // Replace this stream with the user stream
159  CUDA_VERIFY(cudaStreamDestroy(it->second));
160  it->second = stream;
161  }
162 
163  userDefaultStreams_[device] = stream;
164 }
165 
166 void
168  for (int dev = 0; dev < getNumDevices(); ++dev) {
169  setDefaultStream(dev, nullptr);
170  }
171 }
172 
173 void
175  cudaMallocWarning_ = b;
176 
177  for (auto& v : memory_) {
178  v.second->setCudaMallocWarning(b);
179  }
180 }
181 
182 bool
183 StandardGpuResources::isInitialized(int device) const {
184  // Use default streams as a marker for whether or not a certain
185  // device has been initialized
186  return defaultStreams_.count(device) != 0;
187 }
188 
189 void
191  if (isInitialized(device)) {
192  return;
193  }
194 
195  // If this is the first device that we're initializing, create our
196  // pinned memory allocation
197  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
198  allocMemorySpace(MemorySpace::HostPinned, &pinnedMemAlloc_, pinnedMemSize_);
199  pinnedMemAllocSize_ = pinnedMemSize_;
200  }
201 
202  FAISS_ASSERT(device < getNumDevices());
203  DeviceScope scope(device);
204 
205  // Make sure that device properties for all devices are cached
206  auto& prop = getDeviceProperties(device);
207 
208  // Also check to make sure we meet our minimum compute capability (3.0)
209  FAISS_ASSERT_FMT(prop.major >= 3,
210  "Device id %d with CC %d.%d not supported, "
211  "need 3.0+ compute capability",
212  device, prop.major, prop.minor);
213 
214  // Create streams
215  cudaStream_t defaultStream = 0;
216  auto it = userDefaultStreams_.find(device);
217  if (it != userDefaultStreams_.end()) {
218  // We already have a stream provided by the user
219  defaultStream = it->second;
220  } else {
221  CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
222  cudaStreamNonBlocking));
223  }
224 
225  defaultStreams_[device] = defaultStream;
226 
227  cudaStream_t asyncCopyStream = 0;
228  CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
229  cudaStreamNonBlocking));
230 
231  asyncCopyStreams_[device] = asyncCopyStream;
232 
233  std::vector<cudaStream_t> deviceStreams;
234  for (int j = 0; j < kNumStreams; ++j) {
235  cudaStream_t stream = 0;
236  CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
237  cudaStreamNonBlocking));
238 
239  deviceStreams.push_back(stream);
240  }
241 
242  alternateStreams_[device] = std::move(deviceStreams);
243 
244  // Create cuBLAS handle
245  cublasHandle_t blasHandle = 0;
246  auto blasStatus = cublasCreate(&blasHandle);
247  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
248  blasHandles_[device] = blasHandle;
249 
250  FAISS_ASSERT(memory_.count(device) == 0);
251 
252  auto mem = std::unique_ptr<StackDeviceMemory>(
253  new StackDeviceMemory(device,
254  // adjust for this specific device
255  getDefaultTempMemForGPU(device, tempMemSize_)));
256  mem->setCudaMallocWarning(cudaMallocWarning_);
257 
258  memory_.emplace(device, std::move(mem));
259 }
260 
261 cublasHandle_t
263  initializeForDevice(device);
264  return blasHandles_[device];
265 }
266 
267 cudaStream_t
269  initializeForDevice(device);
270  return defaultStreams_[device];
271 }
272 
273 std::vector<cudaStream_t>
275  initializeForDevice(device);
276  return alternateStreams_[device];
277 }
278 
280  initializeForDevice(device);
281  return *memory_[device];
282 }
283 
284 std::pair<void*, size_t>
286  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
287 }
288 
289 cudaStream_t
291  initializeForDevice(device);
292  return asyncCopyStreams_[device];
293 }
294 
295 } } // namespace
void setDefaultStream(int device, cudaStream_t stream)
Called to change the stream for work ordering.
cublasHandle_t getBlasHandle(int device) override
Returns the cuBLAS handle that we use for the given device.
void initializeForDevice(int device) override
Internal system calls.
cudaStream_t getAsyncCopyStream(int device) override
Returns the stream on which we perform async CPU <-> GPU copies.
DeviceMemory & getMemoryManager(int device) override
Returns the temporary memory manager for the given device.
cudaStream_t getDefaultStream(int device) override
Returns the default stream on which all computation is ordered for the given device.
DeviceMemory — Manages temporary memory allocations on a GPU device.
Definition: DeviceMemory.h:44
std::pair< void *, size_t > getPinnedMemory() override
Returns the available CPU pinned memory buffer.
std::vector< cudaStream_t > getAlternateStreams(int device) override
Returns the set of alternative streams that we use for the given device.