Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
StandardGpuResources.cpp
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved.
11 
12 #include "StandardGpuResources.h"
13 #include "../FaissAssert.h"
14 
15 namespace faiss { namespace gpu {
16 
namespace {

/// Number of alternate streams created for each initialized device
constexpr int kNumStreams = 2;

/// By default, 18% of a device's total memory is set aside as
/// temporary space
constexpr float kDefaultTempMemFraction = 0.18f;

/// Default size in bytes of the pinned host allocation (256 MiB)
constexpr size_t kDefaultPinnedMemoryAllocation =
    static_cast<size_t>(256) << 20;

} // namespace
28 
29 StandardGpuResources::StandardGpuResources() :
30  pinnedMemAlloc_(nullptr),
31  pinnedMemAllocSize_(0),
32  tempMemFraction_(kDefaultTempMemFraction),
33  tempMemSize_(0),
34  useFraction_(true),
35  pinnedMemSize_(kDefaultPinnedMemoryAllocation) {
36 }
37 
38 StandardGpuResources::~StandardGpuResources() {
39  for (auto& entry : defaultStreams_) {
40  DeviceScope scope(entry.first);
41 
42  CUDA_VERIFY(cudaStreamDestroy(entry.second));
43  }
44 
45  for (auto& entry : alternateStreams_) {
46  DeviceScope scope(entry.first);
47 
48  for (auto stream : entry.second) {
49  CUDA_VERIFY(cudaStreamDestroy(stream));
50  }
51  }
52 
53  for (auto& entry : asyncCopyStreams_) {
54  DeviceScope scope(entry.first);
55 
56  CUDA_VERIFY(cudaStreamDestroy(entry.second));
57  }
58 
59  for (auto& entry : blasHandles_) {
60  DeviceScope scope(entry.first);
61 
62  auto blasStatus = cublasDestroy(entry.second);
63  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
64  }
65 
66  if (pinnedMemAlloc_) {
67  CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
68  }
69 }
70 
// Body of a void member function that requests a temporary memory size of
// zero by forwarding to setTempMemory(0).
// NOTE(review): the line holding this definition's name and parameter list
// is absent from this listing (the embedded numbering jumps from 71 to 73);
// restore it from the original file before compiling.
71 void
73  setTempMemory(0);
74 }
75 
// Body of a void member function that selects an explicit temporary memory
// size: it clears useFraction_ and stores `size` in tempMemSize_, which is
// the value initializeForDevice uses when useFraction_ is false.
// NOTE(review): the signature line is absent from this listing (the embedded
// numbering jumps from 76 to 78); `size` is presumably the function's only
// parameter -- confirm against the original file.
76 void
78  useFraction_ = false;
79  tempMemSize_ = size;
80 }
81 
// Body of a void member function that selects fraction-based sizing of the
// temporary memory: it asserts that `fraction` lies within [0, 0.5], sets
// useFraction_ and stores the value in tempMemFraction_.
// NOTE(review): the signature line is absent from this listing (the embedded
// numbering jumps from 82 to 84); `fraction` is presumably the function's
// only parameter -- confirm against the original file.
82 void
84  FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);
85  useFraction_ = true;
86  tempMemFraction_ = fraction;
87 }
88 
// Body of a void member function that records the size to use for the pinned
// host allocation. It asserts that no device has been initialized yet
// (defaultStreams_ is empty) and that no pinned buffer exists, then stores
// `size` in pinnedMemSize_.
// NOTE(review): the signature line is absent from this listing (the embedded
// numbering jumps from 89 to 91); `size` is presumably the function's only
// parameter -- confirm against the original file.
89 void
91  // Should not call this after devices have been initialized
92  FAISS_ASSERT(defaultStreams_.size() == 0);
93  FAISS_ASSERT(!pinnedMemAlloc_);
94 
95  pinnedMemSize_ = size;
96 }
97 
98 void
100  // Use default streams as a marker for whether or not a certain
101  // device has been initialized
102  if (defaultStreams_.count(device) != 0) {
103  return;
104  }
105 
106  // If this is the first device that we're initializing, create our
107  // pinned memory allocation
108  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
109  CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,
110  pinnedMemSize_,
111  cudaHostAllocDefault));
112  pinnedMemAllocSize_ = pinnedMemSize_;
113  }
114 
115  FAISS_ASSERT(device < getNumDevices());
116  DeviceScope scope(device);
117 
118  // Make sure that device properties for all devices are cached
119  auto& prop = getDeviceProperties(device);
120 
121  // Also check to make sure we meet our minimum compute capability (3.5)
122  FAISS_ASSERT(prop.major > 3 || (prop.major == 3 && prop.minor >= 5) ||
123  !"Device not supported, need 3.5+ compute capability");
124 
125  // Create streams
126  cudaStream_t defaultStream = 0;
127  CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
128  cudaStreamNonBlocking));
129 
130  defaultStreams_[device] = defaultStream;
131 
132  cudaStream_t asyncCopyStream = 0;
133  CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
134  cudaStreamNonBlocking));
135 
136  asyncCopyStreams_[device] = asyncCopyStream;
137 
138  std::vector<cudaStream_t> deviceStreams;
139  for (int j = 0; j < kNumStreams; ++j) {
140  cudaStream_t stream = 0;
141  CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
142  cudaStreamNonBlocking));
143 
144  deviceStreams.push_back(stream);
145  }
146 
147  alternateStreams_[device] = std::move(deviceStreams);
148 
149  // Create cuBLAS handle
150  cublasHandle_t blasHandle = 0;
151  auto blasStatus = cublasCreate(&blasHandle);
152  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
153  blasHandles_[device] = blasHandle;
154 
155  size_t toAlloc = 0;
156  if (useFraction_) {
157  size_t devFree = 0;
158  size_t devTotal = 0;
159 
160  CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));
161 
162  toAlloc = (size_t) (tempMemFraction_ * devTotal);
163  } else {
164  toAlloc = tempMemSize_;
165  }
166 
167  FAISS_ASSERT(memory_.count(device) == 0);
168  memory_.emplace(device,
169  std::unique_ptr<StackDeviceMemory>(
170  new StackDeviceMemory(device, toAlloc)));
171 }
172 
173 cublasHandle_t
174 StandardGpuResources::getBlasHandle(int device) {
175  initializeForDevice(device);
176  return blasHandles_[device];
177 }
178 
179 cudaStream_t
180 StandardGpuResources::getDefaultStream(int device) {
181  initializeForDevice(device);
182  return defaultStreams_[device];
183 }
184 
185 std::vector<cudaStream_t>
186 StandardGpuResources::getAlternateStreams(int device) {
187  initializeForDevice(device);
188  return alternateStreams_[device];
189 }
190 
191 DeviceMemory& StandardGpuResources::getMemoryManager(int device) {
192  initializeForDevice(device);
193  return *memory_[device];
194 }
195 
196 std::pair<void*, size_t>
197 StandardGpuResources::getPinnedMemory() {
198  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
199 }
200 
201 cudaStream_t
202 StandardGpuResources::getAsyncCopyStream(int device) {
203  initializeForDevice(device);
204  return asyncCopyStreams_[device];
205 }
206 
207 } } // namespace
void initializeForDevice(int device) override
Internal system calls.