Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
StandardGpuResources.cpp
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved.
11 
12 #include "StandardGpuResources.h"
13 #include "../FaissAssert.h"
14 
15 namespace faiss { namespace gpu {
16 
namespace {

/// Number of alternate (non-default) streams created for each device
constexpr int kNumStreams = 2;

/// Fraction of a device's total memory used for temporary space when no
/// explicit setting has been made (18%)
constexpr float kDefaultTempMemFraction = 0.18f;

/// Size in bytes of the pinned host memory allocation used when no
/// explicit size has been requested (256 MiB)
constexpr size_t kDefaultPinnedMemoryAllocation =
    static_cast<size_t>(256) * 1024 * 1024;

} // namespace
28 
29 StandardGpuResources::StandardGpuResources() :
30  pinnedMemAlloc_(nullptr),
31  pinnedMemAllocSize_(0),
32  tempMemFraction_(kDefaultTempMemFraction),
33  tempMemSize_(0),
34  useFraction_(true),
35  pinnedMemSize_(kDefaultPinnedMemoryAllocation) {
36 }
37 
38 StandardGpuResources::~StandardGpuResources() {
39  for (auto& entry : defaultStreams_) {
40  DeviceScope scope(entry.first);
41 
42  CUDA_VERIFY(cudaStreamDestroy(entry.second));
43  }
44 
45  for (auto& entry : alternateStreams_) {
46  DeviceScope scope(entry.first);
47 
48  for (auto stream : entry.second) {
49  CUDA_VERIFY(cudaStreamDestroy(stream));
50  }
51  }
52 
53  for (auto& entry : asyncCopyStreams_) {
54  DeviceScope scope(entry.first);
55 
56  CUDA_VERIFY(cudaStreamDestroy(entry.second));
57  }
58 
59  for (auto& entry : blasHandles_) {
60  DeviceScope scope(entry.first);
61 
62  auto blasStatus = cublasDestroy(entry.second);
63  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
64  }
65 
66  if (pinnedMemAlloc_) {
67  CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
68  }
69 }
70 
71 void
73  setTempMemory(0);
74 }
75 
76 void
78  useFraction_ = false;
79  tempMemSize_ = size;
80 }
81 
82 void
84  FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);
85  useFraction_ = true;
86  tempMemFraction_ = fraction;
87 }
88 
89 void
91  // Should not call this after devices have been initialized
92  FAISS_ASSERT(defaultStreams_.size() == 0);
93  FAISS_ASSERT(!pinnedMemAlloc_);
94 
95  pinnedMemSize_ = size;
96 }
97 
98 void
100  // Use default streams as a marker for whether or not a certain
101  // device has been initialized
102  if (defaultStreams_.count(device) != 0) {
103  return;
104  }
105 
106  // If this is the first device that we're initializing, create our
107  // pinned memory allocation
108  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
109  CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,
110  pinnedMemSize_,
111  cudaHostAllocDefault));
112  pinnedMemAllocSize_ = pinnedMemSize_;
113  }
114 
115  FAISS_ASSERT(device < getNumDevices());
116  DeviceScope scope(device);
117 
118  // Make sure that device properties for all devices are cached
119  auto& prop = getDeviceProperties(device);
120 
121  // Also check to make sure we meet our minimum compute capability (3.5)
122  FAISS_ASSERT(prop.major > 3 || (prop.major == 3 && prop.minor >= 5));
123 
124  // Create streams
125  cudaStream_t defaultStream = 0;
126  CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
127  cudaStreamNonBlocking));
128 
129  defaultStreams_[device] = defaultStream;
130 
131  cudaStream_t asyncCopyStream = 0;
132  CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
133  cudaStreamNonBlocking));
134 
135  asyncCopyStreams_[device] = asyncCopyStream;
136 
137  std::vector<cudaStream_t> deviceStreams;
138  for (int j = 0; j < kNumStreams; ++j) {
139  cudaStream_t stream = 0;
140  CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
141  cudaStreamNonBlocking));
142 
143  deviceStreams.push_back(stream);
144  }
145 
146  alternateStreams_[device] = std::move(deviceStreams);
147 
148  // Create cuBLAS handle
149  cublasHandle_t blasHandle = 0;
150  auto blasStatus = cublasCreate(&blasHandle);
151  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
152  blasHandles_[device] = blasHandle;
153 
154  size_t toAlloc = 0;
155  if (useFraction_) {
156  size_t devFree = 0;
157  size_t devTotal = 0;
158 
159  CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));
160 
161  toAlloc = (size_t) (tempMemFraction_ * devTotal);
162  } else {
163  toAlloc = tempMemSize_;
164  }
165 
166  FAISS_ASSERT(memory_.count(device) == 0);
167  memory_.emplace(device,
168  std::unique_ptr<StackDeviceMemory>(
169  new StackDeviceMemory(device, toAlloc)));
170 }
171 
172 cublasHandle_t
173 StandardGpuResources::getBlasHandle(int device) {
174  initializeForDevice(device);
175  return blasHandles_[device];
176 }
177 
178 cudaStream_t
179 StandardGpuResources::getDefaultStream(int device) {
180  initializeForDevice(device);
181  return defaultStreams_[device];
182 }
183 
184 std::vector<cudaStream_t>
185 StandardGpuResources::getAlternateStreams(int device) {
186  initializeForDevice(device);
187  return alternateStreams_[device];
188 }
189 
190 DeviceMemory& StandardGpuResources::getMemoryManager(int device) {
191  initializeForDevice(device);
192  return *memory_[device];
193 }
194 
195 std::pair<void*, size_t>
196 StandardGpuResources::getPinnedMemory() {
197  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
198 }
199 
200 cudaStream_t
201 StandardGpuResources::getAsyncCopyStream(int device) {
202  initializeForDevice(device);
203  return asyncCopyStreams_[device];
204 }
205 
206 } } // namespace
void initializeForDevice(int device) override
Internal system calls.