Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
StandardGpuResources.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the CC-by-NC license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #include "StandardGpuResources.h"
12 #include "../FaissAssert.h"
13 
14 namespace faiss { namespace gpu {
15 
namespace {

/// Number of alternate streams created for each initialized device
constexpr int kNumStreams = 2;

/// Fraction of a device's total memory reserved as temporary space by
/// default (18%)
constexpr float kDefaultTempMemFraction = 0.18f;

/// Default size, in bytes, of the pinned host memory allocation (256 MiB)
constexpr size_t kDefaultPinnedMemoryAllocation =
  static_cast<size_t>(256) * 1024 * 1024;

} // namespace
27 
28 StandardGpuResources::StandardGpuResources() :
29  pinnedMemAlloc_(nullptr),
30  pinnedMemAllocSize_(0),
31  tempMemFraction_(kDefaultTempMemFraction),
32  tempMemSize_(0),
33  useFraction_(true),
34  pinnedMemSize_(kDefaultPinnedMemoryAllocation) {
35 }
36 
37 StandardGpuResources::~StandardGpuResources() {
38  for (auto& entry : defaultStreams_) {
39  DeviceScope scope(entry.first);
40 
41  CUDA_VERIFY(cudaStreamDestroy(entry.second));
42  }
43 
44  for (auto& entry : alternateStreams_) {
45  DeviceScope scope(entry.first);
46 
47  for (auto stream : entry.second) {
48  CUDA_VERIFY(cudaStreamDestroy(stream));
49  }
50  }
51 
52  for (auto& entry : asyncCopyStreams_) {
53  DeviceScope scope(entry.first);
54 
55  CUDA_VERIFY(cudaStreamDestroy(entry.second));
56  }
57 
58  for (auto& entry : blasHandles_) {
59  DeviceScope scope(entry.first);
60 
61  auto blasStatus = cublasDestroy(entry.second);
62  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
63  }
64 
65  if (pinnedMemAlloc_) {
66  CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_));
67  }
68 }
69 
// Requests a temporary memory size of zero bytes by forwarding to
// setTempMemory(0).
// NOTE(review): the signature line of this definition is missing from this
// listing; presumably StandardGpuResources::noTempMemory() -- confirm
// against StandardGpuResources.h.
70 void
72  setTempMemory(0);
73 }
74 
// Selects a fixed temporary memory size: clears useFraction_ and stores
// `size` in tempMemSize_. These members are read when a device is
// initialized (toAlloc = tempMemSize_ when useFraction_ is false), and
// initialization returns early for a device that is already set up, so the
// new value only applies to devices initialized afterwards.
// NOTE(review): the signature line of this definition is missing from this
// listing; presumably StandardGpuResources::setTempMemory(size_t size) --
// confirm against StandardGpuResources.h.
75 void
77  useFraction_ = false;
78  tempMemSize_ = size;
79 }
80 
// Selects fraction-based temporary memory sizing: asserts that `fraction`
// lies in [0, 0.5], sets useFraction_ and stores the value in
// tempMemFraction_. At device initialization the fraction is multiplied by
// the device's total memory as reported by cudaMemGetInfo.
// NOTE(review): the signature line of this definition is missing from this
// listing; presumably StandardGpuResources::setTempMemoryFraction(float
// fraction) -- confirm against StandardGpuResources.h.
81 void
83  FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f);
84  useFraction_ = true;
85  tempMemFraction_ = fraction;
86 }
87 
// Records the number of bytes of pinned host memory to allocate when the
// first device is initialized. Asserts that no device has been initialized
// yet (defaultStreams_ is empty) and that no pinned buffer exists.
// NOTE(review): the signature line of this definition is missing from this
// listing; presumably StandardGpuResources::setPinnedMemory(size_t size) --
// confirm against StandardGpuResources.h.
88 void
90  // Should not call this after devices have been initialized
91  FAISS_ASSERT(defaultStreams_.size() == 0);
92  FAISS_ASSERT(!pinnedMemAlloc_);
93 
94  pinnedMemSize_ = size;
95 }
96 
// Lazily creates the per-device resources for `device`: a default stream, an
// async copy stream, kNumStreams alternate streams (all created with
// cudaStreamNonBlocking), a cuBLAS handle, and a StackDeviceMemory sized
// either as a fraction of total device memory or as a fixed byte count.
// Returns immediately if the device already has an entry in defaultStreams_.
// The first call overall also allocates the pinned host buffer when
// pinnedMemSize_ is non-zero.
// NOTE(review): the signature line of this definition is missing from this
// listing; the calls in the getters and the trailing declaration text suggest
// void StandardGpuResources::initializeForDevice(int device) -- confirm
// against StandardGpuResources.h.
97 void
99  // Use default streams as a marker for whether or not a certain
100  // device has been initialized
101  if (defaultStreams_.count(device) != 0) {
102  return;
103  }
104 
105  // If this is the first device that we're initializing, create our
106  // pinned memory allocation
107  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
108  CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_,
109  pinnedMemSize_,
110  cudaHostAllocDefault));
111  pinnedMemAllocSize_ = pinnedMemSize_;
112  }
113 
// Note: the device id is validated only after the pinned allocation above.
// Only the upper bound is checked here.
114  FAISS_ASSERT(device < getNumDevices());
115  DeviceScope scope(device);
116 
117  // Make sure that device properties for all devices are cached
118  auto& prop = getDeviceProperties(device);
119 
120  // Also check to make sure we meet our minimum compute capability (3.0)
121  FAISS_ASSERT_FMT(prop.major >= 3,
122  "Device id %d with CC %d.%d not supported, "
123  "need 3.0+ compute capability",
124  device, prop.major, prop.minor);
125 
126  // Create streams
127  cudaStream_t defaultStream = 0;
128  CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
129  cudaStreamNonBlocking));
130 
131  defaultStreams_[device] = defaultStream;
132 
133  cudaStream_t asyncCopyStream = 0;
134  CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
135  cudaStreamNonBlocking));
136 
137  asyncCopyStreams_[device] = asyncCopyStream;
138 
139  std::vector<cudaStream_t> deviceStreams;
140  for (int j = 0; j < kNumStreams; ++j) {
141  cudaStream_t stream = 0;
142  CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
143  cudaStreamNonBlocking));
144 
145  deviceStreams.push_back(stream);
146  }
147 
148  alternateStreams_[device] = std::move(deviceStreams);
149 
// No stream is bound to the handle in this function.
150  // Create cuBLAS handle
151  cublasHandle_t blasHandle = 0;
152  auto blasStatus = cublasCreate(&blasHandle);
153  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
154  blasHandles_[device] = blasHandle;
155 
156  size_t toAlloc = 0;
157  if (useFraction_) {
158  size_t devFree = 0;
159  size_t devTotal = 0;
160 
// devFree is queried but unused: the fraction applies to the device's total
// memory, not to the memory currently free.
161  CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal));
162 
163  toAlloc = (size_t) (tempMemFraction_ * devTotal);
164  } else {
165  toAlloc = tempMemSize_;
166  }
167 
168  FAISS_ASSERT(memory_.count(device) == 0);
169  memory_.emplace(device,
170  std::unique_ptr<StackDeviceMemory>(
171  new StackDeviceMemory(device, toAlloc)));
172 }
173 
174 cublasHandle_t
175 StandardGpuResources::getBlasHandle(int device) {
176  initializeForDevice(device);
177  return blasHandles_[device];
178 }
179 
180 cudaStream_t
181 StandardGpuResources::getDefaultStream(int device) {
182  initializeForDevice(device);
183  return defaultStreams_[device];
184 }
185 
186 std::vector<cudaStream_t>
187 StandardGpuResources::getAlternateStreams(int device) {
188  initializeForDevice(device);
189  return alternateStreams_[device];
190 }
191 
192 DeviceMemory& StandardGpuResources::getMemoryManager(int device) {
193  initializeForDevice(device);
194  return *memory_[device];
195 }
196 
197 std::pair<void*, size_t>
198 StandardGpuResources::getPinnedMemory() {
199  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
200 }
201 
202 cudaStream_t
203 StandardGpuResources::getAsyncCopyStream(int device) {
204  initializeForDevice(device);
205  return asyncCopyStreams_[device];
206 }
207 
208 } } // namespace
void initializeForDevice(int device) override
Internal system calls.