/** * Copyright (c) 2015-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the CC-by-NC license found in the * LICENSE file in the root directory of this source tree. */ // Copyright 2004-present Facebook. All Rights Reserved. #include "StandardGpuResources.h" #include "../FaissAssert.h" namespace faiss { namespace gpu { namespace { constexpr int kNumStreams = 2; /// Use 18% of GPU memory for temporary space by default constexpr float kDefaultTempMemFraction = 0.18f; /// Default pinned memory allocation size constexpr size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024; } StandardGpuResources::StandardGpuResources() : pinnedMemAlloc_(nullptr), pinnedMemAllocSize_(0), tempMemFraction_(kDefaultTempMemFraction), tempMemSize_(0), useFraction_(true), pinnedMemSize_(kDefaultPinnedMemoryAllocation) { } StandardGpuResources::~StandardGpuResources() { for (auto& entry : defaultStreams_) { DeviceScope scope(entry.first); CUDA_VERIFY(cudaStreamDestroy(entry.second)); } for (auto& entry : alternateStreams_) { DeviceScope scope(entry.first); for (auto stream : entry.second) { CUDA_VERIFY(cudaStreamDestroy(stream)); } } for (auto& entry : asyncCopyStreams_) { DeviceScope scope(entry.first); CUDA_VERIFY(cudaStreamDestroy(entry.second)); } for (auto& entry : blasHandles_) { DeviceScope scope(entry.first); auto blasStatus = cublasDestroy(entry.second); FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); } if (pinnedMemAlloc_) { CUDA_VERIFY(cudaFreeHost(pinnedMemAlloc_)); } } void StandardGpuResources::noTempMemory() { setTempMemory(0); } void StandardGpuResources::setTempMemory(size_t size) { useFraction_ = false; tempMemSize_ = size; } void StandardGpuResources::setTempMemoryFraction(float fraction) { FAISS_ASSERT(fraction >= 0.0f && fraction <= 0.5f); useFraction_ = true; tempMemFraction_ = fraction; } void StandardGpuResources::setPinnedMemory(size_t size) { // Should not call this after devices have been initialized FAISS_ASSERT(defaultStreams_.size() == 0); FAISS_ASSERT(!pinnedMemAlloc_); pinnedMemSize_ = size; } void StandardGpuResources::initializeForDevice(int device) { // Use default streams as a marker for whether or not a certain // device has been initialized if (defaultStreams_.count(device) != 0) { return; } // If this is the first device that we're initializing, create our // pinned memory allocation if (defaultStreams_.empty() && pinnedMemSize_ > 0) { CUDA_VERIFY(cudaHostAlloc(&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault)); pinnedMemAllocSize_ = pinnedMemSize_; } FAISS_ASSERT(device < getNumDevices()); DeviceScope scope(device); // Make sure that device properties for all devices are cached auto& prop = getDeviceProperties(device); // Also check to make sure we meet our minimum compute capability (3.0) FAISS_ASSERT_FMT(prop.major >= 3, "Device id %d with CC %d.%d not supported, " "need 3.0+ compute capability", device, prop.major, prop.minor); // Create streams cudaStream_t defaultStream = 0; CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking)); defaultStreams_[device] = defaultStream; cudaStream_t asyncCopyStream = 0; CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking)); asyncCopyStreams_[device] = asyncCopyStream; std::vector deviceStreams; for (int j = 0; j < kNumStreams; ++j) { cudaStream_t stream = 0; CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); deviceStreams.push_back(stream); } alternateStreams_[device] = std::move(deviceStreams); // Create cuBLAS handle cublasHandle_t blasHandle = 0; auto blasStatus = cublasCreate(&blasHandle); FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); blasHandles_[device] = blasHandle; size_t toAlloc = 0; if (useFraction_) { size_t devFree = 0; size_t devTotal = 0; CUDA_VERIFY(cudaMemGetInfo(&devFree, &devTotal)); toAlloc = (size_t) (tempMemFraction_ * devTotal); } else { toAlloc = tempMemSize_; } FAISS_ASSERT(memory_.count(device) == 0); memory_.emplace(device, std::unique_ptr( new StackDeviceMemory(device, toAlloc))); } cublasHandle_t StandardGpuResources::getBlasHandle(int device) { initializeForDevice(device); return blasHandles_[device]; } cudaStream_t StandardGpuResources::getDefaultStream(int device) { initializeForDevice(device); return defaultStreams_[device]; } std::vector StandardGpuResources::getAlternateStreams(int device) { initializeForDevice(device); return alternateStreams_[device]; } DeviceMemory& StandardGpuResources::getMemoryManager(int device) { initializeForDevice(device); return *memory_[device]; } std::pair StandardGpuResources::getPinnedMemory() { return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_); } cudaStream_t StandardGpuResources::getAsyncCopyStream(int device) { initializeForDevice(device); return asyncCopyStreams_[device]; } } } // namespace