/** * Copyright (c) 2015-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the CC-by-NC license found in the * LICENSE file in the root directory of this source tree. */ // Copyright 2004-present Facebook. All Rights Reserved. #pragma once #include #include #include namespace faiss { namespace gpu { /// Returns the current thread-local GPU device int getCurrentDevice(); /// Sets the current thread-local GPU device void setCurrentDevice(int device); /// Returns the number of available GPU devices int getNumDevices(); /// Synchronizes the CPU against all devices (equivalent to /// cudaDeviceSynchronize for each device) void synchronizeAllDevices(); /// Returns a cached cudaDeviceProp for the given device cudaDeviceProp& getDeviceProperties(int device); /// Returns the maximum number of threads available for the given GPU /// device int getMaxThreads(int device); /// Equivalent to getMaxThreads(getCurrentDevice()) int getMaxThreadsCurrentDevice(); /// Returns the maximum smem available for the given GPU device size_t getMaxSharedMemPerBlock(int device); /// Equivalent to getMaxSharedMemPerBlock(getCurrentDevice()) size_t getMaxSharedMemPerBlockCurrentDevice(); /// For a given pointer, returns whether or not it is located on /// a device (deviceId >= 0) or the host (-1). int getDeviceForAddress(const void* p); /// RAII object to set the current device, and restore the previous /// device upon destruction class DeviceScope { public: explicit DeviceScope(int device); ~DeviceScope(); private: int prevDevice_; }; /// RAII object to manage a cublasHandle_t class CublasHandleScope { public: CublasHandleScope(); ~CublasHandleScope(); cublasHandle_t get() { return blasHandle_; } private: cublasHandle_t blasHandle_; }; // RAII object to manage a cudaEvent_t class CudaEvent { public: /// Creates an event and records it in this stream explicit CudaEvent(cudaStream_t stream); CudaEvent(const CudaEvent& event) = delete; CudaEvent(CudaEvent&& event) noexcept; ~CudaEvent(); inline cudaEvent_t get() { return event_; } /// Wait on this event in this stream void streamWaitOnEvent(cudaStream_t stream); /// Have the CPU wait for the completion of this event void cpuWaitOnEvent(); CudaEvent& operator=(CudaEvent&& event) noexcept; CudaEvent& operator=(CudaEvent& event) = delete; private: cudaEvent_t event_; }; /// Wrapper to test return status of CUDA functions #if DEBUG #define CUDA_VERIFY(X) \ do { \ auto err = (X); \ FAISS_ASSERT(err == cudaSuccess); \ } while (0) #else #define CUDA_VERIFY(X) do { (X); } while (0) #endif /// Call for a collection of streams to wait on template void streamWaitBase(const L1& listWaiting, const L2& listWaitOn) { // For all the streams we are waiting on, create an event std::vector events; for (auto& stream : listWaitOn) { cudaEvent_t event; CUDA_VERIFY(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); CUDA_VERIFY(cudaEventRecord(event, stream)); events.push_back(event); } // For all the streams that are waiting, issue a wait for (auto& stream : listWaiting) { for (auto& event : events) { CUDA_VERIFY(cudaStreamWaitEvent(stream, event, 0)); } } for (auto& event : events) { CUDA_VERIFY(cudaEventDestroy(event)); } } /// These versions allow usage of initializer_list as arguments, since /// otherwise {...} doesn't have a type template void streamWait(const L1& a, const std::initializer_list& b) { streamWaitBase(a, b); } template void streamWait(const std::initializer_list& a, const L2& b) { streamWaitBase(a, b); } inline void streamWait(const std::initializer_list& a, const std::initializer_list& b) { streamWaitBase(a, b); } } } // namespace