// Copyright (c) OpenMMLab. All rights reserved. #include "cuda_device.h" #include #include "device/device_allocator.h" namespace mmdeploy { inline void* OffsetPtr(void* ptr, size_t offset) { return static_cast(static_cast(ptr) + offset); } inline const void* OffsetPtr(const void* ptr, size_t offset) { return static_cast(static_cast(ptr) + offset); } class CudaDeviceGuard { public: explicit CudaDeviceGuard(Device device) : CudaDeviceGuard(device.device_id()) {} explicit CudaDeviceGuard(int device_id) : device_id_(device_id), prev_device_id_(-1) { CUcontext ctx{}; cuCtxGetCurrent(&ctx); if (ctx) { cudaGetDevice(&prev_device_id_); } if (prev_device_id_ != device_id_) cudaSetDevice(device_id_); } ~CudaDeviceGuard() { if (prev_device_id_ >= 0 && prev_device_id_ != device_id_) { cudaSetDevice(prev_device_id_); } } private: int device_id_; int prev_device_id_; }; cudaMemcpyKind MapMemcpyKindToCuda(MemcpyKind kind) { switch (kind) { case MemcpyKind::HtoD: return cudaMemcpyHostToDevice; case MemcpyKind::DtoH: return cudaMemcpyDeviceToHost; case MemcpyKind::DtoD: return cudaMemcpyDeviceToDevice; default: return cudaMemcpyDefault; } } namespace cuda { class Mallocator : public AllocatorImpl { public: Block Allocate(size_t size) noexcept override { if (size == 0) { return Block{}; } Block block; if (auto status = cudaMalloc(&block.handle, size); status != cudaSuccess) { // log error } block.size = size; return block; } void Deallocate(Block& block) noexcept override { if (!block.handle) { return; } cudaFree(block.handle); } bool Owns(const Block& block) const noexcept override { return true; } }; Allocator CreateDefaultAllocator() { using namespace device_allocator; AllocatorImplPtr allocator = std::make_shared(); allocator = std::make_shared(allocator, "cudaMalloc"); allocator = std::make_shared(allocator, -1, .0); allocator = std::make_shared(allocator, "Tree"); INFO("Default CUDA allocator initialized"); return Access::create(allocator); } } // namespace cuda // ! this class doesn't handle device id class CudaDeviceMemory : public NonCopyable { public: explicit CudaDeviceMemory(int device_id) : device_id_(device_id), size_(), owned_block_() {} Result Init(size_t size, Allocator allocator, size_t alignment, uint64_t flags) { if (alignment != 1) { return Status(eNotSupported); } allocator_ = std::move(allocator); CudaDeviceGuard guard(device_id_); block_ = Access::get(allocator_).Allocate(size); if (!block_.handle) { return Status(eOutOfMemory); } size_ = size; owned_block_ = true; return success(); } Result Init(size_t size, std::shared_ptr data, uint64_t flags) { size_ = size; external_ = std::move(data); block_.handle = external_.get(); block_.size = size; owned_block_ = false; return success(); } ~CudaDeviceMemory() { if (block_.handle) { if (owned_block_) { CudaDeviceGuard guard(device_id_); Access::get(allocator_).Deallocate(block_); owned_block_ = false; } block_.handle = nullptr; } external_.reset(); size_ = 0; } size_t size() const { return size_; } void* data() const { return block_.handle; } const Allocator& allocator() const { return allocator_; } private: int device_id_; size_t size_; AllocatorImpl::Block block_; bool owned_block_; Allocator allocator_; std::shared_ptr external_; }; shared_ptr CudaPlatformImpl::CreateBuffer(Device device) { return std::make_shared(device); } shared_ptr CudaPlatformImpl::CreateStream(Device device) { return std::make_shared(device); } shared_ptr CudaPlatformImpl::CreateEvent(Device device) { return std::make_shared(device); } bool CudaPlatformImpl::CheckCopyDevice(const Device& src, const Device& dst, const Device& st) { return st.is_device() && (src.is_host() || src == st) && (dst.is_host() || dst == st); } Result CudaPlatformImpl::Copy(const void* host_ptr, Buffer dst, size_t size, size_t dst_offset, Stream stream) { if (!CheckCopyDevice(Device{0, 0}, dst.GetDevice(), stream.GetDevice())) { return Status(eInvalidArgument); } auto dst_ptr = dst.GetNative(); if (!dst_ptr) { return Status(eInvalidArgument); } // auto device = dst.GetDevice(); return CopyImpl(stream.GetDevice(), host_ptr, dst_ptr, size, dst.GetSize(), 0, dst_offset, size, stream); } Result CudaPlatformImpl::Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset, Stream stream) { if (!CheckCopyDevice(src.GetDevice(), Device{0, 0}, stream.GetDevice())) { return Status(eInvalidArgument); } auto src_ptr = src.GetNative(); if (!src_ptr) { return Status(eInvalidArgument); } // auto device = src.GetDevice(); return CopyImpl(stream.GetDevice(), src_ptr, host_ptr, src.GetSize(), size, src_offset, 0, size, stream); } Result CudaPlatformImpl::Copy(Buffer src, Buffer dst, size_t size, size_t src_offset, size_t dst_offset, Stream stream) { if (!CheckCopyDevice(src.GetDevice(), dst.GetDevice(), stream.GetDevice())) { return Status(eInvalidArgument); } auto src_ptr = src.GetNative(); auto dst_ptr = dst.GetNative(); if (!src_ptr || !dst_ptr) { return Status(eInvalidArgument); } return CopyImpl(stream.GetDevice(), src_ptr, dst_ptr, src.GetSize(), dst.GetSize(), src_offset, dst_offset, size, stream); } bool CudaPlatformImpl::CheckCopyParam(size_t src_size, size_t dst_size, size_t src_offset, size_t dst_offset, size_t copy_size) { if (src_offset + copy_size > src_size) { return false; } if (dst_offset + copy_size > dst_size) { return false; } return true; } Result CudaPlatformImpl::CopyImpl(Device device, const void* src, void* dst, size_t src_size, size_t dst_size, size_t src_offset, size_t dst_offset, size_t size, Stream st) { if (!CheckCopyParam(src_size, dst_size, src_offset, dst_offset, size)) { return Status(eInvalidArgument); } auto p_dst = OffsetPtr(dst, dst_offset); auto p_src = OffsetPtr(src, src_offset); CudaDeviceGuard guard(device); if (st) { auto cuda_stream = ::mmdeploy::GetNative(st); // TODO: how about default stream cudaStream_t(0)? if (!cuda_stream) { return Status(eInvalidArgument); } auto err = cudaMemcpyAsync(p_dst, p_src, size, cudaMemcpyDefault, cuda_stream); if (err != cudaSuccess) { return Status(eFail); } } else { auto err = cudaMemcpy(p_dst, p_src, size, cudaMemcpyDefault); if (err != cudaSuccess) { return Status(eFail); } } return success(); } Result CudaPlatformImpl::GetDefaultStream(int32_t device_id) { if (device_id >= per_device_data_.size()) { return Status(eInvalidArgument); } return per_device_data_[device_id]->default_stream(); } void CudaPlatformImpl::PerDeviceData::init() { std::call_once(init_flag_, [&] { CudaDeviceGuard guard(device_id_); default_stream_ = Stream(gCudaPlatform().GetDevice(device_id_)); default_allocator_ = cuda::CreateDefaultAllocator(); }); } CudaPlatformImpl::CudaPlatformImpl() { int count{}; if (auto err = cudaGetDeviceCount(&count); err != cudaSuccess) { ERROR("error getting device count: {}", cudaGetErrorString(err)); throw_exception(eFail); } per_device_data_storage_.reserve(count); per_device_data_.reserve(count); for (int device_id = 0; device_id < count; ++device_id) { per_device_data_storage_.push_back(std::make_unique(device_id)); per_device_data_.push_back(per_device_data_storage_.back().get()); } } Allocator CudaPlatformImpl::GetDefaultAllocator(int32_t device_id) { return per_device_data_[device_id]->default_allocator(); } //////////////////////////////////////////////////////////////////////////////// /// CudaStreamImpl CudaStreamImpl::CudaStreamImpl(Device device) : StreamImpl(device), stream_(), owned_stream_() {} CudaStreamImpl::~CudaStreamImpl() { CudaDeviceGuard guard(device_.device_id()); if (owned_stream_) { if (auto status = cudaStreamDestroy(stream_); status != cudaSuccess) { // TODO: signal error } owned_stream_ = false; } external_.reset(); } Result CudaStreamImpl::Init(uint64_t flags) { CudaDeviceGuard guard(device_); if (auto status = cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking); status != cudaSuccess) { return Status(eFail); } owned_stream_ = true; return success(); } Result CudaStreamImpl::Init(std::shared_ptr native, uint64_t flags) { // ! nullptr is valid for cudaStream_t external_ = std::move(native); stream_ = static_cast(external_.get()); owned_stream_ = false; return success(); } Result CudaStreamImpl::DependsOn(Event& event) { if (event.GetDevice() == device_) { CudaDeviceGuard guard(device_); auto native_event = ::mmdeploy::GetNative(event); cudaStreamWaitEvent(stream_, native_event, 0); return success(); } return Status(eInvalidArgument); } Result CudaStreamImpl::Query() { CudaDeviceGuard guard(device_); if (cudaStreamQuery(stream_) == cudaSuccess) { return success(); } else { return Status(eFail); } } Result CudaStreamImpl::Wait() { CudaDeviceGuard guard(device_); if (cudaStreamSynchronize(stream_) == cudaSuccess) { return success(); } else { return Status(eFail); } } Result CudaStreamImpl::Submit(Kernel& kernel) { auto task = ::mmdeploy::GetNative(kernel); if (task) { CudaDeviceGuard guard(device_); (*task)(stream_); return success(); } return Status(eInvalidArgument); } void* CudaStreamImpl::GetNative(ErrorCode* ec) { if (ec) *ec = ErrorCode::eSuccess; return stream_; } //////////////////////////////////////////////////////////////////////////////// /// CudaEventImpl CudaEventImpl::CudaEventImpl(Device device) : EventImpl(device), event_(), owned_event_() {} CudaEventImpl::~CudaEventImpl() { CudaDeviceGuard guard(device_.device_id()); if (owned_event_) { if (auto status = cudaEventDestroy(event_); status != cudaSuccess) { // TODO: signal error } owned_event_ = false; } external_.reset(); } Result CudaEventImpl::Init(uint64_t flags) { CudaDeviceGuard guard(device_); if (auto status = cudaEventCreateWithFlags(&event_, 0); status != cudaSuccess) { return Status(eFail); } owned_event_ = true; return success(); } Result CudaEventImpl::Init(std::shared_ptr native, uint64_t flags) { if (!native) { return Status(eInvalidArgument); } external_ = std::move(native); event_ = static_cast(external_.get()); owned_event_ = false; return success(); } Result CudaEventImpl::Query() { if (cudaEventQuery(event_) == cudaSuccess) { return success(); } else { return Status(eFail); } } Result CudaEventImpl::Record(Stream& stream) { if (stream.GetDevice() != device_) { return Status(eInvalidArgument); } CudaDeviceGuard guard(device_); auto native_stream = ::mmdeploy::GetNative(stream); cudaEventRecord(event_, native_stream); return success(); } Result CudaEventImpl::Wait() { CudaDeviceGuard guard(device_); if (cudaEventSynchronize(event_) == cudaSuccess) { return success(); } else { return Status(eFail); } } void* CudaEventImpl::GetNative(ErrorCode* ec) { if (ec) *ec = ErrorCode::eSuccess; return event_; } //////////////////////////////////////////////////////////////////////////////// /// CudaBufferImpl CudaBufferImpl::CudaBufferImpl(Device device) : BufferImpl(device) {} Result CudaBufferImpl::Init(size_t size, Allocator allocator, size_t alignment, uint64_t flags) { memory_ = std::make_shared(device_.device_id()); if (!allocator) { allocator = gCudaPlatform().GetDefaultAllocator(device_.device_id()); } OUTCOME_TRY(memory_->Init(size, std::move(allocator), alignment, flags)); size_ = size; return success(); } Result CudaBufferImpl::Init(size_t size, std::shared_ptr native, uint64_t flags) { memory_ = std::make_shared(device_.device_id()); OUTCOME_TRY(memory_->Init(size, std::move(native), flags)); size_ = size; return success(); } Result CudaBufferImpl::SubBuffer(size_t offset, size_t size, uint64_t flags) { if (offset_ + offset + size > memory_->size()) { return Status(eInvalidArgument); } auto impl = std::make_shared(device_); impl->memory_ = memory_; impl->offset_ = offset_ + offset; impl->size_ = size; return impl; } size_t CudaBufferImpl::GetSize(ErrorCode* ec) { return size_; } void* CudaBufferImpl::GetNative(ErrorCode* ec) { if (!memory_) { if (ec) *ec = eInvalidArgument; return nullptr; } if (ec) *ec = ErrorCode::eSuccess; return OffsetPtr(memory_->data(), offset_); } Allocator CudaBufferImpl::GetAllocator() const { return memory_->allocator(); } //////////////////////////////////////////////////////////////////////////////// /// CudaKernelImpl void* CudaKernelImpl::GetNative(ErrorCode* ec) { if (ec) *ec = ErrorCode::eSuccess; return &task_; } CudaKernelImpl::CudaKernelImpl(Device device, CudaTask task) : KernelImpl(device), task_(std::move(task)) {} //////////////////////////////////////////////////////////////////////////////// /// CudaPlatformRegisterer class CudaPlatformRegisterer { public: CudaPlatformRegisterer() { gPlatformRegistry().Register([] { return std::make_shared(); }); } }; CudaPlatformRegisterer g_cuda_platform_registerer; CudaPlatformImpl& gCudaPlatform() { static Platform platform("cuda"); return Access::get(platform); } } // namespace mmdeploy