// Copyright (c) OpenMMLab. All rights reserved. #include #include #include "core/device_impl.h" #include "core/types.h" #include "cuda.h" #include "cuda_runtime.h" namespace mmdeploy { using CudaTask = std::function; class CudaPlatformImpl : public PlatformImpl { public: CudaPlatformImpl(); const char* GetPlatformName() const noexcept override { return "cuda"; } shared_ptr CreateBuffer(Device device) override; shared_ptr CreateStream(Device device) override; shared_ptr CreateEvent(Device device) override; Result Copy(const void* host_ptr, Buffer dst, size_t size, size_t dst_offset, Stream stream) override; Result Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset, Stream stream) override; Result Copy(Buffer src, Buffer dst, size_t size, size_t src_offset, size_t dst_offset, Stream stream) override; Result GetDefaultStream(int32_t device_id) override; Allocator GetDefaultAllocator(int32_t device_id); Device GetDevice(int device_id) { return Device(platform_id_, device_id); } private: static bool CheckCopyParam(size_t src_size, size_t dst_size, size_t src_offset, size_t dst_offset, size_t copy_size); static bool CheckCopyDevice(const Device& src, const Device& dst, const Device& st); static Result CopyImpl(Device device, const void* src, void* dst, size_t src_size, size_t dst_size, size_t src_offset, size_t dst_offset, size_t size, Stream st); class PerDeviceData { public: explicit PerDeviceData(int device_id) : device_id_(device_id) {} void init(); Stream& default_stream() { init(); return default_stream_; } Allocator& default_allocator() { init(); return default_allocator_; } private: int device_id_; std::once_flag init_flag_; Stream default_stream_; Allocator default_allocator_; }; std::vector> per_device_data_storage_; std::vector per_device_data_; }; CudaPlatformImpl& gCudaPlatform(); class CudaDeviceMemory; class CudaBufferImpl : public BufferImpl { public: explicit CudaBufferImpl(Device device); Result Init(size_t size, Allocator allocator, size_t alignment, uint64_t flags) override; Result Init(size_t size, std::shared_ptr native, uint64_t flags) override; Result SubBuffer(size_t offset, size_t size, uint64_t flags) override; void* GetNative(ErrorCode* ec) override; Allocator GetAllocator() const override; size_t GetSize(ErrorCode* ec) override; private: std::shared_ptr memory_; size_t offset_{0}; size_t size_{0}; }; class CudaStreamImpl : public StreamImpl { public: explicit CudaStreamImpl(Device device); ~CudaStreamImpl() override; Result Init(uint64_t flags) override; Result Init(std::shared_ptr native, uint64_t flags) override; Result DependsOn(Event& event) override; Result Query() override; Result Wait() override; Result Submit(Kernel& kernel) override; void* GetNative(ErrorCode* ec) override; private: cudaStream_t stream_; bool owned_stream_; std::shared_ptr external_; }; class CudaEventImpl : public EventImpl { public: explicit CudaEventImpl(Device device); ~CudaEventImpl() override; Result Init(uint64_t flags) override; Result Init(std::shared_ptr native, uint64_t flags) override; Result Query() override; Result Record(Stream& stream) override; Result Wait() override; void* GetNative(ErrorCode* ec) override; private: cudaEvent_t event_; bool owned_event_; std::shared_ptr external_; }; class CudaKernelImpl : public KernelImpl { public: explicit CudaKernelImpl(Device device, CudaTask task); void* GetNative(ErrorCode* ec) override; private: CudaTask task_; }; } // namespace mmdeploy