// Copyright (c) OpenMMLab. All rights reserved.

#include <any>
#include <functional>
#include <memory>
#include <mutex>
#include <vector>

#include "core/device_impl.h"
#include "core/types.h"
#include "cuda.h"
#include "cuda_runtime.h"

namespace mmdeploy {
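
// Host-side callback type; a CudaTask receives the CUDA stream it is
// submitted on. CudaKernelImpl below wraps one so that it can be queued on a
// Stream like a regular kernel.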
using CudaTask = std::function<void(cudaStream_t)>;
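
// CUDA implementation of PlatformImpl. Creates device buffers, streams and
// events, implements the host<->device copy primitives, and keeps lazily
// initialized per-device defaults (stream & allocator).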
class CudaPlatformImpl : public PlatformImpl {
 public:
  CudaPlatformImpl();

  ~CudaPlatformImpl() override {
    // The CUDA driver may have already shut down by the time the platform
    // dtor runs. As a workaround, simply leak the per-device resources and
    // let the driver reclaim them.
    // FIXME: a pair of global mmdeploy_init/deinit functions might be a
    // better solution.
    for (auto& data : per_device_data_storage_) {
      data.release();
    }
  }

  const char* GetPlatformName() const noexcept override { return "cuda"; }

  shared_ptr<BufferImpl> CreateBuffer(Device device) override;

  shared_ptr<StreamImpl> CreateStream(Device device) override;

  shared_ptr<EventImpl> CreateEvent(Device device) override;

  Result<void> Copy(const void* host_ptr, Buffer dst, size_t size, size_t dst_offset,
                    Stream stream) override;

  Result<void> Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset,
                    Stream stream) override;

  Result<void> Copy(Buffer src, Buffer dst, size_t size, size_t src_offset, size_t dst_offset,
                    Stream stream) override;

  Result<Stream> GetDefaultStream(int32_t device_id) override;

  Allocator GetDefaultAllocator(int32_t device_id);

  Device GetDevice(int device_id) { return Device(platform_id_, device_id); }

 private:
  static bool CheckCopyParam(size_t src_size, size_t dst_size, size_t src_offset,
                             size_t dst_offset, size_t copy_size);

  static bool CheckCopyDevice(const Device& src, const Device& dst, const Device& st);

  static Result<void> CopyImpl(Device device, const void* src, void* dst, size_t src_size,
                               size_t dst_size, size_t src_offset, size_t dst_offset, size_t size,
                               Stream st);

  // Per-device defaults, created lazily on first use (guarded by init_flag_).
  class PerDeviceData {
   public:
    explicit PerDeviceData(int device_id) : device_id_(device_id) {}
    void init();
    Stream& default_stream() {
      init();
      return default_stream_;
    }
    Allocator& default_allocator() {
      init();
      return default_allocator_;
    }

   private:
    int device_id_;
    std::once_flag init_flag_;
    Stream default_stream_;
    Allocator default_allocator_;
  };

  std::vector<std::unique_ptr<PerDeviceData>> per_device_data_storage_;
  std::vector<PerDeviceData*> per_device_data_;
};

// Accessor for the global CUDA platform instance.
CudaPlatformImpl& gCudaPlatform();
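
// Illustrative usage of the platform interface (a sketch, not part of this
// header; assumes an outcome-style OUTCOME_TRY macro and success() helper are
// available for Result<T>, as elsewhere in the codebase):
//
//   Result<void> UploadExample(const void* host_ptr, Buffer dst, size_t size) {
//     auto& platform = gCudaPlatform();
//     OUTCOME_TRY(auto stream, platform.GetDefaultStream(0));
//     OUTCOME_TRY(platform.Copy(host_ptr, dst, size, 0, stream));
//     return success();
//   }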

class CudaDeviceMemory;
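
// Buffer backed by CudaDeviceMemory; SubBuffer() returns views that share the
// underlying allocation via offset_/size_.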
class CudaBufferImpl : public BufferImpl {
 public:
  explicit CudaBufferImpl(Device device);

  Result<void> Init(size_t size, Allocator allocator, size_t alignment, uint64_t flags) override;

  Result<void> Init(size_t size, std::shared_ptr<void> native, uint64_t flags) override;

  Result<BufferImplPtr> SubBuffer(size_t offset, size_t size, uint64_t flags) override;

  void* GetNative(ErrorCode* ec) override;

  Allocator GetAllocator() const override;

  size_t GetSize(ErrorCode* ec) override;

 private:
  std::shared_ptr<CudaDeviceMemory> memory_;
  size_t offset_{0};
  size_t size_{0};
};
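
// Stream wrapper that either owns a cudaStream_t or borrows an external one
// (owned_stream_/external_); Submit() runs a CudaTask-based kernel on it.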
class CudaStreamImpl : public StreamImpl {
 public:
  explicit CudaStreamImpl(Device device);

  ~CudaStreamImpl() override;

  Result<void> Init(uint64_t flags) override;

  Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;

  Result<void> DependsOn(Event& event) override;

  Result<void> Query() override;

  Result<void> Wait() override;

  Result<void> Submit(Kernel& kernel) override;

  void* GetNative(ErrorCode* ec) override;

 private:
  cudaStream_t stream_;
  bool owned_stream_;
  std::shared_ptr<void> external_;
};
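
// Event wrapper mirroring CudaStreamImpl: owns or borrows a cudaEvent_t,
// records on a stream, and can be queried or waited on.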
class CudaEventImpl : public EventImpl {
 public:
  explicit CudaEventImpl(Device device);

  ~CudaEventImpl() override;

  Result<void> Init(uint64_t flags) override;

  Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;

  Result<void> Query() override;

  Result<void> Record(Stream& stream) override;

  Result<void> Wait() override;

  void* GetNative(ErrorCode* ec) override;

 private:
  cudaEvent_t event_;
  bool owned_event_;
  std::shared_ptr<void> external_;
};
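
// Adapts a host-side CudaTask to the Kernel interface so that it can be
// submitted to a CudaStreamImpl.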
class CudaKernelImpl : public KernelImpl {
 public:
  explicit CudaKernelImpl(Device device, CudaTask task);

  void* GetNative(ErrorCode* ec) override;

 private:
  CudaTask task_;
};

}  // namespace mmdeploy