mmdeploy/csrc/device/cuda/cuda_device.h

// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include <any>
#include <functional>
#include <memory>
#include <mutex>
#include <vector>

#include "core/device_impl.h"
#include "core/types.h"
#include "cuda.h"
#include "cuda_runtime.h"

namespace mmdeploy {

// A host-side callback executed against a given CUDA stream; used by
// CudaKernelImpl below to adapt arbitrary work items to the Kernel interface.
using CudaTask = std::function<void(cudaStream_t)>;

class CudaPlatformImpl : public PlatformImpl {
 public:
  CudaPlatformImpl();

  ~CudaPlatformImpl() override {
    // The CUDA driver may have already shut down by the time the platform
    // dtor runs. As a workaround, intentionally leak the per-device resources
    // and let the driver reclaim them.
    // FIXME: a pair of global mmdeploy_init/deinit functions may be a better
    // solution.
    for (auto& data : per_device_data_storage_) {
      data.release();
    }
  }

  const char* GetPlatformName() const noexcept override { return "cuda"; }

  shared_ptr<BufferImpl> CreateBuffer(Device device) override;

  shared_ptr<StreamImpl> CreateStream(Device device) override;

  shared_ptr<EventImpl> CreateEvent(Device device) override;

  Result<void> Copy(const void* host_ptr, Buffer dst, size_t size, size_t dst_offset,
                    Stream stream) override;

  Result<void> Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset,
                    Stream stream) override;

  Result<void> Copy(Buffer src, Buffer dst, size_t size, size_t src_offset, size_t dst_offset,
                    Stream stream) override;

  Result<Stream> GetDefaultStream(int32_t device_id) override;

  Allocator GetDefaultAllocator(int32_t device_id);

  Device GetDevice(int device_id) { return Device(platform_id_, device_id); }

 private:
  static bool CheckCopyParam(size_t src_size, size_t dst_size, size_t src_offset,
                             size_t dst_offset, size_t copy_size);

  static bool CheckCopyDevice(const Device& src, const Device& dst, const Device& st);

  static Result<void> CopyImpl(Device device, const void* src, void* dst, size_t src_size,
                               size_t dst_size, size_t src_offset, size_t dst_offset, size_t size,
                               Stream st);

  class PerDeviceData {
   public:
    explicit PerDeviceData(int device_id) : device_id_(device_id) {}

    void init();

    Stream& default_stream() {
      init();
      return default_stream_;
    }

    Allocator& default_allocator() {
      init();
      return default_allocator_;
    }

   private:
    int device_id_;
    std::once_flag init_flag_;
    Stream default_stream_;
    Allocator default_allocator_;
  };

  std::vector<std::unique_ptr<PerDeviceData>> per_device_data_storage_;
  std::vector<PerDeviceData*> per_device_data_;
};

CudaPlatformImpl& gCudaPlatform();
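
// Illustrative only: a minimal sketch of driving the platform singleton for a
// host-to-device copy. The names `data`, `buf` and `size` are hypothetical;
// the snippet assumes device 0 exists, `buf` is an initialized CUDA Buffer of
// at least `size` bytes, and the codebase's OUTCOME_TRY macro for Result.
//
//   auto& platform = gCudaPlatform();
//   Device cuda0 = platform.GetDevice(0);                    // (platform_id_, 0)
//   OUTCOME_TRY(auto stream, platform.GetDefaultStream(0));  // created lazily per device
//   OUTCOME_TRY(platform.Copy(data, buf, size, /*dst_offset=*/0, stream));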

class CudaDeviceMemory;

class CudaBufferImpl : public BufferImpl {
 public:
  explicit CudaBufferImpl(Device device);

  Result<void> Init(size_t size, Allocator allocator, size_t alignment, uint64_t flags) override;

  Result<void> Init(size_t size, std::shared_ptr<void> native, uint64_t flags) override;

  Result<BufferImplPtr> SubBuffer(size_t offset, size_t size, uint64_t flags) override;

  void* GetNative(ErrorCode* ec) override;

  Allocator GetAllocator() const override;

  size_t GetSize(ErrorCode* ec) override;

 private:
  std::shared_ptr<CudaDeviceMemory> memory_;
  size_t offset_{0};
  size_t size_{0};
};
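
// Illustrative only: given the members above, SubBuffer is expected to return
// a zero-copy view that shares the parent's CudaDeviceMemory block with a
// shifted offset_, rather than allocating new memory. `impl` is a
// hypothetical, initialized buffer of at least 256 bytes.
//
//   auto view = impl.SubBuffer(/*offset=*/128, /*size=*/128, /*flags=*/0);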

class CudaStreamImpl : public StreamImpl {
 public:
  explicit CudaStreamImpl(Device device);

  ~CudaStreamImpl() override;

  Result<void> Init(uint64_t flags) override;

  Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;

  Result<void> DependsOn(Event& event) override;

  Result<void> Query() override;

  Result<void> Wait() override;

  Result<void> Submit(Kernel& kernel) override;

  void* GetNative(ErrorCode* ec) override;

 private:
  cudaStream_t stream_;
  bool owned_stream_;               // true when stream_ was created here and must be destroyed
  std::shared_ptr<void> external_;  // keeps an externally provided native stream alive
};

class CudaEventImpl : public EventImpl {
 public:
  explicit CudaEventImpl(Device device);

  ~CudaEventImpl() override;

  Result<void> Init(uint64_t flags) override;

  Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;

  Result<void> Query() override;

  Result<void> Record(Stream& stream) override;

  Result<void> Wait() override;

  void* GetNative(ErrorCode* ec) override;

 private:
  cudaEvent_t event_;
  bool owned_event_;                // true when event_ was created here and must be destroyed
  std::shared_ptr<void> external_;  // keeps an externally provided native event alive
};
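
// Illustrative only: cross-stream synchronization, assuming Stream/Event
// wrapper objects `s1`, `s2` and `e` backed by the CUDA impls above, and that
// the wrappers forward to the Record/DependsOn methods declared here.
//
//   OUTCOME_TRY(e.Record(s1));     // capture s1's current progress in the event
//   OUTCOME_TRY(s2.DependsOn(e));  // s2 stalls until e is reached on s1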

class CudaKernelImpl : public KernelImpl {
 public:
  explicit CudaKernelImpl(Device device, CudaTask task);

  void* GetNative(ErrorCode* ec) override;

 private:
  CudaTask task_;
};
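
// Illustrative only: CudaKernelImpl adapts an arbitrary host callback to the
// Kernel abstraction. A hypothetical task that enqueues work on the stream it
// is handed might look like:
//
//   CudaTask task = [](cudaStream_t s) {
//     my_kernel<<<grid, block, 0, s>>>(/*args*/);  // `my_kernel` is hypothetical
//   };
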
} // namespace mmdeploy