// mmdeploy/csrc/device/cuda/cuda_device.h

// Copyright (c) OpenMMLab. All rights reserved.

#include <any>
#include <mutex>

#include "core/device_impl.h"
#include "core/types.h"
#include "cuda.h"
#include "cuda_runtime.h"

namespace mmdeploy {

// Callable that is handed a CUDA stream handle (cudaStream_t) and returns nothing.
// CudaKernelImpl stores one of these as its task.
using CudaTask = std::function<void(cudaStream_t)>;

// PlatformImpl specialization whose platform name is "cuda".
//
// Declares factories for buffer/stream/event implementations, three Copy
// overloads and accessors for a per-device default stream and allocator.
// Only the small inline members below are defined here; everything else is
// defined out of line.
class CudaPlatformImpl : public PlatformImpl {
 public:
  CudaPlatformImpl();

  // Intentionally leaks every PerDeviceData object instead of destroying it.
  //
  // By the time this destructor runs the CUDA driver may already have shut
  // down, so ownership of each per-device object is dropped without deleting
  // it and the driver is left to reclaim the resources.
  // FIXME: maybe a pair of global mmdeploy_init/deinit function would be a
  //  better solution
  ~CudaPlatformImpl() override {
    for (auto& holder : per_device_data_storage_) {
      // release() hands back the raw pointer; discarding it is the leak.
      (void)holder.release();
    }
  }

  // Name under which this platform identifies itself.
  const char* GetPlatformName() const noexcept override {
    return "cuda";
  }

  // Factories for the implementation objects bound to `device`.
  shared_ptr<BufferImpl> CreateBuffer(Device device) override;

  shared_ptr<StreamImpl> CreateStream(Device device) override;

  shared_ptr<EventImpl> CreateEvent(Device device) override;

  // Overload with a host pointer as source and a Buffer as destination.
  Result<void> Copy(const void* host_ptr, Buffer dst, size_t size, size_t dst_offset,
                    Stream stream) override;

  // Overload with a Buffer as source and a host pointer as destination.
  Result<void> Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset,
                    Stream stream) override;

  // Overload with Buffers on both sides, each with its own offset.
  Result<void> Copy(Buffer src, Buffer dst, size_t size, size_t src_offset, size_t dst_offset,
                    Stream stream) override;

  Result<Stream> GetDefaultStream(int32_t device_id) override;

  // Not an override of a PlatformImpl virtual (no `override` specifier).
  Allocator GetDefaultAllocator(int32_t device_id);

  // Builds a Device handle from this platform's id and the given device index.
  Device GetDevice(int device_id) {
    return Device(platform_id_, device_id);
  }

 private:
  // Helpers for the Copy overloads; defined out of line.
  // NOTE(review): judging by the names these validate sizes/offsets and
  // device agreement before CopyImpl runs -- confirm in the implementation.
  static bool CheckCopyParam(size_t src_size, size_t dst_size, size_t src_offset, size_t dst_offset,
                             size_t copy_size);

  static bool CheckCopyDevice(const Device& src, const Device& dst, const Device& st);

  static Result<void> CopyImpl(Device device, const void* src, void* dst, size_t src_size,
                               size_t dst_size, size_t src_offset, size_t dst_offset, size_t size,
                               Stream st);

  // Default stream and allocator belonging to one device index.
  //
  // Both accessors call init() before returning a reference to the stored
  // member. init() is defined out of line.
  // NOTE(review): init_flag_ suggests init() does its work only once via
  // std::call_once -- confirm in the implementation.
  class PerDeviceData {
   public:
    explicit PerDeviceData(int device_id) : device_id_(device_id) {}

    void init();

    Stream& default_stream() {
      init();
      return default_stream_;
    }

    Allocator& default_allocator() {
      init();
      return default_allocator_;
    }

   private:
    int device_id_;
    std::once_flag init_flag_;
    Stream default_stream_;
    Allocator default_allocator_;
  };

  // Owning storage for the per-device objects (leaked in the destructor).
  std::vector<std::unique_ptr<PerDeviceData>> per_device_data_storage_;
  // Non-owning pointers to PerDeviceData objects.
  std::vector<PerDeviceData*> per_device_data_;
};

// Returns a reference to a CudaPlatformImpl; defined out of line.
// NOTE(review): the `g` prefix suggests a process-wide instance -- confirm in the definition.
CudaPlatformImpl& gCudaPlatform();

// Forward declaration only; the definition is not part of this header.
// CudaBufferImpl holds a std::shared_ptr to this type.
class CudaDeviceMemory;

// BufferImpl for the CUDA platform.
//
// Holds a shared CudaDeviceMemory object plus an offset and a size, both of
// which start at zero. All member functions are defined out of line.
class CudaBufferImpl : public BufferImpl {
 public:
  explicit CudaBufferImpl(Device device);

  // Initialisation from an Allocator with an alignment request.
  // NOTE(review): presumably allocates `size` bytes through `allocator` -- confirm.
  Result<void> Init(size_t size, Allocator allocator, size_t alignment, uint64_t flags) override;

  // Initialisation from an existing native handle passed as shared_ptr<void>.
  // NOTE(review): presumably wraps caller-provided device memory -- confirm.
  Result<void> Init(size_t size, std::shared_ptr<void> native, uint64_t flags) override;

  // Produces another buffer implementation described by `offset` and `size`.
  // NOTE(review): presumably a view sharing memory_ with this buffer -- confirm.
  Result<BufferImplPtr> SubBuffer(size_t offset, size_t size, uint64_t flags) override;

  // Native pointer accessor; `ec` is an out-parameter for an error code.
  void* GetNative(ErrorCode* ec) override;

  Allocator GetAllocator() const override;

  // Size accessor; `ec` is an out-parameter for an error code.
  size_t GetSize(ErrorCode* ec) override;

 private:
  std::shared_ptr<CudaDeviceMemory> memory_;
  size_t offset_{0};
  size_t size_{0};
};

// StreamImpl for the CUDA platform, built around a cudaStream_t handle.
//
// All member functions are defined out of line. The data members carry
// default member initializers (matching CudaBufferImpl's offset_/size_) so
// that an object which has not gone through Init() yet holds a null stream
// handle and is not marked as owning it, rather than indeterminate values.
// A constructor mem-initializer for the same member still takes precedence.
class CudaStreamImpl : public StreamImpl {
 public:
  explicit CudaStreamImpl(Device device);

  ~CudaStreamImpl() override;

  // Initialisation without an external handle.
  // NOTE(review): presumably creates a new stream owned by this object -- confirm.
  Result<void> Init(uint64_t flags) override;

  // Initialisation from an existing native handle passed as shared_ptr<void>.
  // NOTE(review): presumably adopts a caller-provided stream kept alive via
  // external_ -- confirm.
  Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;

  Result<void> DependsOn(Event& event) override;

  Result<void> Query() override;

  Result<void> Wait() override;

  Result<void> Submit(Kernel& kernel) override;

  // Native handle accessor; `ec` is an out-parameter for an error code.
  void* GetNative(ErrorCode* ec) override;

 private:
  cudaStream_t stream_{};      // null handle until initialised
  bool owned_stream_{false};   // NOTE(review): presumably "destroy stream_ in dtor" -- confirm
  std::shared_ptr<void> external_;
};

// EventImpl for the CUDA platform, built around a cudaEvent_t handle.
//
// All member functions are defined out of line. The data members carry
// default member initializers (matching CudaBufferImpl's offset_/size_) so
// that an object which has not gone through Init() yet holds a null event
// handle and is not marked as owning it, rather than indeterminate values.
// A constructor mem-initializer for the same member still takes precedence.
class CudaEventImpl : public EventImpl {
 public:
  explicit CudaEventImpl(Device device);

  ~CudaEventImpl() override;

  // Initialisation without an external handle.
  // NOTE(review): presumably creates a new event owned by this object -- confirm.
  Result<void> Init(uint64_t flags) override;

  // Initialisation from an existing native handle passed as shared_ptr<void>.
  // NOTE(review): presumably adopts a caller-provided event kept alive via
  // external_ -- confirm.
  Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;

  Result<void> Query() override;

  Result<void> Record(Stream& stream) override;

  Result<void> Wait() override;

  // Native handle accessor; `ec` is an out-parameter for an error code.
  void* GetNative(ErrorCode* ec) override;

 private:
  cudaEvent_t event_{};       // null handle until initialised
  bool owned_event_{false};   // NOTE(review): presumably "destroy event_ in dtor" -- confirm
  std::shared_ptr<void> external_;
};

// KernelImpl for the CUDA platform; stores a CudaTask callable.
// Both member functions are defined out of line.
class CudaKernelImpl : public KernelImpl {
 public:
  explicit CudaKernelImpl(Device device, CudaTask task);

  // Native handle accessor; `ec` is an out-parameter for an error code.
  // NOTE(review): presumably exposes a pointer to task_ -- confirm in the implementation.
  void* GetNative(ErrorCode* ec) override;

 private:
  CudaTask task_;
};

class CudaDeviceGuard {
 public:
  explicit CudaDeviceGuard(Device device) : CudaDeviceGuard(device.device_id()) {}
  explicit CudaDeviceGuard(int device_id) : device_id_(device_id), prev_device_id_(-1) {
    CUcontext ctx{};
    cuCtxGetCurrent(&ctx);
    if (ctx) {
      cudaGetDevice(&prev_device_id_);
    }
    if (prev_device_id_ != device_id_) cudaSetDevice(device_id_);
  }
  ~CudaDeviceGuard() {
    if (prev_device_id_ >= 0 && prev_device_id_ != device_id_) {
      cudaSetDevice(prev_device_id_);
    }
  }

 private:
  int device_id_;
  int prev_device_id_;
};

}  // namespace mmdeploy
Merge sdk (#251) * check in cmake * move backend_ops to csrc/backend_ops * check in preprocess, model, some codebase and their c-apis * check in CMakeLists.txt * check in parts of test_csrc * commit everything else * add readme * update core's BUILD_INTERFACE directory * skip codespell on third_party * update trt_net and ort_net's CMakeLists * ignore clion's build directory * check in pybind11 * add onnx.proto. Remove MMDeploy's dependency on ncnn's source code * export MMDeployTargets only when MMDEPLOY_BUILD_SDK is ON * remove useless message * target include directory is wrong * change target name from mmdeploy_ppl_net to mmdeploy_pplnn_net * skip install directory * update project's cmake * remove useless code * set CMAKE_BUILD_TYPE to Release by force if it isn't set by user * update custom ops CMakeLists * pass object target's source lists * fix lint end-of-file * fix lint: trailing whitespace * fix codespell hook * remove bicubic_interpolate to csrc/backend_ops/ * set MMDEPLOY_BUILD_SDK OFF * change custom ops build command * add spdlog installation command * update docs on how to checkout pybind11 * move bicubic_interpolate to backend_ops/tensorrt directory * remove useless code * correct cmake * fix typo * fix typo * fix install directory * correct sdk's readme * set cub dir when cuda version < 11.0 * change directory where clang-format will apply to * fix build command * add .clang-format * change clang-format style from google to file * reformat csrc/backend_ops * format sdk's code * turn off clang-format for some files * add -Xcompiler=-fno-gnu-unique * fix trt topk initialize * check in config for sdk demo * update cmake script and csrc's readme * correct config's path * add cuda include directory, otherwise compile failed in case of tensorrt8.2 * clang-format onnx2ncnn.cpp Co-authored-by: zhangli <lzhang329@gmail.com> Co-authored-by: grimoire <yaoqian@sensetime.com> 2021-12-07 10:57:55 +08:00			`// Copyright (c) OpenMMLab. All rights reserved.`

			`#include <any>`
			`#include <mutex>`

			`#include "core/device_impl.h"`
			`#include "core/types.h"`
			`#include "cuda.h"`
			`#include "cuda_runtime.h"`

			`namespace mmdeploy {`

			`using CudaTask = std::function<void(cudaStream_t)>;`

			`class CudaPlatformImpl : public PlatformImpl {`
			`public:`
			`CudaPlatformImpl();`

Support Windows (#106) * minor changes * support windows * fix GCC build * fix lint * reformat * fix Windows build * fix GCC build * search backend ops for onnxruntime * fix lint * fix lint * code clean-up * code clean-up * fix clang build * fix trt support * fix cmake for ncnn * fix cmake for openvino * fix SDK Python API * handle ops for other backends (ncnn, trt) * handle SDK Python API library location * robustify linkage * fix cuda * minor fix for openvino & ncnn * use CMAKE_CUDA_ARCHITECTURES if set * fix cuda preprocessor * fix misc * fix pplnn & pplcv, drop support for pplcv<0.6.0 * robustify cmake * update build.md (#2) * build dynamic modules as module library & fix demo (partially) * fix candidate path for mmdeploy_python * move "enable CUDA" to cmake config for demo * refine demo cmake * add comment * fix ubuntu build * revert docs/en/build.md * fix C API * fix lint * Windows build doc (#3) * check in docs related to mmdeploy build on windows * update build guide on windows platform * update build guide on windows platform * make path of thirdparty libraries consistent * make path consistency * correct build command for custom ops * correct build command for sdk * update sdk build instructions * update doc * correct build command * fix lint * correct build command and fix lint Co-authored-by: lvhan <lvhan@pjlab.org> * trailing whitespace (#4) * minor fix * fix sr sdk model * fix type deduction * fix cudaFree after driver shutting down * update ppl.cv installation warning (#5) * fix device allocator threshold & fix lint * update doc (#6) * update ppl.cv installation warning * missing 'git clone' Co-authored-by: chenxin <chenxin2@sensetime.com> Co-authored-by: zhangli <zhangli@sensetime.com> Co-authored-by: lvhan028 <lvhan_028@163.com> Co-authored-by: lvhan <lvhan@pjlab.org> 2022-02-24 20:08:44 +08:00			`~CudaPlatformImpl() override {`
			`// The CUDA driver may have already shutdown before the platform dtor is called.`
			`// As a workaround, simply leak per device resources and let the driver handle it`
			`// FIXME: maybe a pair of global mmdeploy_init/deinit function would be a`
			`// better solution`
			`for (auto& data : per_device_data_storage_) {`
			`data.release();`
			`}`
			`}`

Merge sdk (#251) * check in cmake * move backend_ops to csrc/backend_ops * check in preprocess, model, some codebase and their c-apis * check in CMakeLists.txt * check in parts of test_csrc * commit everything else * add readme * update core's BUILD_INTERFACE directory * skip codespell on third_party * update trt_net and ort_net's CMakeLists * ignore clion's build directory * check in pybind11 * add onnx.proto. Remove MMDeploy's dependency on ncnn's source code * export MMDeployTargets only when MMDEPLOY_BUILD_SDK is ON * remove useless message * target include directory is wrong * change target name from mmdeploy_ppl_net to mmdeploy_pplnn_net * skip install directory * update project's cmake * remove useless code * set CMAKE_BUILD_TYPE to Release by force if it isn't set by user * update custom ops CMakeLists * pass object target's source lists * fix lint end-of-file * fix lint: trailing whitespace * fix codespell hook * remove bicubic_interpolate to csrc/backend_ops/ * set MMDEPLOY_BUILD_SDK OFF * change custom ops build command * add spdlog installation command * update docs on how to checkout pybind11 * move bicubic_interpolate to backend_ops/tensorrt directory * remove useless code * correct cmake * fix typo * fix typo * fix install directory * correct sdk's readme * set cub dir when cuda version < 11.0 * change directory where clang-format will apply to * fix build command * add .clang-format * change clang-format style from google to file * reformat csrc/backend_ops * format sdk's code * turn off clang-format for some files * add -Xcompiler=-fno-gnu-unique * fix trt topk initialize * check in config for sdk demo * update cmake script and csrc's readme * correct config's path * add cuda include directory, otherwise compile failed in case of tensorrt8.2 * clang-format onnx2ncnn.cpp Co-authored-by: zhangli <lzhang329@gmail.com> Co-authored-by: grimoire <yaoqian@sensetime.com> 2021-12-07 10:57:55 +08:00			`const char* GetPlatformName() const noexcept override { 
return "cuda"; }`

			`shared_ptr<BufferImpl> CreateBuffer(Device device) override;`

			`shared_ptr<StreamImpl> CreateStream(Device device) override;`

			`shared_ptr<EventImpl> CreateEvent(Device device) override;`

			`Result<void> Copy(const void* host_ptr, Buffer dst, size_t size, size_t dst_offset,`
			`Stream stream) override;`

			`Result<void> Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset,`
			`Stream stream) override;`

			`Result<void> Copy(Buffer src, Buffer dst, size_t size, size_t src_offset, size_t dst_offset,`
			`Stream stream) override;`

			`Result<Stream> GetDefaultStream(int32_t device_id) override;`

			`Allocator GetDefaultAllocator(int32_t device_id);`

			`Device GetDevice(int device_id) { return Device(platform_id_, device_id); }`

			`private:`
			`static bool CheckCopyParam(size_t src_size, size_t dst_size, size_t src_offset, size_t dst_offset,`
			`size_t copy_size);`

			`static bool CheckCopyDevice(const Device& src, const Device& dst, const Device& st);`

			`static Result<void> CopyImpl(Device device, const void* src, void* dst, size_t src_size,`
			`size_t dst_size, size_t src_offset, size_t dst_offset, size_t size,`
			`Stream st);`

			`class PerDeviceData {`
			`public:`
			`explicit PerDeviceData(int device_id) : device_id_(device_id) {}`
			`void init();`
			`Stream& default_stream() {`
			`init();`
			`return default_stream_;`
			`}`
			`Allocator& default_allocator() {`
			`init();`
			`return default_allocator_;`
			`}`

			`private:`
			`int device_id_;`
			`std::once_flag init_flag_;`
			`Stream default_stream_;`
			`Allocator default_allocator_;`
			`};`

			`std::vector<std::unique_ptr<PerDeviceData>> per_device_data_storage_;`
			`std::vector<PerDeviceData*> per_device_data_;`
			`};`

			`CudaPlatformImpl& gCudaPlatform();`

			`class CudaDeviceMemory;`

			`class CudaBufferImpl : public BufferImpl {`
			`public:`
			`explicit CudaBufferImpl(Device device);`

			`Result<void> Init(size_t size, Allocator allocator, size_t alignment, uint64_t flags) override;`

			`Result<void> Init(size_t size, std::shared_ptr<void> native, uint64_t flags) override;`

			`Result<BufferImplPtr> SubBuffer(size_t offset, size_t size, uint64_t flags) override;`

			`void* GetNative(ErrorCode* ec) override;`

			`Allocator GetAllocator() const override;`

			`size_t GetSize(ErrorCode* ec) override;`

			`private:`
			`std::shared_ptr<CudaDeviceMemory> memory_;`
			`size_t offset_{0};`
			`size_t size_{0};`
			`};`

			`class CudaStreamImpl : public StreamImpl {`
			`public:`
			`explicit CudaStreamImpl(Device device);`

			`~CudaStreamImpl() override;`

			`Result<void> Init(uint64_t flags) override;`

			`Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;`

			`Result<void> DependsOn(Event& event) override;`

			`Result<void> Query() override;`

			`Result<void> Wait() override;`

			`Result<void> Submit(Kernel& kernel) override;`

			`void* GetNative(ErrorCode* ec) override;`

			`private:`
			`cudaStream_t stream_;`
			`bool owned_stream_;`
			`std::shared_ptr<void> external_;`
			`};`

			`class CudaEventImpl : public EventImpl {`
			`public:`
			`explicit CudaEventImpl(Device device);`

			`~CudaEventImpl() override;`

			`Result<void> Init(uint64_t flags) override;`

			`Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;`

			`Result<void> Query() override;`

			`Result<void> Record(Stream& stream) override;`

			`Result<void> Wait() override;`

			`void* GetNative(ErrorCode* ec) override;`

			`private:`
			`cudaEvent_t event_;`
			`bool owned_event_;`
			`std::shared_ptr<void> external_;`
			`};`

			`class CudaKernelImpl : public KernelImpl {`
			`public:`
			`explicit CudaKernelImpl(Device device, CudaTask task);`

			`void* GetNative(ErrorCode* ec) override;`

			`private:`
			`CudaTask task_;`
			`};`

[Feature] Support DBNet, PANet and PSENet for SDK, with GPU aided post-processing (#526) * add SDK support for PANet * fix panet * fix panet * simplify panet * add PSENet support * fix-psenet * add CUDA post-processing for DBNet * fix dbnet * fix dbnet * add cpu support for PANet * fix panet * add CUDA support for PANet * fix panet * format * add cpu impls for PSENet * fix psenet * add cuda impl for PSENet * fix psenet * add param parsing * simplify impls * simplify impls * clean-up * fix lint * fix cuda-10 build * fix cuda-10 build 2022-05-31 21:24:09 +08:00			`class CudaDeviceGuard {`
			`public:`
			`explicit CudaDeviceGuard(Device device) : CudaDeviceGuard(device.device_id()) {}`
			`explicit CudaDeviceGuard(int device_id) : device_id_(device_id), prev_device_id_(-1) {`
			`CUcontext ctx{};`
			`cuCtxGetCurrent(&ctx);`
			`if (ctx) {`
			`cudaGetDevice(&prev_device_id_);`
			`}`
			`if (prev_device_id_ != device_id_) cudaSetDevice(device_id_);`
			`}`
			`~CudaDeviceGuard() {`
			`if (prev_device_id_ >= 0 && prev_device_id_ != device_id_) {`
			`cudaSetDevice(prev_device_id_);`
			`}`
			`}`

			`private:`
			`int device_id_;`
			`int prev_device_id_;`
			`};`

Merge sdk (#251) * check in cmake * move backend_ops to csrc/backend_ops * check in preprocess, model, some codebase and their c-apis * check in CMakeLists.txt * check in parts of test_csrc * commit everything else * add readme * update core's BUILD_INTERFACE directory * skip codespell on third_party * update trt_net and ort_net's CMakeLists * ignore clion's build directory * check in pybind11 * add onnx.proto. Remove MMDeploy's dependency on ncnn's source code * export MMDeployTargets only when MMDEPLOY_BUILD_SDK is ON * remove useless message * target include directory is wrong * change target name from mmdeploy_ppl_net to mmdeploy_pplnn_net * skip install directory * update project's cmake * remove useless code * set CMAKE_BUILD_TYPE to Release by force if it isn't set by user * update custom ops CMakeLists * pass object target's source lists * fix lint end-of-file * fix lint: trailing whitespace * fix codespell hook * remove bicubic_interpolate to csrc/backend_ops/ * set MMDEPLOY_BUILD_SDK OFF * change custom ops build command * add spdlog installation command * update docs on how to checkout pybind11 * move bicubic_interpolate to backend_ops/tensorrt directory * remove useless code * correct cmake * fix typo * fix typo * fix install directory * correct sdk's readme * set cub dir when cuda version < 11.0 * change directory where clang-format will apply to * fix build command * add .clang-format * change clang-format style from google to file * reformat csrc/backend_ops * format sdk's code * turn off clang-format for some files * add -Xcompiler=-fno-gnu-unique * fix trt topk initialize * check in config for sdk demo * update cmake script and csrc's readme * correct config's path * add cuda include directory, otherwise compile failed in case of tensorrt8.2 * clang-format onnx2ncnn.cpp Co-authored-by: zhangli <lzhang329@gmail.com> Co-authored-by: grimoire <yaoqian@sensetime.com> 2021-12-07 10:57:55 +08:00			`} // namespace mmdeploy`