[Refactor] Decouple preprocess operation and transformation (#1353)

* refactor SDK registry
* fix lint
* decouple transform logic and operations
* data management
* improve data management
* improve data management
* context management
* fix ResizeOCR
* fix operation fallback logic
* fix MSVC build
* clean-up
* sync master
* fix lint
* Normalize - add `to_float`, merge `cvtcolor` operations
* fix macOS build
* rename
* cleanup
* fix lint
* fix macOS build
* fix MSVC build
* support elena
* fix
* fix
* optimize normalize
* fix
* fix MSVC build
* simplify
* profiler
* use `throw_exception`
* misc
* fix typo
2025-01-14 08:09:43 +08:00 · 2022-11-28 14:46:05 +08:00 · 2022-11-28 14:46:05 +08:00 · d77aeaa480
commit d77aeaa480
parent 3d1c135297
126 changed files with 2911 additions and 3724 deletions
--- a/csrc/mmdeploy/CMakeLists.txt
+++ b/csrc/mmdeploy/CMakeLists.txt
@ -13,6 +13,7 @@ if (MMDEPLOY_BUILD_SDK)
    add_subdirectory(device)
    add_subdirectory(graph)
    add_subdirectory(model)
+    add_subdirectory(operation)
    add_subdirectory(preprocess)
    add_subdirectory(net)
    add_subdirectory(codebase)
--- a/csrc/mmdeploy/codebase/mmaction/CMakeLists.txt
+++ b/csrc/mmdeploy/codebase/mmaction/CMakeLists.txt
@ -7,7 +7,8 @@ mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
 add_subdirectory(cpu)
 add_subdirectory(cuda)
 target_link_libraries(${PROJECT_NAME} PRIVATE
-    mmdeploy::transform
+    mmdeploy_operation
+    mmdeploy_transform
    mmdeploy_opencv_utils)

 add_library(mmdeploy::mmaction ALIAS ${PROJECT_NAME})
--- a/csrc/mmdeploy/codebase/mmaction/cpu/format_shape_impl.cpp
+++ b/csrc/mmdeploy/codebase/mmaction/cpu/format_shape_impl.cpp
@ -5,69 +5,63 @@

 using namespace std;

-namespace mmdeploy {
-namespace cpu {
+namespace mmdeploy::mmaction::cpu {

-class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
+class FormatShapeImpl : public FormatShapeOp {
 public:
-  explicit FormatShapeImpl(const Value& args) : ::mmdeploy::FormatShapeImpl(args) {}
+  explicit FormatShapeImpl(std::string input_format) : FormatShapeOp(std::move(input_format)) {}

 protected:
-  Result<Tensor> Format(const std::vector<Tensor>& tensors, int clip_len, int num_clips) {
-    int N = tensors.size();
-    int H = tensors[0].shape(1);
-    int W = tensors[0].shape(2);
-    int C = tensors[0].shape(3);
-
-    std::vector<Tensor> host_tensors;
-    host_tensors.reserve(N);
-    for (int i = 0; i < N; i++) {
-      OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensors[i], kHost, stream_));
-      host_tensors.push_back(std::move(src_tensor));
-    }
-    OUTCOME_TRY(stream_.Wait());
+  Result<void> apply(const std::vector<Tensor>& tensors, Tensor& output, int clip_len,
+                     int num_clips) override {
+    auto N = static_cast<int64_t>(tensors.size());
+    auto H = tensors[0].shape(1);
+    auto W = tensors[0].shape(2);
+    auto C = tensors[0].shape(3);

    TensorDesc desc = {kHost, DataType::kFLOAT, {N, H, W, C}};
    Tensor imgs(desc);
-    int offset = 0;
-    int n_item = H * W * C;
-    int copy_size = n_item * sizeof(float);
+    auto offset = 0UL;
+    auto n_item = H * W * C;
+    auto copy_size = n_item * sizeof(float);
    for (int i = 0; i < N; i++) {
-      auto src_buffer = host_tensors[i].buffer();
+      auto src_buffer = tensors[i].buffer();
      auto dst_buffer = imgs.buffer();
-      OUTCOME_TRY(stream_.Copy(src_buffer, dst_buffer, copy_size, 0, offset));
+      OUTCOME_TRY(stream().Copy(src_buffer, dst_buffer, copy_size, 0, offset));
      offset += copy_size;
    }
-    OUTCOME_TRY(stream_.Wait());
+
+    OUTCOME_TRY(stream().Wait());

    Tensor dst;
-    if (arg_.input_format == "NCHW") {
+    if (input_format_ == "NCHW") {
      OUTCOME_TRY(dst, FormatNCHW(imgs, clip_len, num_clips));
    }
-    if (arg_.input_format == "NCTHW") {
+    if (input_format_ == "NCTHW") {
      OUTCOME_TRY(dst, FormatNCTHW(imgs, clip_len, num_clips));
    }
    TensorShape expand_dim = dst.shape();
    expand_dim.insert(expand_dim.begin(), 1);
    dst.Reshape(expand_dim);
+    output = std::move(dst);

-    return dst;
+    return success();
  }

  Result<Tensor> FormatNCHW(Tensor& src, int clip_len, int num_clips) {
-    int N = src.shape(0);
-    int H = src.shape(1);
-    int W = src.shape(2);
-    int C = src.shape(3);
+    auto N = src.shape(0);
+    auto H = src.shape(1);
+    auto W = src.shape(2);
+    auto C = src.shape(3);
    return Transpose(src, {N, H, W, C}, {0, 3, 1, 2});
  };

  Result<Tensor> FormatNCTHW(Tensor& src, int clip_len, int num_clips) {
-    int N = src.shape(0);
-    int H = src.shape(1);
-    int W = src.shape(2);
-    int C = src.shape(3);
-    int L = clip_len;
+    auto N = src.shape(0);
+    auto H = src.shape(1);
+    auto W = src.shape(2);
+    auto C = src.shape(3);
+    auto L = clip_len;
    if (N % L != 0) {
      return Status(eInvalidArgument);
    }
@ -77,7 +71,7 @@ class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
    return Transpose(src, {M, L, H, W, C}, {0, 4, 1, 2, 3});
  };

-  Result<Tensor> Transpose(Tensor& src, const std::vector<int>& src_dims,
+  Result<Tensor> Transpose(Tensor& src, const TensorShape& src_dims,
                           const std::vector<int>& permutation) {
    Tensor dst(src.desc());
    TensorShape shape(src.shape().size());
@ -123,7 +117,8 @@ class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
  constexpr static Device kHost{0, 0};
 };

-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::FormatShapeImpl, (cpu, 0), FormatShapeImpl);
+MMDEPLOY_REGISTER_FACTORY_FUNC(FormatShapeOp, (cpu, 0), [](std::string input_format) {
+  return std::make_unique<FormatShapeImpl>(std::move(input_format));
+});

-}  // namespace cpu
-}  // namespace mmdeploy
+}  // namespace mmdeploy::mmaction::cpu
--- a/csrc/mmdeploy/codebase/mmaction/cuda/format_shape_impl.cpp
+++ b/csrc/mmdeploy/codebase/mmaction/cuda/format_shape_impl.cpp
@ -6,8 +6,7 @@

 using namespace std;

-namespace mmdeploy {
-namespace cuda {
+namespace mmdeploy::mmaction::cuda {

 #define CUDNN_CHECK(condition)                                                 \
  do {                                                                         \
@ -16,27 +15,28 @@ namespace cuda {
    }                                                                          \
  } while (0);

-class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
+class FormatShapeImpl : public FormatShapeOp {
 public:
-  explicit FormatShapeImpl(const Value& args) : ::mmdeploy::FormatShapeImpl(args) {
+  explicit FormatShapeImpl(std::string input_format) : FormatShapeOp(std::move(input_format)) {
    CUDNN_CHECK(cudnnCreate(&handle_));
-    CUDNN_CHECK(cudnnSetStream(handle_, (cudaStream_t)stream_.GetNative()));
+    CUDNN_CHECK(cudnnSetStream(handle_, GetNative<cudaStream_t>(stream())));
    CUDNN_CHECK(cudnnCreateTensorDescriptor(&src_desc_));
    CUDNN_CHECK(cudnnCreateTensorDescriptor(&dst_desc_));
  }

-  ~FormatShapeImpl() {
+  ~FormatShapeImpl() override {
    CUDNN_CHECK(cudnnDestroy(handle_));
    CUDNN_CHECK(cudnnDestroyTensorDescriptor(src_desc_));
    CUDNN_CHECK(cudnnDestroyTensorDescriptor(dst_desc_));
  }

 protected:
-  Result<Tensor> Format(const std::vector<Tensor>& tensors, int clip_len, int num_clips) {
-    int N = tensors.size();
-    int H = tensors[0].shape(1);
-    int W = tensors[0].shape(2);
-    int C = tensors[0].shape(3);
+  Result<void> apply(const std::vector<Tensor>& inputs, Tensor& output, int clip_len,
+                     int num_clips) override {
+    auto N = static_cast<int64_t>(inputs.size());
+    auto H = inputs[0].shape(1);
+    auto W = inputs[0].shape(2);
+    auto C = inputs[0].shape(3);

    auto t0 = std::chrono::high_resolution_clock::now();
    TensorDesc desc = {device_, DataType::kFLOAT, {N, H, W, C}};
@ -45,39 +45,39 @@ class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
    int n_item = H * W * C;
    int copy_size = n_item * sizeof(float);
    for (int i = 0; i < N; i++) {
-      auto src_buffer = tensors[i].buffer();
+      auto src_buffer = inputs[i].buffer();
      auto dst_buffer = imgs.buffer();
-      OUTCOME_TRY(stream_.Copy(src_buffer, dst_buffer, copy_size, 0, offset));
+      OUTCOME_TRY(stream().Copy(src_buffer, dst_buffer, copy_size, 0, offset));
      offset += copy_size;
    }

-    Tensor dst;
-    if (arg_.input_format == "NCHW") {
-      OUTCOME_TRY(dst, FormatNCHW(imgs, clip_len, num_clips));
+    // Tensor dst;
+    if (input_format_ == "NCHW") {
+      OUTCOME_TRY(output, FormatNCHW(imgs, clip_len, num_clips));
    }
-    if (arg_.input_format == "NCTHW") {
-      OUTCOME_TRY(dst, FormatNCTHW(imgs, clip_len, num_clips));
+    if (input_format_ == "NCTHW") {
+      OUTCOME_TRY(output, FormatNCTHW(imgs, clip_len, num_clips));
    }
-    TensorShape expand_dim = dst.shape();
+    TensorShape expand_dim = output.shape();
    expand_dim.insert(expand_dim.begin(), 1);
-    dst.Reshape(expand_dim);
+    output.Reshape(expand_dim);

-    return dst;
+    return success();
  }

  Result<Tensor> FormatNCHW(Tensor& src, int clip_len, int num_clips) {
-    int N = src.shape(0);
-    int H = src.shape(1);
-    int W = src.shape(2);
-    int C = src.shape(3);
+    auto N = src.shape(0);
+    auto H = src.shape(1);
+    auto W = src.shape(2);
+    auto C = src.shape(3);
    return Transpose(src, {N, H, W, C}, {0, 3, 1, 2});
  };

  Result<Tensor> FormatNCTHW(Tensor& src, int clip_len, int num_clips) {
-    int N = src.shape(0);
-    int H = src.shape(1);
-    int W = src.shape(2);
-    int C = src.shape(3);
+    auto N = src.shape(0);
+    auto H = src.shape(1);
+    auto W = src.shape(2);
+    auto C = src.shape(3);
    int L = clip_len;
    if (N % L != 0) {
      return Status(eInvalidArgument);
@ -88,7 +88,7 @@ class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
    return Transpose(src, {M, L, H, W, C}, {0, 4, 1, 2, 3});
  };

-  Result<Tensor> Transpose(Tensor& src, const std::vector<int>& src_dims,
+  Result<Tensor> Transpose(Tensor& src, const TensorShape& src_dims,
                           const std::vector<int>& permutation) {
    Tensor dst(src.desc());
    TensorShape shape(src.shape().size());
@ -104,9 +104,8 @@ class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
    return dst;
  }

-  void SetCudnnTensorDescriptor(const std::vector<int> src_dims,
-                                const std::vector<int>& permutation) {
-    int ndim = src_dims.size();
+  void SetCudnnTensorDescriptor(const TensorShape& src_dims, const std::vector<int>& permutation) {
+    auto ndim = src_dims.size();
    std::vector<int> dst_dims(ndim);
    for (int i = 0; i < ndim; i++) {
      dst_dims[i] = src_dims[permutation[i]];
@ -133,12 +132,13 @@ class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {

  constexpr static float one_{1.0};
  constexpr static float zero_{0.0};
-  cudnnHandle_t handle_;
-  cudnnTensorDescriptor_t src_desc_;
-  cudnnTensorDescriptor_t dst_desc_;
+  cudnnHandle_t handle_{};
+  cudnnTensorDescriptor_t src_desc_{};
+  cudnnTensorDescriptor_t dst_desc_{};
 };

-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::FormatShapeImpl, (cuda, 0), FormatShapeImpl);
+MMDEPLOY_REGISTER_FACTORY_FUNC(FormatShapeOp, (cuda, 0), [](std::string input_format) {
+  return std::make_unique<FormatShapeImpl>(std::move(input_format));
+});

-}  // namespace cuda
-}  // namespace mmdeploy
+}  // namespace mmdeploy::mmaction::cuda
--- a/csrc/mmdeploy/codebase/mmaction/format_shape.cpp
+++ b/csrc/mmdeploy/codebase/mmaction/format_shape.cpp
@ -2,81 +2,62 @@

 #include "mmdeploy/codebase/mmaction/format_shape.h"

-#include "mmdeploy/archive/json_archive.h"
 #include "mmdeploy/core/utils/device_utils.h"
+#include "mmdeploy/core/utils/formatter.h"

 using namespace std;

-namespace mmdeploy {
+namespace mmdeploy::mmaction {

-FormatShapeImpl::FormatShapeImpl(const Value& args) : TransformImpl(args) {
-  arg_.input_format = args.value("input_format", std::string(""));
-  if (arg_.input_format != "NCHW" && arg_.input_format != "NCTHW") {
+FormatShape::FormatShape(const Value& args) {
+  auto input_format = args.value("input_format", std::string(""));
+  if (input_format != "NCHW" && input_format != "NCTHW") {
    throw std::domain_error("'input_format' should be 'NCHW' or 'NCTHW'");
  }
 }

-Result<Value> FormatShapeImpl::Process(const Value& input) {
-  MMDEPLOY_DEBUG("input: {}", to_json(input).dump(2));
+Result<void> FormatShape::Apply(Value& data) {
+  MMDEPLOY_DEBUG("input: {}", data);

-  if (!input.is_array()) {
+  if (!data.is_array()) {
    MMDEPLOY_ERROR("input of format shape should be array");
    return Status(eInvalidArgument);
  }
-  if (!(input[0].contains("img") || input[0].contains("img"))) {
+  if (!(data[0].contains("imgs") || data[0].contains("img"))) {
    MMDEPLOY_ERROR("input should contains imgs or img");
    return Status(eInvalidArgument);
  }

-  int n_image = input.size();
-  int clip_len = input[0]["clip_len"].get<int>();
-  int num_clips = input[0]["num_clips"].get<int>();
+  int n_image = data.size();
+  int clip_len = data[0]["clip_len"].get<int>();
+  int num_clips = data[0]["num_clips"].get<int>();
  std::vector<Tensor> images;

-  if (input[0].contains("imgs")) {
-    int n_crop = input[0]["imgs"].size();
+  if (data[0].contains("imgs")) {
+    int n_crop = data[0]["imgs"].size();
    int total = n_image * n_crop;
    images.reserve(total);
    for (int i = 0; i < n_crop; i++) {
      for (int j = 0; j < n_image; j++) {
-        images.push_back(input[j]["imgs"][i].get<Tensor>());
+        images.push_back(data[j]["imgs"][i].get<Tensor>());
      }
    }
-  } else if (input[0].contains("img")) {
+  } else if (data[0].contains("img")) {
    images.reserve(n_image);
    for (int i = 0; i < n_image; i++) {
-      images.push_back(input[i]["img"].get<Tensor>());
+      images.push_back(data[i]["img"].get<Tensor>());
    }
  }

-  Value output;
-  OUTCOME_TRY(auto img, Format(images, clip_len, num_clips));
-  SetTransformData(output, "img", std::move(img));
-  return output;
+  Tensor dst;
+  OUTCOME_TRY(format_.Apply(images, dst, clip_len, num_clips));
+  data["img"] = std::move(dst);
+
+  return success();
 }

-class FormatShape : public Transform {
- public:
-  explicit FormatShape(const Value& args, int version = 0) : Transform(args) {
-    auto impl_creator = gRegistry<FormatShapeImpl>().Get(specified_platform_, version);
-    if (nullptr == impl_creator) {
-      MMDEPLOY_ERROR("'FormatShape' is not supported on '{}' platform", specified_platform_);
-      throw std::domain_error("'FormatShape' is not supported on specified platform");
-    }
-    impl_ = impl_creator->Create(args);
-  }
-  ~FormatShape() override = default;
+MMDEPLOY_REGISTER_TRANSFORM(FormatShape);

-  Result<Value> Process(const Value& input) override { return impl_->Process(input); }
+MMDEPLOY_DEFINE_REGISTRY(FormatShapeOp);

- protected:
-  std::unique_ptr<FormatShapeImpl> impl_;
-};
-
-MMDEPLOY_REGISTER_FACTORY_FUNC(Transform, (FormatShape, 0), [](const Value& config) {
-  return std::make_unique<FormatShape>(config, 0);
-});
-
-MMDEPLOY_DEFINE_REGISTRY(FormatShapeImpl);
-
-}  // namespace mmdeploy
+}  // namespace mmdeploy::mmaction
--- a/csrc/mmdeploy/codebase/mmaction/format_shape.h
+++ b/csrc/mmdeploy/codebase/mmaction/format_shape.h
@ -7,31 +7,34 @@
 #include <vector>

 #include "mmdeploy/core/tensor.h"
+#include "mmdeploy/operation/managed.h"
 #include "mmdeploy/preprocess/transform/transform.h"

-namespace mmdeploy {
+namespace mmdeploy::mmaction {

-class FormatShapeImpl : public TransformImpl {
+class FormatShapeOp : public operation::Operation {
 public:
-  explicit FormatShapeImpl(const Value& args);
-  ~FormatShapeImpl() override = default;
+  explicit FormatShapeOp(std::string input_format) : input_format_(std::move(input_format)){};

-  Result<Value> Process(const Value& input) override;
+  virtual Result<void> apply(const std::vector<Tensor>& inputs, Tensor& output, int clip_len,
+                             int num_clips) = 0;

 protected:
-  virtual Result<Tensor> Format(const std::vector<Tensor>& tensors, int clip_len,
-                                int num_clips) = 0;
-
- protected:
-  struct format_shape_arg_t {
-    std::string input_format;
-  };
-  using ArgType = struct format_shape_arg_t;
-  ArgType arg_;
+  std::string input_format_;
 };

-MMDEPLOY_DECLARE_REGISTRY(FormatShapeImpl, std::unique_ptr<FormatShapeImpl>(const Value& config));
+class FormatShape : public Transform {
+ public:
+  explicit FormatShape(const Value& args);

-}  // namespace mmdeploy
+  Result<void> Apply(Value& data) override;
+
+ private:
+  operation::Managed<FormatShapeOp> format_;
+};
+
+MMDEPLOY_DECLARE_REGISTRY(FormatShapeOp, std::unique_ptr<FormatShapeOp>(std::string input_format));
+
+}  // namespace mmdeploy::mmaction

 #endif
--- a/csrc/mmdeploy/codebase/mmocr/CMakeLists.txt
+++ b/csrc/mmdeploy/codebase/mmocr/CMakeLists.txt
@ -15,6 +15,7 @@ target_include_directories(${PROJECT_NAME} PRIVATE
        ${CMAKE_SOURCE_DIR}/third_party/clipper)
 target_link_libraries(${PROJECT_NAME} PRIVATE
        mmdeploy_opencv_utils
+        mmdeploy_operation
        mmdeploy::transform
        mmdeploy::core)
 add_library(mmdeploy::mmocr ALIAS ${PROJECT_NAME})
--- a/csrc/mmdeploy/codebase/mmocr/dbnet.cpp
+++ b/csrc/mmdeploy/codebase/mmocr/dbnet.cpp
@ -11,7 +11,9 @@
 #include "mmdeploy/experimental/module_adapter.h"
 #include "mmocr.h"

-namespace mmdeploy::mmocr {
+namespace mmdeploy {
+
+namespace mmocr {

 using std::string;
 using std::vector;
@ -137,4 +139,6 @@ MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMOCR, DBHead);

 MMDEPLOY_DEFINE_REGISTRY(DbHeadImpl);

-}  // namespace mmdeploy::mmocr
+}  // namespace mmocr
+
+}  // namespace mmdeploy
--- a/csrc/mmdeploy/codebase/mmocr/resize_ocr.cpp
+++ b/csrc/mmdeploy/codebase/mmocr/resize_ocr.cpp
@ -1,25 +1,20 @@
 // Copyright (c) OpenMMLab. All rights reserved.

-#include <set>
-
-#include "mmdeploy/archive/json_archive.h"
 #include "mmdeploy/archive/value_archive.h"
 #include "mmdeploy/core/registry.h"
 #include "mmdeploy/core/tensor.h"
-#include "mmdeploy/core/utils/device_utils.h"
 #include "mmdeploy/core/utils/formatter.h"
-#include "mmdeploy/preprocess/transform/resize.h"
+#include "mmdeploy/operation/managed.h"
+#include "mmdeploy/operation/vision.h"
 #include "mmdeploy/preprocess/transform/transform.h"
-#include "opencv2/imgproc.hpp"
-#include "opencv_utils.h"

 using namespace std;

 namespace mmdeploy::mmocr {

-class ResizeOCRImpl : public Module {
+class ResizeOCR : public transform::Transform {
 public:
-  explicit ResizeOCRImpl(const Value& args) noexcept {
+  explicit ResizeOCR(const Value& args) noexcept {
    height_ = args.value("height", height_);
    min_width_ = args.contains("min_width") && args["min_width"].is_number_integer()
                     ? args["min_width"].get<int>()
@ -33,31 +28,30 @@ class ResizeOCRImpl : public Module {
                   : backend_;
    img_pad_value_ = args.value("img_pad_value", img_pad_value_);
    width_downsample_ratio_ = args.value("width_downsample_ratio", width_downsample_ratio_);
-    stream_ = args["context"]["stream"].get<Stream>();
+
+    resize_ = operation::Managed<operation::Resize>::Create("bilinear");
+    pad_ = operation::Managed<operation::Pad>::Create("constant", img_pad_value_);
  }

-  ~ResizeOCRImpl() override = default;
+  ~ResizeOCR() override = default;

-  Result<Value> Process(const Value& input) override {
-    MMDEPLOY_DEBUG("input: {}", input);
+  Result<void> Apply(Value& data) override {
+    MMDEPLOY_DEBUG("input: {}", data);
    auto dst_height = height_;
    auto dst_min_width = min_width_;
    auto dst_max_width = max_width_;

    std::vector<int> img_shape;  // NHWC
-    from_value(input["img_shape"], img_shape);
+    from_value(data["img_shape"], img_shape);

    std::vector<int> ori_shape;  // NHWC
-    from_value(input["ori_shape"], ori_shape);
+    from_value(data["ori_shape"], ori_shape);

    auto ori_height = ori_shape[1];
    auto ori_width = ori_shape[2];
    auto valid_ratio = 1.f;

-    Device host{"cpu"};
-    auto _img = input["img"].get<Tensor>();
-    OUTCOME_TRY(auto img, MakeAvailableOnDevice(_img, host, stream_));
-    stream_.Wait().value();
+    auto img = data["img"].get<Tensor>();
    Tensor img_resize;
    if (keep_aspect_ratio_) {
      auto new_width = static_cast<int>(std::ceil(1.f * dst_height / ori_height * ori_width));
@ -71,55 +65,29 @@ class ResizeOCRImpl : public Module {
      if (dst_max_width > 0) {
        valid_ratio = std::min(1., 1. * new_width / dst_max_width);
        auto resize_width = std::min(dst_max_width, new_width);
-        img_resize = ResizeImage(img, dst_height, resize_width);
+        OUTCOME_TRY(resize_.Apply(img, img_resize, dst_height, resize_width));
        if (new_width < dst_max_width) {
-          img_resize = PadImage(img_resize, dst_height, dst_max_width);
+          auto pad_w = std::max(0, dst_max_width - resize_width);
+          OUTCOME_TRY(pad_.Apply(img_resize, img_resize, 0, 0, 0, pad_w));
        }
      } else {
-        img_resize = ResizeImage(img, dst_height, new_width);
+        OUTCOME_TRY(resize_.Apply(img, img_resize, dst_height, new_width));
      }
    } else {
-      img_resize = ResizeImage(img, dst_height, dst_max_width);
+      OUTCOME_TRY(resize_.Apply(img, img_resize, dst_height, dst_max_width));
    }
-    Value output = input;
-    output["img"] = img_resize;
-    output["resize_shape"] = to_value(img_resize.desc().shape);
-    output["pad_shape"] = output["resize_shape"];
-    output["valid_ratio"] = valid_ratio;
-    MMDEPLOY_DEBUG("output: {}", to_json(output).dump(2));
-    return output;
-  }

-  Tensor ResizeImage(const Tensor& img, int dst_h, int dst_w) {
-    TensorDesc desc = img.desc();
-    assert(desc.shape.size() == 4);
-    assert(desc.data_type == DataType::kINT8);
-    int h = desc.shape[1];
-    int w = desc.shape[2];
-    int c = desc.shape[3];
-    assert(c == 3 || c == 1);
-    cv::Mat src_mat, dst_mat;
-    if (3 == c) {  // rgb
-      src_mat = cv::Mat(h, w, CV_8UC3, const_cast<uint8_t*>(img.data<uint8_t>()));
-    } else {  // gray
-      src_mat = cv::Mat(h, w, CV_8UC1, const_cast<uint8_t*>(img.data<uint8_t>()));
-    }
-    cv::Size size{dst_w, dst_h};
-    cv::resize(src_mat, dst_mat, size, cv::INTER_LINEAR);
-    return Tensor({desc.device, desc.data_type, {1, dst_h, dst_w, c}, ""},
-                  {dst_mat.data, [mat = dst_mat](void* ptr) {}});
-  }
-
-  Tensor PadImage(const Tensor& src_img, int height, int width) {
-    cv::Mat src_mat = cpu::Tensor2CVMat(src_img);
-    cv::Mat dst_mat;
-    auto pad_h = std::max(0, height - src_mat.rows);
-    auto pad_w = std::max(0, width - src_mat.cols);
-    cv::copyMakeBorder(src_mat, dst_mat, 0, pad_h, 0, pad_w, cv::BORDER_CONSTANT, img_pad_value_);
-    return cpu::CVMat2Tensor(dst_mat);
+    data["img"] = img_resize;
+    data["resize_shape"] = to_value(img_resize.desc().shape);
+    data["pad_shape"] = data["resize_shape"];
+    data["valid_ratio"] = valid_ratio;
+    MMDEPLOY_DEBUG("output: {}", data);
+    return success();
  }

 protected:
+  operation::Managed<operation::Resize> resize_;
+  operation::Managed<operation::Pad> pad_;
  int height_{-1};
  int min_width_{-1};
  int max_width_{-1};
@ -127,33 +95,8 @@ class ResizeOCRImpl : public Module {
  float img_pad_value_{0};
  float width_downsample_ratio_{1. / 16};
  std::string backend_;
-  Stream stream_;
 };

-MMDEPLOY_CREATOR_SIGNATURE(ResizeOCRImpl, std::unique_ptr<ResizeOCRImpl>(const Value& config));
-
-MMDEPLOY_DEFINE_REGISTRY(ResizeOCRImpl);
-
-MMDEPLOY_REGISTER_FACTORY_FUNC(ResizeOCRImpl, (cpu, 0), [](const Value& config) {
-  return std::make_unique<ResizeOCRImpl>(config);
-});
-
-class ResizeOCR : public Transform {
- public:
-  explicit ResizeOCR(const Value& args) : Transform(args) {
-    impl_ = Instantiate<ResizeOCRImpl>("ResizeOCR", args);
-  }
-  ~ResizeOCR() override = default;
-
-  Result<Value> Process(const Value& input) override { return impl_->Process(input); }
-
- private:
-  std::unique_ptr<ResizeOCRImpl> impl_;
-  static const std::string name_;
-};
-
-MMDEPLOY_REGISTER_FACTORY_FUNC(Transform, (ResizeOCR, 0), [](const Value& config) {
-  return std::make_unique<ResizeOCR>(config);
-});
+MMDEPLOY_REGISTER_TRANSFORM(ResizeOCR);

 }  // namespace mmdeploy::mmocr
--- a/csrc/mmdeploy/codebase/mmpose/topdown_affine.cpp
+++ b/csrc/mmdeploy/codebase/mmpose/topdown_affine.cpp
@ -2,31 +2,29 @@

 #include <set>

-#include "mmdeploy/archive/json_archive.h"
 #include "mmdeploy/archive/value_archive.h"
 #include "mmdeploy/core/registry.h"
 #include "mmdeploy/core/tensor.h"
 #include "mmdeploy/core/utils/device_utils.h"
 #include "mmdeploy/core/utils/formatter.h"
-#include "mmdeploy/preprocess/transform/resize.h"
 #include "mmdeploy/preprocess/transform/transform.h"
 #include "opencv2/imgproc.hpp"
 #include "opencv_utils.h"

 using namespace std;

-namespace mmdeploy {
+namespace mmdeploy::mmpose {

-cv::Point2f operator*(cv::Point2f a, cv::Point2f b) {
+cv::Point2f operator*(const cv::Point2f& a, const cv::Point2f& b) {
  cv::Point2f c;
  c.x = a.x * b.x;
  c.y = a.y * b.y;
  return c;
 }

-class TopDownAffineImpl : public Module {
+class TopDownAffine : public transform::Transform {
 public:
-  explicit TopDownAffineImpl(const Value& args) noexcept {
+  explicit TopDownAffine(const Value& args) noexcept {
    use_udp_ = args.value("use_udp", use_udp_);
    backend_ = args.contains("backend") && args["backend"].is_string()
                   ? args["backend"].get<string>()
@ -36,13 +34,13 @@ class TopDownAffineImpl : public Module {
    from_value(args["image_size"], image_size_);
  }

-  ~TopDownAffineImpl() override = default;
+  ~TopDownAffine() override = default;

-  Result<Value> Process(const Value& input) override {
-    MMDEPLOY_DEBUG("top_down_affine input: {}", input);
+  Result<void> Apply(Value& data) override {
+    MMDEPLOY_DEBUG("top_down_affine input: {}", data);

    Device host{"cpu"};
-    auto _img = input["img"].get<Tensor>();
+    auto _img = data["img"].get<Tensor>();
    OUTCOME_TRY(auto img, MakeAvailableOnDevice(_img, host, stream_));
    stream_.Wait().value();
    auto src = cpu::Tensor2CVMat(img);
@ -51,18 +49,18 @@ class TopDownAffineImpl : public Module {
    vector<float> bbox;
    vector<float> c;  // center
    vector<float> s;  // scale
-    if (input.contains("center") && input.contains("scale")) {
+    if (data.contains("center") && data.contains("scale")) {
      // after mmpose v0.26.0
-      from_value(input["center"], c);
-      from_value(input["scale"], s);
+      from_value(data["center"], c);
+      from_value(data["scale"], s);
    } else {
      // before mmpose v0.26.0
-      from_value(input["bbox"], bbox);
+      from_value(data["bbox"], bbox);
      Box2cs(bbox, c, s);
    }
    // end prepare data

-    auto r = input["rotation"].get<float>();
+    auto r = data["rotation"].get<float>();

    cv::Mat dst;
    if (use_udp_) {
@ -77,13 +75,12 @@ class TopDownAffineImpl : public Module {
      cv::warpAffine(src, dst, trans, {image_size_[0], image_size_[1]}, cv::INTER_LINEAR);
    }

-    Value output = input;
-    output["img"] = cpu::CVMat2Tensor(dst);
-    output["img_shape"] = {1, image_size_[1], image_size_[0], dst.channels()};
-    output["center"] = to_value(c);
-    output["scale"] = to_value(s);
-    MMDEPLOY_DEBUG("output: {}", to_json(output).dump(2));
-    return output;
+    data["img"] = cpu::CVMat2Tensor(dst);
+    data["img_shape"] = {1, image_size_[1], image_size_[0], dst.channels()};
+    data["center"] = to_value(c);
+    data["scale"] = to_value(s);
+    MMDEPLOY_DEBUG("output: {}", data);
+    return success();
  }

  void Box2cs(vector<float>& box, vector<float>& center, vector<float>& scale) {
@ -169,31 +166,6 @@ class TopDownAffineImpl : public Module {
  Stream stream_;
 };

-MMDEPLOY_CREATOR_SIGNATURE(TopDownAffineImpl,
-                           std::unique_ptr<TopDownAffineImpl>(const Value& config));
+MMDEPLOY_REGISTER_TRANSFORM(TopDownAffine);

-MMDEPLOY_DEFINE_REGISTRY(TopDownAffineImpl);
-
-MMDEPLOY_REGISTER_FACTORY_FUNC(TopDownAffineImpl, (cpu, 0), [](const Value& config) {
-  return std::make_unique<TopDownAffineImpl>(config);
-});
-
-class TopDownAffine : public Transform {
- public:
-  explicit TopDownAffine(const Value& args) : Transform(args) {
-    impl_ = Instantiate<TopDownAffineImpl>("TopDownAffine", args);
-  }
-  ~TopDownAffine() override = default;
-
-  Result<Value> Process(const Value& input) override { return impl_->Process(input); }
-
- private:
-  std::unique_ptr<TopDownAffineImpl> impl_;
-  static const std::string name_;
-};
-
-MMDEPLOY_REGISTER_FACTORY_FUNC(Transform, (TopDownAffine, 0), [](const Value& config) {
-  return std::make_unique<TopDownAffine>(config);
-});
-
-}  // namespace mmdeploy
+}  // namespace mmdeploy::mmpose
--- a/csrc/mmdeploy/codebase/mmpose/topdown_get_bbox_center_scale.cpp
+++ b/csrc/mmdeploy/codebase/mmpose/topdown_get_bbox_center_scale.cpp
@ -2,45 +2,38 @@

 #include <vector>

-#include "mmdeploy/archive/json_archive.h"
 #include "mmdeploy/archive/value_archive.h"
 #include "mmdeploy/core/registry.h"
 #include "mmdeploy/core/tensor.h"
-#include "mmdeploy/core/utils/device_utils.h"
 #include "mmdeploy/core/utils/formatter.h"
-#include "mmdeploy/preprocess/transform/resize.h"
 #include "mmdeploy/preprocess/transform/transform.h"
-#include "opencv2/imgproc.hpp"
-#include "opencv_utils.h"

 using namespace std;

-namespace mmdeploy {
+namespace mmdeploy::mmpose {

-class TopDownGetBboxCenterScaleImpl : public TransformImpl {
+class TopDownGetBboxCenterScale : public transform::Transform {
 public:
-  TopDownGetBboxCenterScaleImpl(const Value& args) : TransformImpl(args) {
+  explicit TopDownGetBboxCenterScale(const Value& args) {
    padding_ = args.value("padding", 1.25);
    assert(args.contains("image_size"));
    from_value(args["image_size"], image_size_);
  }

-  ~TopDownGetBboxCenterScaleImpl() override = default;
-
-  Result<Value> Process(const Value& input) override {
-    Value output = input;
+  ~TopDownGetBboxCenterScale() override = default;

+  Result<void> Apply(Value& data) override {
    vector<float> bbox;
-    from_value(input["bbox"], bbox);
+    from_value(data["bbox"], bbox);

    vector<float> c;  // center
    vector<float> s;  // scale

    Box2cs(bbox, c, s, padding_, pixel_std_);
-    output["center"] = to_value(c);
-    output["scale"] = to_value(s);
+    data["center"] = to_value(c);
+    data["scale"] = to_value(s);

-    return output;
+    return success();
  }

  void Box2cs(vector<float>& box, vector<float>& center, vector<float>& scale, float padding,
@ -68,32 +61,6 @@ class TopDownGetBboxCenterScaleImpl : public TransformImpl {
  vector<int> image_size_;
 };

-MMDEPLOY_CREATOR_SIGNATURE(TopDownGetBboxCenterScaleImpl,
-                           std::unique_ptr<TopDownGetBboxCenterScaleImpl>(const Value& config));
+MMDEPLOY_REGISTER_TRANSFORM(TopDownGetBboxCenterScale);

-MMDEPLOY_DEFINE_REGISTRY(TopDownGetBboxCenterScaleImpl);
-
-MMDEPLOY_REGISTER_FACTORY_FUNC(TopDownGetBboxCenterScaleImpl, (cpu, 0), [](const Value& config) {
-  return std::make_unique<TopDownGetBboxCenterScaleImpl>(config);
-});
-
-class TopDownGetBboxCenterScale : public Transform {
- public:
-  explicit TopDownGetBboxCenterScale(const Value& args) : Transform(args) {
-    auto impl_creator = gRegistry<TopDownGetBboxCenterScaleImpl>().Get("cpu");
-    impl_ = impl_creator->Create(args);
-  }
-  ~TopDownGetBboxCenterScale() override = default;
-
-  Result<Value> Process(const Value& input) override { return impl_->Process(input); }
-
- private:
-  std::unique_ptr<TopDownGetBboxCenterScaleImpl> impl_;
-  static const std::string name_;
-};
-
-MMDEPLOY_REGISTER_FACTORY_FUNC(Transform, (TopDownGetBboxCenterScale, 0), [](const Value& config) {
-  return std::make_unique<TopDownGetBboxCenterScale>(config);
-});
-
-}  // namespace mmdeploy
+}  // namespace mmdeploy::mmpose
--- a/csrc/mmdeploy/core/device.h
+++ b/csrc/mmdeploy/core/device.h
@ -133,9 +133,7 @@ class MMDEPLOY_API Platform {
  std::shared_ptr<PlatformImpl> impl_;
 };

-Platform GetPlatform(int platform_id);
-
-Platform GetPlatform(const char* platform_name);
+MMDEPLOY_API const char* GetPlatformName(PlatformId id);

 class MMDEPLOY_API Stream {
 public:
--- a/csrc/mmdeploy/core/device_impl.cpp
+++ b/csrc/mmdeploy/core/device_impl.cpp
@ -54,6 +54,13 @@ Platform::Platform(int platform_id) {
  }
 }

+// Returns the name reported by the platform implementation registered under `id`,
+// or nullptr when the registry has no implementation for that id.
+const char* GetPlatformName(PlatformId id) {
+  auto platform_impl = gPlatformRegistry().GetPlatformImpl(id);
+  if (!platform_impl) {
+    return nullptr;
+  }
+  return platform_impl->GetPlatformName();
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Buffer

--- a/csrc/mmdeploy/core/mat.cpp
+++ b/csrc/mmdeploy/core/mat.cpp
@ -6,33 +6,33 @@ namespace mmdeploy::framework {

 Mat::Mat(int h, int w, PixelFormat format, DataType type, Device device, Allocator allocator)
    : format_(format), type_(type), width_(w), height_(h) {
-  int bytes_per_pixel = 0;
+  int bits_per_pixel = 0;
  switch (format) {
    case PixelFormat::kGRAYSCALE:
      channel_ = 1;
-      bytes_per_pixel = 8;
+      bits_per_pixel = 8;
      break;
    case PixelFormat::kNV12:  // fall through
    case PixelFormat::kNV21:
      channel_ = 1;
-      bytes_per_pixel = 12;
+      bits_per_pixel = 12;
      assert(w % 2 == 0);
      break;
    case PixelFormat::kBGR:  // fall through
    case PixelFormat::kRGB:
      channel_ = 3;
-      bytes_per_pixel = 24;
+      bits_per_pixel = 24;
      break;
    case PixelFormat::kBGRA:
      channel_ = 4;
-      bytes_per_pixel = 32;
+      bits_per_pixel = 32;
      break;
    default:
      throw_exception(eNotSupported);
  }

  size_ = height_ * width_ * channel_;
-  bytes_ = height_ * width_ * bytes_per_pixel / 8;
+  bytes_ = height_ * width_ * bits_per_pixel / 8;

  switch (type) {
    case DataType::kFLOAT:
--- a/csrc/mmdeploy/core/model.h
+++ b/csrc/mmdeploy/core/model.h
@ -60,7 +60,7 @@ class MMDEPLOY_API Model {
  Result<void> Init(const void* buffer, size_t size);

  /**
-   * @brief Return the model's meta info
+   * @brief Get model's meta info
   * @param name the name of a model in the SDK model file
   * @return
   */
--- a/csrc/mmdeploy/core/status_code.h
+++ b/csrc/mmdeploy/core/status_code.h
@ -168,7 +168,9 @@ using Error = SYSTEM_ERROR2_NAMESPACE::errored_status_code<StatusDomain>;
 using Exception = SYSTEM_ERROR2_NAMESPACE::status_error<StatusDomain>;

 template <typename T>
-using Result = OUTCOME_V2_NAMESPACE::experimental::status_result<T, Error>;
+// `Result<T>` is a distinct class template (not an alias) that derives from
+// `status_result<T, Error>` and inherits all of its constructors.
+struct Result : OUTCOME_V2_NAMESPACE::experimental::status_result<T, Error> {
+  using OUTCOME_V2_NAMESPACE::experimental::status_result<T, Error>::status_result;
+};

 #if MMDEPLOY_STATUS_USE_SOURCE_LOCATION
 [[noreturn]] inline void throw_exception(ErrorCode ec,
@ -186,7 +188,13 @@ using Result = OUTCOME_V2_NAMESPACE::experimental::status_result<T, Error>;
 #endif

 template <typename T>
-inline constexpr bool is_result_v = OUTCOME_V2_NAMESPACE::is_basic_result_v<T>;
+// Type trait: false for every type except specializations of `Result` (see the partial
+// specialization below). cv-qualified or reference types do not match.
+struct is_result : std::false_type {};
+
+template <typename T>
+struct is_result<Result<T>> : std::true_type {};
+
+// Shorthand for `is_result<T>::value`.
+template <typename T>
+inline constexpr bool is_result_v = is_result<T>::value;

 }  // namespace mmdeploy

--- a/csrc/mmdeploy/core/types.h
+++ b/csrc/mmdeploy/core/types.h
@ -17,19 +17,44 @@ enum class PixelFormat : int32_t {
  kGRAYSCALE,
  kNV12,
  kNV21,
-  kBGRA
+  kBGRA,
+  kCOUNT
 };

+
+
 enum class DataType : int32_t {
  kFLOAT,
  kHALF,
  kINT8,
  kINT32,
-  kINT64
+  kINT64,
+  kCOUNT
 };

 // clang-format on

+// Short aliases for the `PixelFormat` enumerators, usable through
+// `using namespace pixel_formats;`. Note that `kGRAY` stands for `PixelFormat::kGRAYSCALE`.
+namespace pixel_formats {
+
+constexpr PixelFormat kBGR = PixelFormat::kBGR;
+constexpr PixelFormat kRGB = PixelFormat::kRGB;
+constexpr PixelFormat kGRAY = PixelFormat::kGRAYSCALE;
+constexpr PixelFormat kNV12 = PixelFormat::kNV12;
+constexpr PixelFormat kNV21 = PixelFormat::kNV21;
+constexpr PixelFormat kBGRA = PixelFormat::kBGRA;
+
+}  // namespace pixel_formats
+
+// Short aliases for the `DataType` enumerators, usable through
+// `using namespace data_types;`.
+namespace data_types {
+
+constexpr DataType kFLOAT = DataType::kFLOAT;
+constexpr DataType kHALF = DataType::kHALF;
+constexpr DataType kINT8 = DataType::kINT8;
+constexpr DataType kINT32 = DataType::kINT32;
+constexpr DataType kINT64 = DataType::kINT64;
+
+}  // namespace data_types
+
 class NonCopyable {
 public:
  NonCopyable() = default;
--- a/csrc/mmdeploy/core/utils/device_utils.cpp
+++ b/csrc/mmdeploy/core/utils/device_utils.cpp
@ -13,6 +13,13 @@ Result<Mat> MakeAvailableOnDevice(const Mat& src, const Device& device, Stream&

  Mat dst{src.height(), src.width(), src.pixel_format(), src.type(), device};
  OUTCOME_TRY(stream.Copy(src.buffer(), dst.buffer(), dst.byte_size()));
+
+  // ! When the target device is different from stream's device (e.g. DtoH), insert a sync op as
+  //   computation on dst won't be synchronized with stream
+  if (device != stream.GetDevice()) {
+    OUTCOME_TRY(stream.Wait());
+  }
+
  return dst;
 }

@ -26,17 +33,13 @@ Result<Tensor> MakeAvailableOnDevice(const Tensor& src, const Device& device, St

  OUTCOME_TRY(stream.Copy(src.buffer(), dst.buffer(), src.byte_size()));

+  // ! When the target device is different from stream's device (e.g. DtoH), insert a sync op as
+  //   computation on dst won't be synchronized with stream
+  if (device != stream.GetDevice()) {
+    OUTCOME_TRY(stream.Wait());
+  }
+
  return dst;
 }

-SyncOnScopeExit::~SyncOnScopeExit() {
-  if (active_ && stream_) {
-    if (!stream_.Wait()) {
-      MMDEPLOY_ERROR("Implicit stream synchronization failed.");
-    } else {
-      MMDEPLOY_DEBUG("Implicit stream synchronization succeeded.");
-    }
-  }
-}
-
 }  // namespace mmdeploy::framework
--- a/csrc/mmdeploy/core/utils/device_utils.h
+++ b/csrc/mmdeploy/core/utils/device_utils.h
@ -29,24 +29,6 @@ MMDEPLOY_API Result<Mat> MakeAvailableOnDevice(const Mat& src, const Device& dev
 MMDEPLOY_API Result<Tensor> MakeAvailableOnDevice(const Tensor& src, const Device& device,
                                                  Stream& stream);

-// Calls stream.Wait() on destruction if active is true. This class is used to force a wait
-// operation before intermediate variables goes out of scope. Add variables in consideration to the
-// tailing parameter pack to ensure correctness (this make sure SyncOnScopeExit is created later
-// (thus will be destructed earlier) than the variables
-
-class MMDEPLOY_API SyncOnScopeExit {
- public:
-  template <typename... Ts>
-  explicit SyncOnScopeExit(Stream& stream, bool active, Ts&&...) noexcept
-      : stream_(stream), active_(active) {}
-
-  ~SyncOnScopeExit();
-
- private:
-  bool active_;
-  Stream& stream_;
-};
-
 }  // namespace mmdeploy::framework

 #endif  // MMDEPLOY_TRANSFORM_UTILS_H
--- a/csrc/mmdeploy/core/utils/scope_counter.h
+++ b/csrc/mmdeploy/core/utils/scope_counter.h
@ -1,48 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#ifndef MMDEPLOY_SRC_UITLS_SCOPECOUNTER_H_
-#define MMDEPLOY_SRC_UITLS_SCOPECOUNTER_H_
-
-#include <chrono>
-
-namespace mmdeploy {
-
-class ScopeCounter {
- public:
-  class State {
-    std::map<std::string, std::pair<double, int> > v;
-  };
-  ScopeCounter() : state_() {}
-  explicit ScopeCounter(State& state) : state_(&state) {}
-  ScopeCounter(const ScopeCounter&) = delete;
-  ScopeCounter(ScopeCounter&&) = delete;
-  ScopeCounter& operator=(const ScopeCounter&) = delete;
-  ScopeCounter& operator=(ScopeCounter&&) = delete;
-  void operator()(const std::string& tag) { operator()(tag.c_str()); }
-  void operator()(const char* tag) {
-    time_points_.emplace_back(tag, std::chrono::high_resolution_clock::now());
-  }
-  ~ScopeCounter() {
-    std::vector<std::pair<std::string, double> > durations;
-    for (int i = 1; i < time_points_.size(); ++i) {
-      auto& [n0, t0] = time_points_[i - 1];
-      auto& [n1, t1] = time_points_[i];
-      auto diff = std::chrono::duration<double, std::milli>(t1 - t0).count();
-      auto name = n0;
-      name += " -> ";
-      name += n1;
-      durations.emplace_back(name, diff);
-    }
-    if (state_) {
-    }
-  }
-
- private:
-  using time_point = std::chrono::high_resolution_clock::time_point;
-  std::vector<std::pair<std::string, time_point> > time_points_;
-  State* state_;
-};
-
-}  // namespace mmdeploy
-
-#endif  // MMDEPLOY_SRC_UITLS_SCOPECOUNTER_H_
--- a/csrc/mmdeploy/net/net_module.cpp
+++ b/csrc/mmdeploy/net/net_module.cpp
@ -11,7 +11,6 @@
 #include "mmdeploy/core/net.h"
 #include "mmdeploy/core/registry.h"
 #include "mmdeploy/core/utils/formatter.h"
-#include "mmdeploy/core/utils/scope_counter.h"
 #include "mmdeploy/experimental/module_adapter.h"

 using std::string;
--- a/csrc/mmdeploy/operation/CMakeLists.txt
+++ b/csrc/mmdeploy/operation/CMakeLists.txt
@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+project(mmdeploy_operation)
+
+# Sources of the operation module itself
+set(SRCS
+        operation.cpp
+        vision.cpp)
+mmdeploy_add_module(${PROJECT_NAME} LIBRARY "${SRCS}")
+
+# Backend subdirectories
+add_subdirectory(cpu)
+add_subdirectory(cuda)
+add_subdirectory(dummy)
+
+# Namespaced alias for consumers
+add_library(mmdeploy::operation ALIAS ${PROJECT_NAME})
--- a/csrc/mmdeploy/operation/cpu/CMakeLists.txt
+++ b/csrc/mmdeploy/operation/cpu/CMakeLists.txt
@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+project(mmdeploy_operation_cpu)
+
+# One translation unit per CPU operation
+set(SRCS
+        resize.cpp
+        cvtcolor.cpp
+        pad.cpp
+        to_float.cpp
+        hwc2chw.cpp
+        normalize.cpp
+        crop.cpp
+        flip.cpp)
+
+mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
+
+# The CPU operations are implemented on top of the OpenCV utility library
+target_link_libraries(${PROJECT_NAME}
+        PRIVATE
+        mmdeploy_operation
+        mmdeploy_opencv_utils)
--- a/csrc/mmdeploy/operation/cpu/crop.cpp
+++ b/csrc/mmdeploy/operation/cpu/crop.cpp
@ -0,0 +1,21 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "mmdeploy/operation/vision.h"
+#include "mmdeploy/utils/opencv/opencv_utils.h"
+
+namespace mmdeploy::operation::cpu {
+
+// CPU implementation of the `Crop` operation; delegates the actual work to the OpenCV
+// helper `mmdeploy::cpu::Crop`.
+class CropImpl : public Crop {
+ public:
+  // Crops `src` to the region described by `top`/`left`/`bottom`/`right` and stores the
+  // result in `dst`. The four bounds are forwarded unchanged to `mmdeploy::cpu::Crop`.
+  // NOTE(review): no bounds validation happens here - presumably the helper or the caller
+  // guarantees the region lies inside `src`; confirm.
+  Result<void> apply(const Tensor& src, Tensor& dst, int top, int left, int bottom,
+                     int right) override {
+    cv::Mat mat = mmdeploy::cpu::Tensor2CVMat(src);
+    cv::Mat cropped_mat = mmdeploy::cpu::Crop(mat, top, left, bottom, right);
+    dst = mmdeploy::cpu::CVMat2Tensor(cropped_mat);
+    return success();
+  }
+};
+
+// Register the factory that creates the `cpu` implementation of `Crop`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Crop, (cpu, 0), []() { return std::make_unique<CropImpl>(); });
+
+}  // namespace mmdeploy::operation::cpu
--- a/csrc/mmdeploy/operation/cpu/cvtcolor.cpp
+++ b/csrc/mmdeploy/operation/cpu/cvtcolor.cpp
@ -0,0 +1,20 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "mmdeploy/operation/vision.h"
+#include "mmdeploy/utils/opencv/opencv_utils.h"
+
+namespace mmdeploy::operation::cpu {
+
+// CPU implementation of the `CvtColor` operation; delegates to the OpenCV helper
+// `mmdeploy::cpu::CvtColor`.
+class CvtColorImpl : public CvtColor {
+ public:
+  // Converts `src` from its own pixel format (`src.pixel_format()`) to `dst_fmt` and
+  // stores the converted image, tagged with `dst_fmt`, in `dst`.
+  Result<void> apply(const Mat& src, Mat& dst, PixelFormat dst_fmt) override {
+    auto src_mat = mmdeploy::cpu::Mat2CVMat(src);
+    auto dst_mat = mmdeploy::cpu::CvtColor(src_mat, src.pixel_format(), dst_fmt);
+    dst = mmdeploy::cpu::CVMat2Mat(dst_mat, dst_fmt);
+    return success();
+  }
+};
+
+// Register the factory that creates the `cpu` implementation of `CvtColor`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(CvtColor, (cpu, 0), [] { return std::make_unique<CvtColorImpl>(); });
+
+}  // namespace mmdeploy::operation::cpu
--- a/csrc/mmdeploy/operation/cpu/flip.cpp
+++ b/csrc/mmdeploy/operation/cpu/flip.cpp
@ -0,0 +1,24 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "mmdeploy/operation/vision.h"
+#include "mmdeploy/utils/opencv/opencv_utils.h"
+
+namespace mmdeploy::operation::cpu {
+
+// CPU implementation of the `Flip` operation, built on `cv::flip`.
+class FlipImpl : public Flip {
+ public:
+  using Flip::Flip;
+
+  // Flips `src` and stores the result in `dst`. The inherited `flip_code_` is handed
+  // to `cv::flip` unchanged, so it follows OpenCV's flip-code convention.
+  Result<void> apply(const Tensor& src, Tensor& dst) override {
+    cv::Mat flipped;
+    cv::Mat input = mmdeploy::cpu::Tensor2CVMat(src);
+    cv::flip(input, flipped, flip_code_);
+    dst = mmdeploy::cpu::CVMat2Tensor(flipped);
+    return success();
+  }
+};
+
+// Register the factory that creates the `cpu` implementation of `Flip`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Flip, (cpu, 0), [](int flip_code) {
+  return std::make_unique<FlipImpl>(flip_code);
+});
+
+}  // namespace mmdeploy::operation::cpu
--- a/csrc/mmdeploy/operation/cpu/hwc2chw.cpp
+++ b/csrc/mmdeploy/operation/cpu/hwc2chw.cpp
@ -0,0 +1,28 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "mmdeploy/operation/vision.h"
+#include "mmdeploy/utils/opencv/opencv_utils.h"
+
+namespace mmdeploy::operation::cpu {
+
+// CPU implementation of the `HWC2CHW` layout conversion.
+class HWC2CHWImpl : public HWC2CHW {
+ public:
+  // Transposes `img` and stores the result in `dst` with shape {1, C, H, W}, where
+  // H, W and C are taken from dimensions 1, 2 and 3 of `img`.
+  // NOTE(review): dimension 0 of `img` is not inspected - presumably always 1; confirm.
+  Result<void> apply(const Tensor& img, Tensor& dst) override {
+    const auto dims = img.shape();
+    const auto h = dims[1];
+    const auto w = dims[2];
+    const auto c = dims[3];
+
+    auto transposed = mmdeploy::cpu::Transpose(mmdeploy::cpu::Tensor2CVMat(img));
+
+    auto chw = mmdeploy::cpu::CVMat2Tensor(transposed);
+    chw.Reshape({1, c, h, w});
+
+    dst = std::move(chw);
+    return success();
+  }
+};
+
+// Register the factory that creates the `cpu` implementation of `HWC2CHW`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(HWC2CHW, (cpu, 0), []() {
+  return std::make_unique<HWC2CHWImpl>();
+});
+
+}  // namespace mmdeploy::operation::cpu
--- a/csrc/mmdeploy/operation/cpu/normalize.cpp
+++ b/csrc/mmdeploy/operation/cpu/normalize.cpp
@ -0,0 +1,29 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "mmdeploy/operation/vision.h"
+#include "mmdeploy/utils/opencv/opencv_utils.h"
+
+namespace mmdeploy::operation::cpu {
+
+// CPU implementation of the `Normalize` operation; delegates to the OpenCV helper
+// `mmdeploy::cpu::Normalize`.
+class NormalizeImpl : public Normalize {
+ public:
+  // Stores the normalization parameters (mean, std, to_rgb) for later use in `apply`.
+  explicit NormalizeImpl(Param param) : param_(std::move(param)) {}
+
+  // Normalizes `src` with `param_.mean`, `param_.std` and `param_.to_rgb` and stores the
+  // result in `dst`.
+  // NOTE(review): the trailing `false` passed to the helper presumably disables in-place
+  // processing so that `src` is left untouched - confirm against the helper's signature.
+  Result<void> apply(const Tensor& src, Tensor& dst) override {
+    auto mat = mmdeploy::cpu::Tensor2CVMat(src);
+    auto dst_mat = mmdeploy::cpu::Normalize(mat, param_.mean, param_.std, param_.to_rgb, false);
+    auto output = mmdeploy::cpu::CVMat2Tensor(dst_mat);
+
+    dst = std::move(output);
+    return success();
+  }
+
+ protected:
+  // Parameters captured at construction time.
+  Param param_;
+};
+
+// Register the factory that creates the `cpu` implementation of `Normalize`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Normalize, (cpu, 0), [](const Normalize::Param& param) {
+  return std::make_unique<NormalizeImpl>(param);
+});
+
+}  // namespace mmdeploy::operation::cpu
--- a/csrc/mmdeploy/operation/cpu/pad.cpp
+++ b/csrc/mmdeploy/operation/cpu/pad.cpp
@ -0,0 +1,39 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include <map>
+
+#include "mmdeploy/operation/vision.h"
+#include "mmdeploy/utils/opencv/opencv_utils.h"
+
+namespace mmdeploy::operation::cpu {
+
+// CPU implementation of the `Pad` operation; delegates to the OpenCV helper
+// `mmdeploy::cpu::Pad`.
+class PadImpl : public Pad {
+ public:
+  // `border_type` selects the OpenCV border mode, `pad_val` is the fill value handed to
+  // the helper.
+  PadImpl(cv::BorderTypes border_type, float pad_val)
+      : border_type_(border_type), pad_val_(pad_val) {}
+
+  // Pads `src` by `top`/`left`/`bottom`/`right` using the configured border mode and
+  // value, and stores the result in `dst`.
+  Result<void> apply(const Tensor& src, Tensor& dst, int top, int left, int bottom,
+                     int right) override {
+    cv::Mat dst_mat = mmdeploy::cpu::Pad(mmdeploy::cpu::Tensor2CVMat(src), top, left, bottom, right,
+                                         border_type_, pad_val_);
+    dst = mmdeploy::cpu::CVMat2Tensor(dst_mat);
+    return success();
+  }
+
+ private:
+  cv::BorderTypes border_type_;
+  float pad_val_;
+};
+
+// Factory: translates the textual border type ("constant", "edge", "reflect",
+// "symmetric") into the matching OpenCV border mode and builds a PadImpl.
+// `std::map::at` throws std::out_of_range for any other name.
+static auto Create(const string_view& border_type, float pad_val) {
+  static const std::map<string_view, cv::BorderTypes> border_map{
+      {"constant", cv::BORDER_CONSTANT},
+      {"edge", cv::BORDER_REPLICATE},
+      {"reflect", cv::BORDER_REFLECT_101},
+      {"symmetric", cv::BORDER_REFLECT}};
+  return std::make_unique<PadImpl>(border_map.at(border_type), pad_val);
+}
+
+// Register `Create` as the factory for the `cpu` implementation of `Pad`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Pad, (cpu, 0), Create);
+
+}  // namespace mmdeploy::operation::cpu
--- a/csrc/mmdeploy/operation/cpu/resize.cpp
+++ b/csrc/mmdeploy/operation/cpu/resize.cpp
@ -0,0 +1,27 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "mmdeploy/operation/vision.h"
+#include "mmdeploy/utils/opencv/opencv_utils.h"
+
+namespace mmdeploy::operation::cpu {
+
+// CPU implementation of the `Resize` operation; delegates to the OpenCV helper
+// `mmdeploy::cpu::Resize`.
+class ResizeImpl : public Resize {
+ public:
+  // `interp` is the interpolation mode name; it is stored as-is and interpreted by the
+  // helper on every call to `apply`.
+  ResizeImpl(std::string interp) : interp_(std::move(interp)) {}
+
+  // Resizes `src` to `dst_h` x `dst_w` with the configured interpolation and stores the
+  // result in `dst`.
+  Result<void> apply(const Tensor& src, Tensor& dst, int dst_h, int dst_w) override {
+    auto src_mat = mmdeploy::cpu::Tensor2CVMat(src);
+    auto dst_mat = mmdeploy::cpu::Resize(src_mat, dst_h, dst_w, interp_);
+    dst = mmdeploy::cpu::CVMat2Tensor(dst_mat);
+    return success();
+  }
+
+ private:
+  std::string interp_;
+};
+
+// Register the factory that creates the `cpu` implementation of `Resize`; the
+// string_view argument is copied into an owning std::string.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Resize, (cpu, 0), [](const string_view& interp) {
+  return std::make_unique<ResizeImpl>(std::string(interp));
+});
+
+}  // namespace mmdeploy::operation::cpu
--- a/csrc/mmdeploy/operation/cpu/to_float.cpp
+++ b/csrc/mmdeploy/operation/cpu/to_float.cpp
@ -0,0 +1,42 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include <map>
+
+#include "mmdeploy/operation/vision.h"
+#include "mmdeploy/utils/opencv/opencv_utils.h"
+
+namespace mmdeploy::operation::cpu {
+
+// CPU implementation of the `ToFloat` operation.
+class ToFloatImpl : public ToFloat {
+ public:
+  // Produces a float version of `src` in `dst`:
+  //  - kFLOAT input is passed through (`dst = src`);
+  //  - kINT8 input is converted element-wise, with the bytes read as unsigned 8-bit
+  //    values (CV_8U);
+  //  - any other type, or an element count that does not fit in `int`, raises
+  //    eNotSupported via throw_exception.
+  Result<void> apply(const Tensor& src, Tensor& dst) override {
+    switch (src.desc().data_type) {
+      case DataType::kFLOAT:
+        dst = src;
+        return success();
+
+      case DataType::kINT8: {
+        const auto count = src.size();
+        if (count > std::numeric_limits<int>::max()) {
+          throw_exception(eNotSupported);
+        }
+        const auto cols = static_cast<int>(count);
+
+        // View the source bytes as a 1 x N matrix without copying them.
+        cv::Mat bytes(1, cols, CV_8U, const_cast<void*>(src.data()));
+
+        auto float_desc = src.desc();
+        float_desc.data_type = DataType::kFLOAT;
+        Tensor converted(float_desc);
+
+        // The destination matrix wraps the new tensor's storage, so convertTo writes
+        // straight into it.
+        cv::Mat floats(1, cols, CV_32F, converted.data());
+        bytes.convertTo(floats, CV_32F);
+
+        dst = std::move(converted);
+        return success();
+      }
+
+      default:
+        throw_exception(eNotSupported);
+    }
+  }
+};
+
+// Register the factory that creates the `cpu` implementation of `ToFloat`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(ToFloat, (cpu, 0), []() {
+  return std::make_unique<ToFloatImpl>();
+});
+
+}  // namespace mmdeploy::operation::cpu
--- a/csrc/mmdeploy/operation/cuda/CMakeLists.txt
+++ b/csrc/mmdeploy/operation/cuda/CMakeLists.txt
@ -0,0 +1,28 @@
+# Build the CUDA operations only when "cuda" is one of the requested target devices
+if (NOT ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES))
+    return()
+endif ()
+
+project(mmdeploy_operation_cuda CUDA CXX)
+
+# Several operations are implemented on top of ppl.cv
+find_package(pplcv REQUIRED)
+
+set(SRCS
+        resize.cpp
+        cvtcolor.cpp
+        pad.cpp
+        to_float.cpp
+        cast.cu
+        hwc2chw.cpp
+        transpose.cu
+        normalize.cpp
+        normalize.cu
+        crop.cpp
+        crop.cu
+        flip.cpp)
+
+mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
+
+target_link_libraries(${PROJECT_NAME}
+        PRIVATE
+        mmdeploy_operation
+        ${PPLCV_LIBRARIES})
+target_include_directories(${PROJECT_NAME}
+        PRIVATE
+        ${CUDA_TOOLKIT_ROOT_DIR}/include
+        ${PPLCV_INCLUDE_DIRS})
--- a/csrc/mmdeploy/operation/cuda/cast.cu
+++ b/csrc/mmdeploy/operation/cuda/cast.cu
@ -0,0 +1,28 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include <cstdint>
+
+namespace mmdeploy {
+namespace operation {
+namespace cuda {
+
+// Element-wise conversion kernel: dst[i] = static_cast<To>(src[i]) for every i in [0, n).
+// Each thread starts at its global index and advances by the total number of launched
+// threads (grid-stride loop), so the result does not depend on the launch configuration.
+template <typename From, typename To>
+__global__ void _Cast(const From* src, To* dst, size_t n) {
+  const auto idx = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x;
+  // Widen before multiplying: `blockDim.x * gridDim.x` is a 32-bit product that can wrap,
+  // in the worst case to 0, which would keep the loop from ever advancing.
+  const auto step = static_cast<size_t>(blockDim.x) * gridDim.x;
+  for (size_t i = idx; i < n; i += step) {
+    dst[i] = static_cast<To>(src[i]);
+  }
+}
+
+// Enqueues the conversion of `n` elements from `src` to `dst` on `stream`.
+// The call does not synchronize with the stream and does nothing when `n` is 0.
+// NOTE(review): `src` and `dst` are presumably device-accessible pointers; confirm at call sites.
+template <typename From, typename To>
+void Cast(const From* src, To* dst, size_t n, cudaStream_t stream) {
+  if (n == 0) {
+    return;  // a launch with zero blocks is an invalid configuration
+  }
+  constexpr size_t n_threads = 256;
+  constexpr size_t max_blocks = 2147483647;  // upper limit of gridDim.x
+  size_t n_blocks = (n + n_threads - 1) / n_threads;
+  if (n_blocks > max_blocks) {
+    n_blocks = max_blocks;  // the grid-stride loop covers the remaining elements
+  }
+  _Cast<<<static_cast<unsigned int>(n_blocks), static_cast<unsigned int>(n_threads), 0, stream>>>(
+      src, dst, n);
+}
+
+template void Cast(const uint8_t*, float*, size_t, cudaStream_t);
+
+}  // namespace cuda
+}  // namespace operation
+}  // namespace mmdeploy
--- a/csrc/mmdeploy/operation/cuda/crop.cpp
+++ b/csrc/mmdeploy/operation/cuda/crop.cpp
@ -0,0 +1,68 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include <cuda_runtime.h>
+
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/operation/vision.h"
+
+namespace mmdeploy::operation::cuda {
+
+namespace impl {
+
+// Crop launcher, declared here and defined in a separate translation unit.
+template <typename T, int channels>
+void Crop(const T* src, int src_w, T* dst, int dst_h, int dst_w, int offset_h, int offset_w,
+          cudaStream_t stream);
+
+}
+
+// CUDA implementation of the `Crop` operation.
+class CropImpl : public Crop {
+ public:
+  // Crops the region [top, bottom] x [left, right] (both ends inclusive) out of `src` and
+  // stores it in a newly allocated tensor that is moved into `dst`. The work is enqueued on
+  // the operation's stream. Only kINT8/kFLOAT data with 1 or 3 channels is handled; anything
+  // else is logged and reported as eNotSupported.
+  Result<void> apply(const Tensor& src, Tensor& dst, int top, int left, int bottom,
+                     int right) override {
+    auto native_stream = GetNative<cudaStream_t>(stream());
+    auto src_desc = src.desc();
+
+    // `bottom` and `right` are inclusive indices, hence the +1.
+    const int out_h = bottom - top + 1;
+    const int out_w = right - left + 1;
+    const int channels = src_desc.shape[3];
+    const auto elem_type = src_desc.data_type;
+
+    TensorShape out_shape{1, out_h, out_w, src_desc.shape[3]};
+    TensorDesc out_desc{device(), elem_type, out_shape, src_desc.name};
+    Tensor cropped{out_desc};
+
+    if (elem_type == DataType::kINT8) {
+      auto in = src.data<uint8_t>();
+      auto out = cropped.data<uint8_t>();
+      switch (channels) {
+        case 3:
+          impl::Crop<uint8_t, 3>(in, src_desc.shape[2], out, out_h, out_w, top, left,
+                                 native_stream);
+          break;
+        case 1:
+          impl::Crop<uint8_t, 1>(in, src_desc.shape[2], out, out_h, out_w, top, left,
+                                 native_stream);
+          break;
+        default:
+          MMDEPLOY_ERROR("unsupported channels {}", channels);
+          return Status(eNotSupported);
+      }
+    } else if (elem_type == DataType::kFLOAT) {
+      auto in = static_cast<float*>(src.buffer().GetNative());
+      auto out = static_cast<float*>(cropped.buffer().GetNative());
+      switch (channels) {
+        case 3:
+          impl::Crop<float, 3>(in, src_desc.shape[2], out, out_h, out_w, top, left,
+                               native_stream);
+          break;
+        case 1:
+          impl::Crop<float, 1>(in, src_desc.shape[2], out, out_h, out_w, top, left,
+                               native_stream);
+          break;
+        default:
+          MMDEPLOY_ERROR("unsupported channels {}", channels);
+          return Status(eNotSupported);
+      }
+    } else {
+      MMDEPLOY_ERROR("unsupported type {}", elem_type);
+      return Status(eNotSupported);
+    }
+
+    dst = std::move(cropped);
+    return success();
+  }
+};
+
+// Register the factory that creates the `cuda` implementation of `Crop`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Crop, (cuda, 0), [] { return std::make_unique<CropImpl>(); });
+
+}  // namespace mmdeploy::operation::cuda
--- a/csrc/mmdeploy/preprocess/cuda/crop.cu
+++ b/csrc/mmdeploy/preprocess/cuda/crop.cu
@ -1,9 +1,11 @@
 // Copyright (c) OpenMMLab. All rights reserved.

-#include <stdint.h>
+#include <cstdint>

 namespace mmdeploy {
+namespace operation {
 namespace cuda {
+namespace impl {

 template <typename T, int channels>
 __global__ void crop(const T *src, int src_w, T *dst, int dst_h, int dst_w, int offset_h,
@ -45,5 +47,7 @@ template void Crop<float, 3>(const float *src, int src_w, float *dst, int dst_h,
 template void Crop<float, 1>(const float *src, int src_w, float *dst, int dst_h, int dst_w,
                             int offset_h, int offset_w, cudaStream_t stream);

+}  // namespace impl
 }  // namespace cuda
+}  // namespace operation
 }  // namespace mmdeploy
--- a/csrc/mmdeploy/operation/cuda/cvtcolor.cpp
+++ b/csrc/mmdeploy/operation/cuda/cvtcolor.cpp
@ -0,0 +1,127 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "ppl/cv/cuda/cvtcolor.h"
+
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/operation/vision.h"
+
+using namespace ppl::cv::cuda;
+
+namespace mmdeploy::operation::cuda {
+
+template <typename T>
+using Converter = ppl::common::RetCode (*)(cudaStream_t stream, int height, int width,
+                                           int inWidthStride, const T* inData, int outWidthStride,
+                                           T* outData);
+
+namespace {
+
+// Converter used for NV12/NV21 -> GRAY: asynchronously copies the first `height * width`
+// elements of `inData` to `outData` on `stream`.
+// `inWidthStride` and `outWidthStride` exist only to match the `Converter<T>` signature and
+// are ignored, so both buffers are treated as tightly packed.
+// Returns RC_SUCCESS when the copy was enqueued, RC_OTHER_ERROR otherwise.
+template <typename T>
+ppl::common::RetCode CopyLuma(cudaStream_t stream, int height, int width, int inWidthStride,
+                              const T* inData, int outWidthStride, T* outData) {
+  // Compute the byte count in size_t: multiplying `height * width` as int first can overflow
+  // for very large images.
+  const auto n_bytes = static_cast<size_t>(height) * static_cast<size_t>(width) * sizeof(T);
+  auto ec = cudaMemcpyAsync(outData, inData, n_bytes, cudaMemcpyDefault, stream);
+  if (ec == cudaSuccess) {
+    return ppl::common::RC_SUCCESS;
+  }
+  return ppl::common::RC_OTHER_ERROR;
+}
+
+// Lookup table of color conversion routines for element type `T`, indexed by
+// [source format][destination format]. Entries that are never assigned stay null, which
+// callers treat as "conversion not supported".
+template <typename T>
+class ConverterTable {
+  // One row/column per PixelFormat enumerator that precedes kCOUNT
+  static constexpr auto kSize = static_cast<size_t>(PixelFormat::kCOUNT);
+
+  Converter<T> converters_[kSize][kSize]{};  // value-initialize to zeros
+
+  // Shared implementation of the const and non-const accessors; the enumerators are used
+  // directly as array indices (no bounds checking).
+  template <typename Self>
+  static auto& get_impl(Self& self, PixelFormat src, PixelFormat dst) {
+    return self.converters_[static_cast<int32_t>(src)][static_cast<int32_t>(dst)];
+  }
+
+ public:
+  // Access the table slot for the `src` -> `dst` conversion
+  auto& get(PixelFormat src, PixelFormat dst) noexcept { return get_impl(*this, src, dst); }
+  auto& get(PixelFormat src, PixelFormat dst) const noexcept { return get_impl(*this, src, dst); }
+
+  // Fills in the supported conversions. The NV12/NV21 -> BGR/RGB entries are only
+  // registered when `T` is uint8_t.
+  ConverterTable() {
+    using namespace pixel_formats;
+    // to BGR
+    get(kRGB, kBGR) = RGB2BGR<T>;
+    get(kGRAY, kBGR) = GRAY2BGR<T>;
+    if constexpr (std::is_same_v<T, uint8_t>) {
+      get(kNV21, kBGR) = NV212BGR<T>;
+      get(kNV12, kBGR) = NV122BGR<T>;
+    }
+    get(kBGRA, kBGR) = BGRA2BGR<T>;
+
+    // to RGB
+    get(kBGR, kRGB) = BGR2RGB<T>;
+    get(kGRAY, kRGB) = GRAY2RGB<T>;
+    if constexpr (std::is_same_v<T, uint8_t>) {
+      get(kNV21, kRGB) = NV212RGB<T>;
+      get(kNV12, kRGB) = NV122RGB<T>;
+    }
+    get(kBGRA, kRGB) = BGRA2RGB<T>;
+
+    // to GRAY
+    get(kBGR, kGRAY) = BGR2GRAY<T>;
+    get(kRGB, kGRAY) = RGB2GRAY<T>;
+    // for NV12/NV21 the gray image is obtained by a plain copy (see CopyLuma)
+    get(kNV21, kGRAY) = CopyLuma<T>;
+    get(kNV12, kGRAY) = CopyLuma<T>;
+    get(kBGRA, kGRAY) = BGRA2GRAY<T>;
+  }
+};
+
+// Returns the conversion routine for `src` -> `dst` with element type `T`, or a null
+// pointer when the table has no entry for that pair. The table is built once per `T`,
+// on first use.
+template <typename T>
+Converter<T> GetConverter(PixelFormat src, PixelFormat dst) {
+  static const ConverterTable<T> converters{};
+  Converter<T> found = converters.get(src, dst);
+  return found;
+}
+
+}  // namespace
+
+// CUDA implementation of the `CvtColor` operation, dispatching to the converters
+// registered in `ConverterTable`.
+class CvtColorImpl : public CvtColor {
+ public:
+  // Converts `src` to pixel format `dst_fmt` and stores the result in `dst`.
+  //  - When `src` already has format `dst_fmt`, `dst` is simply assigned `src`.
+  //  - Otherwise a new Mat of the same height/width/data type is allocated on this
+  //    operation's device and the conversion is enqueued on the operation's stream.
+  // Returns eNotSupported when the data type is neither kINT8 nor kFLOAT or when no
+  // converter exists for the format pair, and eFail when the converter reports an error.
+  // `dst` is only modified on success.
+  Result<void> apply(const Mat& src, Mat& dst, PixelFormat dst_fmt) override {
+    if (src.pixel_format() == dst_fmt) {
+      dst = src;
+      return success();
+    }
+
+    auto cuda_stream = GetNative<cudaStream_t>(stream());
+
+    auto height = src.height();
+    auto width = src.width();
+
+    Mat dst_mat(height, width, dst_fmt, src.type(), device());
+
+    // Row strides are counted in elements. Source and destination strides differ whenever
+    // the conversion changes the channel count (e.g. BGR -> GRAY), so each side derives
+    // its stride from its own channel count.
+    auto src_stride = width * src.channel();
+    auto dst_stride = width * dst_mat.channel();
+
+    auto convert = [&](auto type) -> Result<void> {
+      using T = typename decltype(type)::type;
+      auto converter = GetConverter<T>(src.pixel_format(), dst_fmt);
+      if (!converter) {
+        return Status(eNotSupported);
+      }
+      auto ret = converter(cuda_stream, height, width, src_stride, src.data<T>(), dst_stride,
+                           dst_mat.data<T>());
+      if (ret != ppl::common::RC_SUCCESS) {
+        return Status(eFail);
+      }
+      dst = std::move(dst_mat);
+      return success();
+    };
+
+    switch (src.type()) {
+      case DataType::kINT8:
+        return convert(basic_type<uint8_t>{});
+      case DataType::kFLOAT:
+        return convert(basic_type<float>{});
+      default:
+        return Status(eNotSupported);
+    }
+  }
+};
+
+// Register the factory that creates the `cuda` implementation of `CvtColor`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(CvtColor, (cuda, 0),
+                               [] { return std::make_unique<CvtColorImpl>(); });
+
+}  // namespace mmdeploy::operation::cuda
--- a/csrc/mmdeploy/operation/cuda/flip.cpp
+++ b/csrc/mmdeploy/operation/cuda/flip.cpp
@ -0,0 +1,61 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "ppl/cv/cuda/flip.h"
+
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/operation/vision.h"
+
+namespace mmdeploy::operation::cuda {
+
+// CUDA implementation of the `Flip` operation, built on `ppl::cv::cuda::Flip`.
+class FlipImpl : public Flip {
+ public:
+  using Flip::Flip;
+
+  // Flips `src` according to the inherited `flip_code_` into a new tensor with the same
+  // descriptor and moves it into `dst`. The work is enqueued on the operation's stream.
+  // kINT8 and kFLOAT data with 1 or 3 channels is handled: an unsupported data type is
+  // logged and yields eNotSupported, an unsupported channel count or a ppl.cv error
+  // yields eFail.
+  Result<void> apply(const Tensor& src, Tensor& dst) override {
+    Tensor flipped(src.desc());
+    auto native_stream = GetNative<cudaStream_t>(stream());
+    const auto rows = static_cast<int>(src.shape(1));
+    const auto cols = static_cast<int>(src.shape(2));
+    const auto channels = static_cast<int>(src.shape(3));
+    // Row stride in elements, identical for input and output
+    const auto row_stride = cols * channels;
+
+    ppl::common::RetCode status;
+    const auto elem_type = src.data_type();
+    if (elem_type == DataType::kINT8) {
+      auto in = src.data<uint8_t>();
+      auto out = flipped.data<uint8_t>();
+      switch (channels) {
+        case 1:
+          status = ppl::cv::cuda::Flip<uint8_t, 1>(native_stream, rows, cols, row_stride, in,
+                                                   row_stride, out, flip_code_);
+          break;
+        case 3:
+          status = ppl::cv::cuda::Flip<uint8_t, 3>(native_stream, rows, cols, row_stride, in,
+                                                   row_stride, out, flip_code_);
+          break;
+        default:
+          status = ppl::common::RC_UNSUPPORTED;
+          break;
+      }
+    } else if (elem_type == DataType::kFLOAT) {
+      auto in = src.data<float>();
+      auto out = flipped.data<float>();
+      switch (channels) {
+        case 1:
+          status = ppl::cv::cuda::Flip<float, 1>(native_stream, rows, cols, row_stride, in,
+                                                 row_stride, out, flip_code_);
+          break;
+        case 3:
+          status = ppl::cv::cuda::Flip<float, 3>(native_stream, rows, cols, row_stride, in,
+                                                 row_stride, out, flip_code_);
+          break;
+        default:
+          status = ppl::common::RC_UNSUPPORTED;
+          break;
+      }
+    } else {
+      MMDEPLOY_ERROR("unsupported data type {}", src.data_type());
+      return Status(eNotSupported);
+    }
+
+    if (status != 0) {
+      return Status(eFail);
+    }
+    dst = std::move(flipped);
+    return success();
+  }
+};
+
+// Register the factory that creates the `cuda` implementation of `Flip`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Flip, (cuda, 0), [](int flip_code) {
+  return std::make_unique<FlipImpl>(flip_code);
+});
+
+}  // namespace mmdeploy::operation::cuda
--- a/csrc/mmdeploy/operation/cuda/hwc2chw.cpp
+++ b/csrc/mmdeploy/operation/cuda/hwc2chw.cpp
@ -0,0 +1,43 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include <cuda_runtime.h>
+
+#include "mmdeploy/operation/vision.h"
+
+namespace mmdeploy::operation::cuda {
+
+// Transpose launcher, declared here and defined in a separate translation unit
+// (presumably transpose.cu).
+template <typename T>
+void Transpose(const T* src, int height, int width, int channels, T* dst, cudaStream_t stream);
+
+// CUDA implementation of the `HWC2CHW` layout conversion.
+class HWC2CHWImpl : public HWC2CHW {
+ public:
+  // Transposes `src` into a new tensor of shape {1, C, H, W}, where H, W and C are taken
+  // from dimensions 1, 2 and 3 of `src`, and moves it into `dst`. The work is enqueued on
+  // the operation's stream.
+  // Only kINT8 and kFLOAT data is supported; any other type yields eNotSupported and
+  // leaves `dst` untouched.
+  Result<void> apply(const Tensor& src, Tensor& dst) override {
+    const auto data_type = src.data_type();
+    // Reject unsupported types with an error instead of an assert: asserts vanish in
+    // release builds, which would hand back an uninitialized tensor as a success.
+    if (data_type != DataType::kINT8 && data_type != DataType::kFLOAT) {
+      return Status(eNotSupported);
+    }
+
+    auto h = src.shape(1);
+    auto w = src.shape(2);
+    auto c = src.shape(3);
+
+    Tensor dst_tensor(src.desc());
+    dst_tensor.Reshape({1, c, h, w});
+
+    auto cuda_stream = GetNative<cudaStream_t>(stream());
+
+    if (DataType::kINT8 == data_type) {
+      auto input = src.data<uint8_t>();
+      auto output = dst_tensor.data<uint8_t>();
+      Transpose(input, static_cast<int>(h), static_cast<int>(w), static_cast<int>(c), output,
+                cuda_stream);
+    } else {
+      auto input = src.data<float>();
+      auto output = dst_tensor.data<float>();
+      Transpose(input, static_cast<int>(h), static_cast<int>(w), static_cast<int>(c), output,
+                cuda_stream);
+    }
+
+    dst = std::move(dst_tensor);
+    return success();
+  }
+};
+
+// Register the factory that creates the `cuda` implementation of `HWC2CHW`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(HWC2CHW, (cuda, 0), [] { return std::make_unique<HWC2CHWImpl>(); });
+
+}  // namespace mmdeploy::operation::cuda
--- a/csrc/mmdeploy/operation/cuda/normalize.cpp
+++ b/csrc/mmdeploy/operation/cuda/normalize.cpp
@ -0,0 +1,74 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include <cuda_runtime.h>
+
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/operation/vision.h"
+
+namespace mmdeploy::operation::cuda {
+
+namespace impl {
+// Normalize launcher, declared here and defined in a separate translation unit.
+template <typename T, int channels>
+void Normalize(const T* src, int height, int width, int stride, float* output, const float* mean,
+               const float* std, bool to_rgb, cudaStream_t stream);
+}
+
+// CUDA implementation of the `Normalize` operation.
+class NormalizeImpl : public Normalize {
+ public:
+  // Stores the normalization parameters (mean, std, to_rgb) for later use in `apply`.
+  NormalizeImpl(Param param) : param_(std::move(param)) {}
+
+  // Normalizes `src` into a newly allocated kFLOAT tensor of the same shape and name and
+  // moves it into `dst`. The work is enqueued on the operation's stream.
+  // Only kINT8/kFLOAT input with 1 or 3 channels is handled; anything else is logged and
+  // reported as eNotSupported.
+  Result<void> apply(const Tensor& src, Tensor& dst) override {
+    auto in_desc = src.desc();
+    const int rows = static_cast<int>(in_desc.shape[1]);
+    const int cols = static_cast<int>(in_desc.shape[2]);
+    const int channels = static_cast<int>(in_desc.shape[3]);
+    const int row_stride = cols * channels;  // in elements
+
+    TensorDesc out_desc{device(), DataType::kFLOAT, in_desc.shape, in_desc.name};
+    Tensor normalized{out_desc};
+    auto out = normalized.data<float>();
+    auto native_stream = GetNative<cudaStream_t>(stream());
+
+    const float* mean = param_.mean.data();
+    const float* std_dev = param_.std.data();
+
+    if (in_desc.data_type == DataType::kINT8) {
+      auto in = src.data<uint8_t>();
+      switch (channels) {
+        case 3:
+          impl::Normalize<uint8_t, 3>(in, rows, cols, row_stride, out, mean, std_dev,
+                                      param_.to_rgb, native_stream);
+          break;
+        case 1:
+          impl::Normalize<uint8_t, 1>(in, rows, cols, row_stride, out, mean, std_dev,
+                                      param_.to_rgb, native_stream);
+          break;
+        default:
+          MMDEPLOY_ERROR("unsupported channels {}", channels);
+          return Status(eNotSupported);
+      }
+    } else if (in_desc.data_type == DataType::kFLOAT) {
+      auto in = src.data<float>();
+      switch (channels) {
+        case 3:
+          impl::Normalize<float, 3>(in, rows, cols, row_stride, out, mean, std_dev,
+                                    param_.to_rgb, native_stream);
+          break;
+        case 1:
+          impl::Normalize<float, 1>(in, rows, cols, row_stride, out, mean, std_dev,
+                                    param_.to_rgb, native_stream);
+          break;
+        default:
+          MMDEPLOY_ERROR("unsupported channels {}", channels);
+          return Status(eNotSupported);
+      }
+    } else {
+      MMDEPLOY_ERROR("unsupported data type {}", in_desc.data_type);
+      assert(0);
+      return Status(eNotSupported);
+    }
+
+    dst = std::move(normalized);
+    return success();
+  }
+
+ protected:
+  // Parameters captured at construction time.
+  Param param_;
+};
+
+// Register the factory that creates the `cuda` implementation of `Normalize`.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Normalize, (cuda, 0), [](const Normalize::Param& param) {
+  return std::make_unique<NormalizeImpl>(param);
+});
+
+}  // namespace mmdeploy::operation::cuda
--- a/csrc/mmdeploy/preprocess/cuda/normalize.cu
+++ b/csrc/mmdeploy/preprocess/cuda/normalize.cu
@ -3,10 +3,11 @@
 #include <cuda_runtime.h>

 #include <cstdint>
-#include <cstdio>

 namespace mmdeploy {
+namespace operation {
 namespace cuda {
+namespace impl {

 template <typename T, int channels>
 __global__ void normalize(const T* src, int height, int width, int stride, float* output,
@ -57,5 +58,7 @@ template void Normalize<float, 3>(const float* src, int height, int width, int s
 template void Normalize<float, 1>(const float* src, int height, int width, int stride,
                                  float* output, const float* mean, const float* std, bool to_rgb,
                                  cudaStream_t stream);
+}  // namespace impl
 }  // namespace cuda
+}  // namespace operation
 }  // namespace mmdeploy
--- a/csrc/mmdeploy/operation/cuda/pad.cpp
+++ b/csrc/mmdeploy/operation/cuda/pad.cpp
@ -0,0 +1,97 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include <map>
+
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/operation/vision.h"
+#include "ppl/cv/cuda/copymakeborder.h"
+
+using namespace ppl::cv::cuda;
+
+namespace mmdeploy::operation::cuda {
+
+// CUDA implementation of Pad built on ppl.cv's CopyMakeBorder.
+class PadImpl : public Pad {
+ public:
+  // border_type: ppl.cv border mode applied to the padded area.
+  // pad_val:     fill value handed to CopyMakeBorder, converted to the element type of the input.
+  PadImpl(ppl::cv::BorderType border_type, float pad_val)
+      : border_type_(border_type), pad_val_(pad_val) {}
+
+  // Pads `src` (indexed as 1 x H x W x C) by `top`/`left`/`bottom`/`right` pixels and stores the
+  // result in `dst`. `dst` is only assigned when the whole operation succeeds.
+  //
+  // Returns eNotSupported for element types other than float/uint8 or channel counts other than
+  // 1 and 3, and eFail when the ppl.cv kernel reports an error. Unsupported inputs are reported
+  // through the returned status only (no assert), so debug builds no longer abort before the
+  // error can be propagated.
+  Result<void> apply(const Tensor& src, Tensor& dst, int top, int left, int bottom,
+                     int right) override {
+    const auto height = static_cast<int>(src.shape(1));
+    const auto width = static_cast<int>(src.shape(2));
+    const auto channels = static_cast<int>(src.shape(3));
+
+    TensorShape dst_shape{1, height + top + bottom, width + left + right, channels};
+    TensorDesc dst_desc{device(), src.data_type(), dst_shape, ""};
+    Tensor dst_tensor(dst_desc);
+
+    auto cuda_stream = GetNative<cudaStream_t>(stream());
+
+    if (src.data_type() == DataType::kFLOAT) {
+      OUTCOME_TRY(PadDispatch<float>(src, dst_tensor, top, left, bottom, right, cuda_stream));
+    } else if (src.data_type() == DataType::kINT8) {
+      // kINT8 tensors are accessed as uint8_t (ppl::cv::uchar) pixel data
+      OUTCOME_TRY(PadDispatch<uint8_t>(src, dst_tensor, top, left, bottom, right, cuda_stream));
+    } else {
+      MMDEPLOY_ERROR("unsupported data type {}", src.data_type());
+      return Status(eNotSupported);
+    }
+
+    dst = std::move(dst_tensor);
+    return success();
+  }
+
+ private:
+  // Invokes CopyMakeBorder for element type T, choosing the channel-count instantiation at
+  // runtime. `dst` must already have the padded shape.
+  template <typename T>
+  Result<void> PadDispatch(const Tensor& src, Tensor& dst, int top, int left, int bottom,
+                           int right, cudaStream_t stream) {
+    const auto height = static_cast<int>(src.shape(1));
+    const auto width = static_cast<int>(src.shape(2));
+    const auto channels = static_cast<int>(src.shape(3));
+    const auto dst_width = static_cast<int>(dst.shape(2));
+
+    auto src_buffer = src.data<T>();
+    auto dst_buffer = dst.data<T>();
+    const auto pad_val = static_cast<T>(pad_val_);
+
+    ppl::common::RetCode ret = 0;
+    if (3 == channels) {
+      ret = CopyMakeBorder<T, 3>(stream, height, width, width * channels, src_buffer,
+                                 dst_width * channels, dst_buffer, top, bottom, left, right,
+                                 border_type_, pad_val);
+    } else if (1 == channels) {
+      ret = CopyMakeBorder<T, 1>(stream, height, width, width * channels, src_buffer,
+                                 dst_width * channels, dst_buffer, top, bottom, left, right,
+                                 border_type_, pad_val);
+    } else {
+      MMDEPLOY_ERROR("unsupported channels {}", channels);
+      return Status(eNotSupported);
+    }
+
+    if (ret != 0) {
+      MMDEPLOY_ERROR("unexpected exception happened");
+      return Status(eFail);
+    }
+    return success();
+  }
+
+  ppl::cv::BorderType border_type_;
+  float pad_val_;
+};
+
+// Factory for the CUDA Pad operation. Translates the textual border mode ("constant", "edge",
+// "reflect", "symmetric") into the corresponding ppl.cv border type. An unknown mode is logged
+// and rejected via throw_exception(eNotSupported) instead of surfacing as a bare
+// std::out_of_range from the lookup table.
+static auto Create(const string_view& border_type, float pad_val) {
+  static const std::map<string_view, ppl::cv::BorderType> border_map{
+      {"constant", ppl::cv::BORDER_CONSTANT},
+      {"edge", ppl::cv::BORDER_REPLICATE},
+      {"reflect", ppl::cv::BORDER_REFLECT_101},
+      {"symmetric", ppl::cv::BORDER_REFLECT}};
+  const auto it = border_map.find(border_type);
+  if (it == border_map.end()) {
+    MMDEPLOY_ERROR("unsupported border type: {}", border_type);
+    throw_exception(eNotSupported);
+  }
+  return std::make_unique<PadImpl>(it->second, pad_val);
+}
+
+MMDEPLOY_REGISTER_FACTORY_FUNC(Pad, (cuda, 0), Create);
+
+}  // namespace mmdeploy::operation::cuda
--- a/csrc/mmdeploy/operation/cuda/resize.cpp
+++ b/csrc/mmdeploy/operation/cuda/resize.cpp
@ -0,0 +1,90 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "ppl/cv/cuda/resize.h"
+
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/operation/vision.h"
+
+namespace mmdeploy::operation::cuda {
+
+// CUDA implementation of Resize backed by ppl.cv.
+class ResizeImpl : public Resize {
+ public:
+  ResizeImpl(ppl::cv::InterpolationType interp) : interp_(interp) {}
+
+  // Resizes `src` to dst_h x dst_w and assigns the result to `dst` on success. The output keeps
+  // the element type, channel count and name of `src`.
+  Result<void> apply(const Tensor& src, Tensor& dst, int dst_h, int dst_w) override {
+    assert(src.device() == device());
+
+    TensorDesc out_desc{device(), src.data_type(), {1, dst_h, dst_w, src.shape(3)}, src.name()};
+    Tensor resized(out_desc);
+
+    auto native_stream = GetNative<cudaStream_t>(stream());
+    switch (src.data_type()) {
+      case DataType::kINT8: {
+        OUTCOME_TRY(ResizeDispatch<uint8_t>(src, resized, native_stream));
+        break;
+      }
+      case DataType::kFLOAT: {
+        OUTCOME_TRY(ResizeDispatch<float>(src, resized, native_stream));
+        break;
+      }
+      default:
+        MMDEPLOY_ERROR("unsupported data type {}", src.data_type());
+        return Status(eNotSupported);
+    }
+
+    dst = std::move(resized);
+    return success();
+  }
+
+ private:
+  // Maps a runtime channel count to the matching ppl.cv Resize instantiation; logs and returns
+  // nullptr when the count is not 1, 3 or 4.
+  template <typename T>
+  auto Select(int channels) -> decltype(&ppl::cv::cuda::Resize<T, 1>) {
+    if (channels == 1) {
+      return &ppl::cv::cuda::Resize<T, 1>;
+    }
+    if (channels == 3) {
+      return &ppl::cv::cuda::Resize<T, 3>;
+    }
+    if (channels == 4) {
+      return &ppl::cv::cuda::Resize<T, 4>;
+    }
+    MMDEPLOY_ERROR("unsupported channels {}", channels);
+    return nullptr;
+  }
+
+  // Runs the resize kernel for element type T; `dst` must already carry the target shape.
+  template <class T>
+  Result<void> ResizeDispatch(const Tensor& src, Tensor& dst, cudaStream_t stream) {
+    const int src_h = (int)src.shape(1);
+    const int src_w = (int)src.shape(2);
+    const int channels = (int)src.shape(3);
+    const int out_h = (int)dst.shape(1);
+    const int out_w = (int)dst.shape(2);
+
+    auto src_ptr = src.data<T>();
+    auto dst_ptr = dst.data<T>();
+
+    auto kernel = Select<T>(channels);
+    if (!kernel) {
+      return Status(eNotSupported);
+    }
+
+    ppl::common::RetCode ret = kernel(stream, src_h, src_w, src_w * channels, src_ptr, out_h,
+                                      out_w, out_w * channels, dst_ptr, interp_);
+    if (ret != 0) {
+      return Status(eFail);
+    }
+    return success();
+  }
+
+  ppl::cv::InterpolationType interp_;
+};
+
+// Factory for the CUDA Resize operation: accepts "bilinear" or "nearest"; any other
+// interpolation name is logged and rejected with eNotSupported.
+static auto Create(const string_view& interp) {
+  const bool bilinear = (interp == "bilinear");
+  if (!bilinear && interp != "nearest") {
+    MMDEPLOY_ERROR("unsupported interpolation method: {}", interp);
+    throw_exception(eNotSupported);
+  }
+  const ppl::cv::InterpolationType type =
+      bilinear ? ppl::cv::InterpolationType::INTERPOLATION_LINEAR
+               : ppl::cv::InterpolationType::INTERPOLATION_NEAREST_POINT;
+  return std::make_unique<ResizeImpl>(type);
+}
+
+MMDEPLOY_REGISTER_FACTORY_FUNC(Resize, (cuda, 0), Create);
+
+}  // namespace mmdeploy::operation::cuda
--- a/csrc/mmdeploy/operation/cuda/to_float.cpp
+++ b/csrc/mmdeploy/operation/cuda/to_float.cpp
@ -0,0 +1,38 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include <cuda_runtime.h>
+
+#include "mmdeploy/operation/vision.h"
+
+namespace mmdeploy::operation::cuda {
+
+// Converts `n` elements from `src` (type From) into `dst` (type To) using `stream`.
+// Only declared here; the definition is not part of this file.
+template <typename From, typename To>
+void Cast(const From* src, To* dst, size_t n, cudaStream_t stream);
+
+// CUDA implementation of ToFloat.
+class ToFloatImpl : public ToFloat {
+ public:
+  // kFLOAT input is assigned to `dst` unchanged; kINT8 input is cast element-wise into a newly
+  // created float tensor with the same descriptor otherwise. Any other type raises
+  // eNotSupported through throw_exception.
+  Result<void> apply(const Tensor& src, Tensor& dst) override {
+    switch (src.desc().data_type) {
+      case DataType::kFLOAT:
+        dst = src;
+        return success();
+      case DataType::kINT8: {
+        auto float_desc = src.desc();
+        float_desc.data_type = DataType::kFLOAT;
+
+        Tensor converted(float_desc);
+        Cast(src.data<uint8_t>(), converted.data<float>(), src.size(),
+             GetNative<cudaStream_t>(stream()));
+
+        dst = std::move(converted);
+        return success();
+      }
+      default:
+        break;
+    }
+    throw_exception(eNotSupported);
+  }
+};
+
+MMDEPLOY_REGISTER_FACTORY_FUNC(ToFloat, (cuda, 0), [] { return std::make_unique<ToFloatImpl>(); });
+
+}  // namespace mmdeploy::operation::cuda
--- a/csrc/mmdeploy/preprocess/cuda/transpose.cu
+++ b/csrc/mmdeploy/preprocess/cuda/transpose.cu
@ -3,6 +3,7 @@
 #include <cstdint>

 namespace mmdeploy {
+namespace operation {
 namespace cuda {

 template <typename T>
@ -37,4 +38,5 @@ template void Transpose<uint8_t>(const uint8_t* src, int height, int width, int
 template void Transpose<float>(const float* src, int height, int width, int channels, float* dst,
                               cudaStream_t stream);
 }  // namespace cuda
+}  // namespace operation
 }  // namespace mmdeploy
--- a/csrc/mmdeploy/operation/dummy/CMakeLists.txt
+++ b/csrc/mmdeploy/operation/dummy/CMakeLists.txt
@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+# Module that holds the "dummy" operation implementations (operations.cpp).
+project(mmdeploy_operation_dummy)
+
+set(SRCS operations.cpp)
+
+mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
+
+# The implementations derive from the interfaces provided by mmdeploy_operation.
+target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy_operation)
--- a/csrc/mmdeploy/operation/dummy/operations.cpp
+++ b/csrc/mmdeploy/operation/dummy/operations.cpp
@ -0,0 +1,98 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "mmdeploy/operation/vision.h"
+
+namespace mmdeploy::operation::dummy {
+
+namespace {
+
+// Shared placeholder buffer (device 0, size 0, null data) attached to every tensor produced by
+// the dummy operations below.
+const Buffer& g_dummy_buffer() {
+  static Buffer placeholder{Device(0), 0, nullptr};
+  return placeholder;
+}
+
+}  // namespace
+
+// Dummy HWC2CHW: produces only the descriptor of the transposed tensor, no pixel data.
+class HWC2CHWImpl : public HWC2CHW {
+ public:
+  // Reorders the input extents {d0, d1, d2, d3} into {d0, d3, d1, d2}.
+  Result<void> apply(const Tensor& img, Tensor& dst) override {
+    const auto& hwc = img.shape();
+    TensorDesc chw_desc{Device{0}, img.data_type(), {hwc[0], hwc[3], hwc[1], hwc[2]}};
+    dst = Tensor(chw_desc, g_dummy_buffer());
+    return success();
+  }
+};
+MMDEPLOY_REGISTER_FACTORY_FUNC(HWC2CHW, (dummy, 0),
+                               []() { return std::make_unique<HWC2CHWImpl>(); });
+
+// Dummy Crop: computes the shape of the cropped tensor without touching pixel data.
+class CropImpl : public Crop {
+ public:
+  // Both bounds are inclusive, hence the "+ 1" on each extent.
+  Result<void> apply(const Tensor& src, Tensor& dst, int top, int left, int bottom,
+                     int right) override {
+    auto cropped = src.shape();
+    cropped[1] = bottom - top + 1;  // h
+    cropped[2] = right - left + 1;  // w
+    TensorDesc cropped_desc{Device{0}, src.data_type(), cropped};
+    dst = Tensor(cropped_desc, g_dummy_buffer());
+    return success();
+  }
+};
+MMDEPLOY_REGISTER_FACTORY_FUNC(Crop, (dummy, 0), []() { return std::make_unique<CropImpl>(); });
+
+// Dummy ToFloat: the output keeps the input shape and is always described as kFLOAT.
+class ToFloatImpl : public ToFloat {
+ public:
+  Result<void> apply(const Tensor& src, Tensor& dst) override {
+    TensorDesc float_desc{Device{0}, DataType::kFLOAT, src.shape()};
+    dst = Tensor(float_desc, g_dummy_buffer());
+    return success();
+  }
+};
+MMDEPLOY_REGISTER_FACTORY_FUNC(ToFloat, (dummy, 0),
+                               []() { return std::make_unique<ToFloatImpl>(); });
+
+// Dummy CvtColor: the output has the size and element type of `src`, the requested pixel
+// format, and no data.
+class CvtColorImpl : public CvtColor {
+ public:
+  Result<void> apply(const Mat& src, Mat& dst, PixelFormat dst_fmt) override {
+    dst = Mat{src.height(), src.width(), dst_fmt, src.type(), nullptr, Device{0}};
+    return success();
+  }
+};
+MMDEPLOY_REGISTER_FACTORY_FUNC(CvtColor, (dummy, 0),
+                               [] { return std::make_unique<CvtColorImpl>(); });
+
+// Dummy Normalize: the output keeps the input shape and is always described as kFLOAT.
+class NormalizeImpl : public Normalize {
+ public:
+  Result<void> apply(const Tensor& src, Tensor& dst) override {
+    TensorDesc normalized_desc{Device{0}, DataType::kFLOAT, src.shape()};
+    dst = Tensor(normalized_desc, g_dummy_buffer());
+    return success();
+  }
+};
+// The normalization parameters are accepted for signature compatibility but not used.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Normalize, (dummy, 0), [](const Normalize::Param& param) {
+  return std::make_unique<NormalizeImpl>();
+});
+
+// Dummy Pad: grows the height by top + bottom and the width by left + right.
+class PadImpl : public Pad {
+ public:
+  Result<void> apply(const Tensor& src, Tensor& dst, int top, int left, int bottom,
+                     int right) override {
+    auto padded = src.shape();  // 1HWC
+    padded[1] += top + bottom;
+    padded[2] += left + right;
+    TensorDesc padded_desc{Device{0}, src.data_type(), padded};
+    dst = Tensor(padded_desc, g_dummy_buffer());
+    return success();
+  }
+};
+// Border mode and fill value are accepted for signature compatibility but not used.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Pad, (dummy, 0), [](const string_view& border_type, float pad_val) {
+  return std::make_unique<PadImpl>();
+});
+
+// Dummy Resize: describes a 1 x dst_h x dst_w x C tensor without producing pixel data.
+class ResizeImpl : public Resize {
+ public:
+  Result<void> apply(const Tensor& src, Tensor& dst, int dst_h, int dst_w) override {
+    // The element type is taken from the input, as the CUDA implementation does; `dst` is the
+    // output argument and may not carry a meaningful type before this call.
+    dst = {{Device{0}, src.data_type(), {1, dst_h, dst_w, src.shape(3)}}, g_dummy_buffer()};
+    return success();
+  }
+};
+// The interpolation name is accepted for signature compatibility but not used.
+MMDEPLOY_REGISTER_FACTORY_FUNC(Resize, (dummy, 0), [](const string_view& interp) {
+  return std::make_unique<ResizeImpl>();
+});
+
+}  // namespace mmdeploy::operation::dummy
--- a/csrc/mmdeploy/operation/managed.h
+++ b/csrc/mmdeploy/operation/managed.h
@ -0,0 +1,182 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#ifndef MMDEPLOY_CSRC_MMDEPLOY_PREPROCESS_OPERATION_MANAGED_H_
+#define MMDEPLOY_CSRC_MMDEPLOY_PREPROCESS_OPERATION_MANAGED_H_
+
+#include "mmdeploy/operation/operation.h"
+
+namespace mmdeploy::operation {
+
+namespace _apply {
+
+// Copies `size` bytes from `src` to `dst` through `stream`. If the destination lives on a
+// different device than the stream, the stream is waited on before returning.
+inline Result<void> Copy(const Buffer& src, Buffer& dst, size_t size, Stream& stream) {
+  OUTCOME_TRY(stream.Copy(src, dst, size));
+  const bool dst_off_stream_device = dst.GetDevice() != stream.GetDevice();
+  if (!dst_off_stream_device) {
+    return success();
+  }
+  OUTCOME_TRY(stream.Wait());
+  return success();
+}
+
+// Returns a tensor usable on `device`: `val` itself when it already resides there or when the
+// current context runs in dummy mode, otherwise a copy on `device` that is tracked by the
+// current context.
+inline Result<Tensor> Secure(const Tensor& val, const Device& device, Stream& stream) {
+  const bool on_target = val.device() == device;
+  if (on_target || gContext().use_dummy()) {
+    return val;
+  }
+
+  Tensor dst(TensorDesc{device, val.data_type(), val.shape(), val.name()});
+
+  OUTCOME_TRY(Copy(val.buffer(), dst.buffer(), val.byte_size(), stream));
+
+  gContext().Track(dst);
+  return dst;
+}
+
+// Mat counterpart of the tensor overload: returns `val` when it is already on `device` or the
+// current context runs in dummy mode, otherwise a tracked copy with the same size, pixel
+// format and element type on `device`.
+inline Result<Mat> Secure(const Mat& val, const Device& device, Stream& stream) {
+  const bool on_target = val.device() == device;
+  if (on_target || gContext().use_dummy()) {
+    return val;
+  }
+
+  Mat dst{val.height(), val.width(), val.pixel_format(), val.type(), device};
+
+  OUTCOME_TRY(Copy(val.buffer(), dst.buffer(), val.byte_size(), stream));
+
+  gContext().Track(dst);
+  return dst;
+}
+
+// Default per-argument policy used by apply_impl: the argument is stored as-is, handed to the
+// operation unchanged, and nothing happens after the call.
+template <typename T>
+struct _base_handler {
+  // type held in apply_impl's temporary tuple for this argument
+  using type = T;
+  // prepares the argument before the call (no device transfer here)
+  static T input(T x, const Device&, Stream&) { return x; }
+  // converts the stored value into what Op::apply receives
+  static T pass(T x) { return x; }
+  // post-call hook (no-op here)
+  static void output(T) {}
+};
+
+// Policy selected for parameter type T; the specializations for tensor/mat parameters replace
+// parts of the default behavior.
+template <typename T>
+struct _handler : _base_handler<T> {};
+
+// Read-only tensor input: the tensor is passed through Secure() so the operation sees data on
+// its own device. The stored type is a Result so a failed copy can be detected by _check().
+template <>
+struct _handler<const Tensor&> : _base_handler<const Tensor&> {
+  using type = Result<Tensor>;
+  static type input(const Tensor& tensor, const Device& device, Stream& stream) {
+    return Secure(tensor, device, stream);
+  }
+  // only called after _check() has confirmed the Result holds a value
+  static const Tensor& pass(const type& tensor) { return tensor.value(); }
+  // inputs are not tracked here; Secure() already tracks any copy it creates
+  static void output(const Result<Tensor>&) {}
+};
+
+// Read-only Mat input: same policy as for `const Tensor&`, using the Mat overload of Secure().
+template <>
+struct _handler<const Mat&> {
+  using type = Result<Mat>;
+  static type input(const Mat& mat, const Device& device, Stream& stream) {
+    return Secure(mat, device, stream);
+  }
+  // only called after _check() has confirmed the Result holds a value
+  static const Mat& pass(const type& mat) { return mat.value(); }
+  static void output(const type&) {}
+};
+
+// Read-only list of tensors: every element is passed through Secure(); the first failure
+// aborts the loop and becomes the error stored in the Result.
+template <>
+struct _handler<const std::vector<Tensor>&> {
+  using type = Result<std::vector<Tensor>>;
+  static type input(const std::vector<Tensor>& tensors, const Device& device, Stream& stream) {
+    std::vector<Tensor> rets(tensors.size());
+    for (size_t i = 0; i < tensors.size(); ++i) {
+      OUTCOME_TRY(rets[i], Secure(tensors[i], device, stream));
+    }
+    return rets;
+  }
+  // only called after _check() has confirmed the Result holds a value
+  static const std::vector<Tensor>& pass(const type& tensors) { return tensors.value(); }
+  static void output(const type&) {}
+};
+
+// Output tensor: forwarded by reference unchanged; after a successful call its buffer is
+// registered with the current context.
+template <>
+struct _handler<Tensor&> : _base_handler<Tensor&> {
+  static void output(Tensor& tensor) { gContext().Track(tensor); }
+};
+
+// Output Mat: same policy as for `Tensor&`.
+template <>
+struct _handler<Mat&> : _base_handler<Mat&> {
+  static void output(Mat& mat) { gContext().Track(mat); }
+};
+
+// Walks the prepared arguments left to right and returns the first error found in a Result<T>
+// element; arguments of any other type are skipped.
+
+// end of recursion: nothing left to check
+inline Result<void> _check() { return success(); }
+
+// generic element: not a Result, so there is nothing to verify
+template <typename T, typename... Ts>
+Result<void> _check(T&& x, Ts&&... xs) {
+  return _check((Ts &&) xs...);
+}
+
+// Result element: propagate its error, otherwise continue with the remaining arguments
+template <typename T, typename... Ts>
+Result<void> _check(Result<T>& x, Ts&&... xs) {
+  OUTCOME_TRY(x);
+  return _check((Ts &&) xs...);
+}
+
+// Primary template: instantiated only when `Sig` does not match the member-function-pointer
+// form handled by the specialization below; the always-false assertion turns that into a
+// readable compile error.
+template <typename Sig>
+struct apply_impl {
+  static_assert(!std::is_same_v<Sig, Sig>, "Not a member function pointer");
+};
+
+// Invokes `Op::apply` with per-parameter pre/post processing selected by `_handler<Args>`.
+// `Args...` are the declared parameter types of the (non-const) member function.
+template <typename Ret, typename C, typename... Args>
+struct apply_impl<Ret (C::*)(Args...)> {
+  const Device& device;
+  Stream& stream;
+
+  // Entry point: `as...` must correspond one-to-one to `Args...`.
+  template <typename Op, typename... As>
+  Result<void> operator()(Op& op, As&&... as) const {
+    return apply(op, std::index_sequence_for<Args...>{}, (As &&) as...);
+  }
+
+  template <typename Op, typename... As, size_t... Is>
+  Result<void> apply(Op& op, std::index_sequence<Is...>, As&&... as) const {
+    // transform input args and store them in a tuple
+    std::tuple<typename _handler<Args>::type...> tmps{
+        _handler<Args>::input((As &&) as, device, stream)...};
+
+    // bail out if any of the input transformations (device copies) failed
+    OUTCOME_TRY(_check(std::get<Is>(tmps)...));
+
+    // apply the operation
+    OUTCOME_TRY(op.apply(_handler<Args>::pass(std::get<Is>(tmps))...));
+
+    // track output data (Tensor& and Mat&); skipped when the operation failed
+    (_handler<Args>::output(std::get<Is>(tmps)), ...);
+    return success();
+  }
+};
+
+// Calls `op.apply(args...)` through apply_impl, using the operation's own device and stream
+// for input transfers. `Op::apply` must not be overloaded, since its address is taken here.
+template <typename Op, typename... Args>
+Result<void> apply(Op& op, Args&&... args) {
+  _apply::apply_impl<decltype(&Op::apply)> impl{op.device(), op.stream()};
+  return impl(op, (Args &&) args...);
+}
+
+}  // namespace _apply
+
+// Owning wrapper around an operation whose Apply() routes calls through _apply::apply, i.e.
+// inputs are moved to the operation's device and outputs are tracked by the current context.
+template <typename Op>
+class Managed {
+ public:
+  // Creates an empty wrapper; Apply() must not be called until an operation is assigned.
+  Managed() = default;
+
+  explicit Managed(std::unique_ptr<Op> op) : op_(std::move(op)) {}
+
+  // Forwards `args` to the wrapped operation's apply(); returns its status or the error of a
+  // failed input transfer.
+  template <typename... Args>
+  Result<void> Apply(Args&&... args) {
+    assert(op_);
+    return _apply::apply(*op_, (Args &&) args...);
+  }
+
+  // Builds the operation with operation::Create<Op>(args...) and wraps it.
+  template <typename... Args>
+  static Managed<Op> Create(Args&&... args) {
+    return Managed<Op>(operation::Create<Op>((Args &&) args...));
+  }
+
+ private:
+  std::unique_ptr<Op> op_;
+};
+
+using _apply::Secure;
+
+}  // namespace mmdeploy::operation
+
+#endif  // MMDEPLOY_CSRC_MMDEPLOY_PREPROCESS_OPERATION_MANAGED_H_
--- a/csrc/mmdeploy/operation/operation.cpp
+++ b/csrc/mmdeploy/operation/operation.cpp
@ -0,0 +1,39 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "mmdeploy/operation/operation.h"
+
+#include "mmdeploy/core/logger.h"
+
+namespace mmdeploy::operation {
+
+// Innermost live Context of the calling thread, or null when none exists. Contexts nest; each
+// one remembers its predecessor in Context::parent_.
+thread_local Context* g_context{};
+
+// Installs this context as the current one for the calling thread and stores the previously
+// current context in parent_ so the destructor can restore it.
+Context::Context(Device device, Stream stream)
+    : device_(device), stream_(std::move(stream)), parent_(std::exchange(g_context, this)) {}
+
+// Waits for the stream (when one is set), logging a failure instead of throwing, then makes
+// the enclosing context current again.
+Context::~Context() {
+  if (stream_) {
+    auto ec = stream_.Wait();
+    if (ec.has_error()) {
+      MMDEPLOY_ERROR("Stream synchronization failed: {}", ec.error().message().c_str());
+    }
+  }
+  g_context = parent_;
+  parent_ = nullptr;
+}
+
+// Stream of the current context, or a default-constructed Stream when no context is active.
+static Stream GetCurrentStream() {
+  if (g_context) {
+    return g_context->stream();
+  }
+  return Stream{};
+}
+
+// Device of the current context, or a default-constructed Device when no context is active.
+static Device GetCurrentDevice() {
+  if (g_context) {
+    return g_context->device();
+  }
+  return Device{};
+}
+
+// Overrides only the device; the stream is inherited from the enclosing context.
+Context::Context(Device device) : Context(device, GetCurrentStream()) {}
+
+// Overrides only the stream; the device is inherited from the enclosing context.
+Context::Context(Stream stream) : Context(GetCurrentDevice(), std::move(stream)) {}
+
+// Returns the calling thread's current context. Using operations without an active context is
+// treated as a programming error: it is logged and the process is aborted.
+Context& gContext() {
+  if (!g_context) {
+    MMDEPLOY_ERROR(
+        "Operations must be used inside scopes guarded by operation::Context, aborting.");
+    std::abort();
+  }
+  return *g_context;
+}
+
+}  // namespace mmdeploy::operation
--- a/csrc/mmdeploy/operation/operation.h
+++ b/csrc/mmdeploy/operation/operation.h
@ -0,0 +1,113 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#ifndef MMDEPLOY_CSRC_MMDEPLOY_PREPROCESS_OPERATION_OPERATION_H_
+#define MMDEPLOY_CSRC_MMDEPLOY_PREPROCESS_OPERATION_OPERATION_H_
+
+#include "mmdeploy/core/device.h"
+#include "mmdeploy/core/logger.h"
+#include "mmdeploy/core/mat.h"
+#include "mmdeploy/core/registry.h"
+#include "mmdeploy/core/tensor.h"
+#include "mmdeploy/core/utils/device_utils.h"
+#include "mmdeploy/core/utils/formatter.h"
+
+namespace mmdeploy::operation {
+
+using namespace mmdeploy::framework;
+using std::string_view;
+using std::unique_ptr;
+
+// Scoped execution environment for operations. While alive, a Context is the calling thread's
+// current context (see gContext()); it supplies the device and stream used by operations and
+// collects the buffers registered through Track(). Contexts nest: destroying one restores the
+// context that was current when it was created. Neither copyable nor movable.
+class MMDEPLOY_API Context {
+ public:
+  // Uses `device` and inherits the stream of the enclosing context (a default Stream if none).
+  explicit Context(Device device);
+  // Uses `stream` and inherits the device of the enclosing context (a default Device if none).
+  explicit Context(Stream stream);
+  explicit Context(Device device, Stream stream);
+  // Waits on the stream when one is set, then restores the enclosing context.
+  ~Context();
+
+  Context(const Context&) = delete;
+  Context(Context&&) noexcept = delete;
+  Context& operator=(const Context&) = delete;
+  Context& operator=(Context&&) noexcept = delete;
+
+  // Appends the object's buffer to buffers() — presumably so the memory stays referenced while
+  // work queued on the stream may still use it (TODO confirm with the consumers of buffers()).
+  void Track(const Tensor& tensor) { buffers_.push_back(tensor.buffer()); }
+  void Track(const Mat& mat) { buffers_.push_back(mat.buffer()); }
+  void Track(const Buffer& buffer) { buffers_.push_back(buffer); }
+
+  // Constructs a T (Tensor, Mat or Buffer) from `args` and tracks it before returning it.
+  template <typename T, typename... Args>
+  T Create(Args&&... args) {
+    return _track(T((Args &&) args...));
+  }
+
+  const Device& device() const noexcept { return device_; }
+  Stream& stream() noexcept { return stream_; }
+  const std::vector<Buffer>& buffers() const noexcept { return buffers_; }
+
+  // When set, operation::Create() resolves the "dummy" implementations and Secure() skips
+  // device transfers.
+  bool use_dummy() const noexcept { return use_dummy_; }
+  void set_use_dummy(bool value) noexcept { use_dummy_ = value; }
+
+ private:
+  // Track-and-forward helpers backing Create().
+  Tensor&& _track(Tensor&& tensor) {
+    Track(tensor);
+    return std::move(tensor);
+  }
+  Mat&& _track(Mat&& mat) {
+    Track(mat);
+    return std::move(mat);
+  }
+  Buffer&& _track(Buffer&& buffer) {
+    Track(buffer);
+    return std::move(buffer);
+  }
+
+ private:
+  Device device_;
+  Stream stream_;
+  std::vector<Buffer> buffers_;
+  bool use_dummy_{false};
+  Context* parent_;  // context that was current when this one was created; restored on exit
+};
+
+// Returns the innermost active Context of the calling thread; logs an error and aborts the
+// process when called outside any Context scope.
+MMDEPLOY_API Context& gContext();
+
+// Creates an implementation of operation interface `T`, forwarding `args` to the registered
+// factory.
+//
+// Normal mode: tries the platform of the current context's device first and, if that device
+// reports is_device(), falls back to Device(0) (presumably the host — TODO confirm). Dummy
+// mode: only the "dummy" platform is considered. If no factory is found, the platforms tried
+// are logged and eNotSupported is raised via throw_exception.
+template <typename T, typename... Args>
+static unique_ptr<T> Create(Args&&... args) {
+  std::vector<string_view> tried;
+  if (!gContext().use_dummy()) {
+    std::vector<Device> candidates{gContext().device()};
+    if (candidates[0].is_device()) {
+      candidates.emplace_back(0);
+    }
+    for (const auto& device : candidates) {
+      if (auto platform = GetPlatformName(device)) {
+        tried.emplace_back(platform);
+        if (auto creator = gRegistry<T>().Get(platform)) {
+          // Operation's constructor reads gContext().device(), so a nested context is opened to
+          // bind the new operation to the candidate device (relevant for the fallback).
+          // NOTE(review): the nested context shares the current stream and its destructor
+          // waits on that stream, so a successful creation synchronizes the stream.
+          Context context(device);
+          return creator->Create((Args &&) args...);
+        }
+      }
+    }
+  } else {
+    tried.emplace_back("dummy");
+    if (auto creator = gRegistry<T>().Get("dummy")) {
+      return creator->Create((Args &&) args...);
+    }
+  }
+  MMDEPLOY_ERROR("Unable to create operation, tried platforms: {}", tried);
+  throw_exception(eNotSupported);
+}
+
+// Base class of all operations. The device is captured from the current context at
+// construction time and stays fixed; the stream is looked up from the current context on
+// every call.
+class Operation {
+ public:
+  // Requires an active Context (gContext() aborts otherwise).
+  Operation() : device_(gContext().device()) {}
+  virtual ~Operation() = default;
+
+  const Device& device() const noexcept { return device_; }
+  // Stream of whichever context is current when this is called.
+  static Stream& stream() noexcept { return gContext().stream(); }
+
+ protected:
+  Device device_;
+};
+
+}  // namespace mmdeploy::operation
+
+#endif  // MMDEPLOY_CSRC_MMDEPLOY_PREPROCESS_OPERATION_OPERATION_H_
--- a/csrc/mmdeploy/operation/vision.cpp
+++ b/csrc/mmdeploy/operation/vision.cpp
@ -0,0 +1,16 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "mmdeploy/operation/vision.h"
+
+namespace mmdeploy::operation {
+
+// One registry definition per operation interface declared in vision.h (the macro itself is
+// defined elsewhere; presumably it pairs with MMDEPLOY_DECLARE_REGISTRY).
+MMDEPLOY_DEFINE_REGISTRY(CvtColor);
+MMDEPLOY_DEFINE_REGISTRY(Resize);
+MMDEPLOY_DEFINE_REGISTRY(Pad);
+MMDEPLOY_DEFINE_REGISTRY(ToFloat);
+MMDEPLOY_DEFINE_REGISTRY(HWC2CHW);
+MMDEPLOY_DEFINE_REGISTRY(Normalize);
+MMDEPLOY_DEFINE_REGISTRY(Crop);
+MMDEPLOY_DEFINE_REGISTRY(Flip);
+
+}  // namespace mmdeploy::operation
--- a/csrc/mmdeploy/operation/vision.h
+++ b/csrc/mmdeploy/operation/vision.h
@ -0,0 +1,83 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#ifndef MMDEPLOY_CSRC_MMDEPLOY_PREPROCESS_OPERATION_RESIZE_H_
+#define MMDEPLOY_CSRC_MMDEPLOY_PREPROCESS_OPERATION_RESIZE_H_
+
+#include "mmdeploy/core/mat.h"
+#include "mmdeploy/core/registry.h"
+#include "mmdeploy/core/tensor.h"
+#include "mmdeploy/operation/operation.h"
+
+namespace mmdeploy::operation {
+
+// pixel format conversion on Mat
+class CvtColor : public Operation {
+ public:
+  // Writes to `dst` the image `src` expressed in pixel format `dst_fmt`.
+  virtual Result<void> apply(const Mat& src, Mat& dst, PixelFormat dst_fmt) = 0;
+};
+// factories take no arguments
+MMDEPLOY_DECLARE_REGISTRY(CvtColor, unique_ptr<CvtColor>());
+
+// resize in HWC format
+class Resize : public Operation {
+ public:
+  // Resizes `src` (indexed as 1 x H x W x C) to dst_h x dst_w and stores the result in `dst`.
+  virtual Result<void> apply(const Tensor& src, Tensor& dst, int dst_h, int dst_w) = 0;
+};
+// `interp`: interpolation method name (the CUDA backend accepts "bilinear" and "nearest")
+MMDEPLOY_DECLARE_REGISTRY(Resize, unique_ptr<Resize>(const string_view& interp));
+
+// pad in HWC format
+class Pad : public Operation {
+ public:
+  // Extends `src` by `top`/`bottom` rows and `left`/`right` columns and stores the result in
+  // `dst` (height grows by top + bottom, width by left + right).
+  virtual Result<void> apply(const Tensor& src, Tensor& dst, int top, int left, int bottom,
+                             int right) = 0;
+};
+// `border_type`: border mode name (the CUDA backend accepts "constant", "edge", "reflect" and
+// "symmetric"); `pad_val`: fill value
+MMDEPLOY_DECLARE_REGISTRY(Pad, unique_ptr<Pad>(const string_view& border_type, float pad_val));
+
+// uint8 to float
+class ToFloat : public Operation {
+ public:
+  // Stores a float version of `src` in `dst`; the shape is unchanged.
+  virtual Result<void> apply(const Tensor& src, Tensor& dst) = 0;
+};
+// factories take no arguments
+MMDEPLOY_DECLARE_REGISTRY(ToFloat, unique_ptr<ToFloat>());
+
+// layout change from N x H x W x C to N x C x H x W
+class HWC2CHW : public Operation {
+ public:
+  // Stores the transposed version of `src` in `dst`; the element type is unchanged.
+  virtual Result<void> apply(const Tensor& src, Tensor& dst) = 0;
+};
+// factories take no arguments
+MMDEPLOY_DECLARE_REGISTRY(HWC2CHW, unique_ptr<HWC2CHW>());
+
+// normalize in HWC format
+class Normalize : public Operation {
+ public:
+  // Construction parameters handed to the factory.
+  struct Param {
+    // per-channel statistics — presumably applied as (x - mean) / std; confirm against the
+    // backend kernels
+    std::vector<float> mean;
+    std::vector<float> std;
+    // presumably requests a BGR -> RGB channel swap — TODO confirm.
+    // NOTE: has no default member initializer, so it must be set explicitly (or the struct
+    // value-initialized) before use.
+    bool to_rgb;
+  };
+
+  // Stores the normalized version of `src` in `dst`; the output is a float tensor of the same
+  // shape.
+  virtual Result<void> apply(const Tensor& src, Tensor& dst) = 0;
+};
+MMDEPLOY_DECLARE_REGISTRY(Normalize, unique_ptr<Normalize>(const Normalize::Param& param));
+
+// crop in HWC format
+class Crop : public Operation {
+ public:
+  // Extracts rows top..bottom and columns left..right of `src` into `dst`. Both bounds are
+  // inclusive: the output height is bottom - top + 1 and the width is right - left + 1.
+  virtual Result<void> apply(const Tensor& src, Tensor& dst, int top, int left, int bottom,
+                             int right) = 0;
+};
+// factories take no arguments
+MMDEPLOY_DECLARE_REGISTRY(Crop, unique_ptr<Crop>());
+
+// image flip
+class Flip : public Operation {
+ public:
+  // `flip_code` selects the flip axis — presumably following OpenCV's cv::flip convention;
+  // TODO confirm against the implementations.
+  explicit Flip(int flip_code) : flip_code_(flip_code) {}
+
+  // Stores the flipped version of `src` in `dst`.
+  virtual Result<void> apply(const Tensor& src, Tensor& dst) = 0;
+
+ protected:
+  int flip_code_;
+};
+MMDEPLOY_DECLARE_REGISTRY(Flip, unique_ptr<Flip>(int flip_code));
+
+// TODO: warp affine
+
+}  // namespace mmdeploy::operation
+
+#endif  // MMDEPLOY_CSRC_MMDEPLOY_PREPROCESS_OPERATION_RESIZE_H_
--- a/csrc/mmdeploy/preprocess/CMakeLists.txt
+++ b/csrc/mmdeploy/preprocess/CMakeLists.txt
@ -3,13 +3,10 @@
 project(mmdeploy_transform_module)

 add_subdirectory(transform)
-add_subdirectory(cpu)
+
 if (MMDEPLOY_ELENA_FUSION)
    add_subdirectory(elena)
 endif ()
-if ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES)
-    add_subdirectory(cuda)
-endif ()

 mmdeploy_add_module(${PROJECT_NAME} transform_module.cpp)
 target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy::transform)
--- a/csrc/mmdeploy/preprocess/cpu/CMakeLists.txt
+++ b/csrc/mmdeploy/preprocess/cpu/CMakeLists.txt
@ -1,21 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-project(mmdeploy_cpu_transform_impl)
-
-set(SRCS
-        collect_impl.cpp
-        crop_impl.cpp
-        ten_crop_impl.cpp
-        three_crop_impl.cpp
-        crop_utils.cpp
-        image2tensor_impl.cpp
-        default_format_bundle_impl.cpp
-        load_impl.cpp
-        normalize_impl.cpp
-        pad_impl.cpp
-        resize_impl.cpp)
-mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
-target_link_libraries(${PROJECT_NAME}
-        PRIVATE mmdeploy::transform
-        mmdeploy_opencv_utils)
-add_library(mmdeploy::transform_impl::cpu ALIAS ${PROJECT_NAME})
--- a/csrc/mmdeploy/preprocess/cpu/collect_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/collect_impl.cpp
@ -1,15 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/preprocess/transform/collect.h"
-
-namespace mmdeploy::cpu {
-
-class CollectImpl : public ::mmdeploy::CollectImpl {
- public:
-  CollectImpl(const Value& args) : ::mmdeploy::CollectImpl(args) {}
-  ~CollectImpl() = default;
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::CollectImpl, (cpu, 0), CollectImpl);
-
-}  // namespace mmdeploy::cpu
--- a/csrc/mmdeploy/preprocess/cpu/crop_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/crop_impl.cpp
@ -1,30 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/crop.h"
-#include "opencv_utils.h"
-
-using namespace std;
-
-namespace mmdeploy::cpu {
-
-class CenterCropImpl : public ::mmdeploy::CenterCropImpl {
- public:
-  explicit CenterCropImpl(const Value& args) : ::mmdeploy::CenterCropImpl(args) {}
-
- protected:
-  Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
-                           int right) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    cv::Mat mat = Tensor2CVMat(src_tensor);
-    cv::Mat cropped_mat = Crop(mat, top, left, bottom, right);
-    return CVMat2Tensor(cropped_mat);
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::CenterCropImpl, (cpu, 0), CenterCropImpl);
-
-}  // namespace mmdeploy::cpu
--- a/csrc/mmdeploy/preprocess/cpu/crop_utils.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/crop_utils.cpp
@ -1,24 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/crop.h"
-#include "mmdeploy/utils/opencv/opencv_utils.h"
-
-using namespace std;
-
-namespace mmdeploy {
-namespace cpu {
-
-Result<Tensor> CropImage(Stream& stream, const Device& device, const Tensor& tensor, int top,
-                         int left, int bottom, int right) {
-  OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device, stream));
-
-  SyncOnScopeExit(stream, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-  cv::Mat mat = Tensor2CVMat(src_tensor);
-  cv::Mat cropped_mat = Crop(mat, top, left, bottom, right);
-  return CVMat2Tensor(cropped_mat);
-}
-
-}  // namespace cpu
-}  // namespace mmdeploy
--- a/csrc/mmdeploy/preprocess/cpu/default_format_bundle_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/default_format_bundle_impl.cpp
@ -1,52 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/default_format_bundle.h"
-#include "opencv_utils.h"
-
-namespace mmdeploy::cpu {
-
-class DefaultFormatBundleImpl : public ::mmdeploy::DefaultFormatBundleImpl {
- public:
-  explicit DefaultFormatBundleImpl(const Value& args) : ::mmdeploy::DefaultFormatBundleImpl(args) {}
-
- protected:
-  Result<Tensor> ToFloat32(const Tensor& tensor, const bool& img_to_float) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    auto data_type = src_tensor.desc().data_type;
-
-    if (img_to_float && data_type == DataType::kINT8) {
-      auto cvmat = Tensor2CVMat(src_tensor);
-      cvmat.convertTo(cvmat, CV_32FC(cvmat.channels()));
-      auto dst_tensor = CVMat2Tensor(cvmat);
-      return dst_tensor;
-    }
-    return src_tensor;
-  }
-
-  Result<Tensor> HWC2CHW(const Tensor& tensor) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    auto shape = src_tensor.shape();
-    int height = shape[1];
-    int width = shape[2];
-    int channels = shape[3];
-
-    auto dst_mat = Transpose(Tensor2CVMat(src_tensor));
-
-    auto dst_tensor = CVMat2Tensor(dst_mat);
-    dst_tensor.Reshape({1, channels, height, width});
-
-    return dst_tensor;
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::DefaultFormatBundleImpl, (cpu, 0),
-                                 DefaultFormatBundleImpl);
-
-}  // namespace mmdeploy::cpu
--- a/csrc/mmdeploy/preprocess/cpu/image2tensor_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/image2tensor_impl.cpp
@ -1,35 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/image2tensor.h"
-#include "opencv_utils.h"
-
-namespace mmdeploy::cpu {
-
-class ImageToTensorImpl : public ::mmdeploy::ImageToTensorImpl {
- public:
-  explicit ImageToTensorImpl(const Value& args) : ::mmdeploy::ImageToTensorImpl(args) {}
-
- protected:
-  Result<Tensor> HWC2CHW(const Tensor& tensor) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    auto shape = src_tensor.shape();
-    int height = shape[1];
-    int width = shape[2];
-    int channels = shape[3];
-
-    auto dst_mat = Transpose(Tensor2CVMat(src_tensor));
-
-    auto dst_tensor = CVMat2Tensor(dst_mat);
-    dst_tensor.Reshape({1, channels, height, width});
-
-    return dst_tensor;
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::ImageToTensorImpl, (cpu, 0), ImageToTensorImpl);
-
-}  // namespace mmdeploy::cpu
--- a/csrc/mmdeploy/preprocess/cpu/load_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/load_impl.cpp
@ -1,41 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/preprocess/transform/load.h"
-#include "opencv_utils.h"
-
-using namespace std;
-
-namespace mmdeploy::cpu {
-
-class PrepareImageImpl : public ::mmdeploy::PrepareImageImpl {
- public:
-  explicit PrepareImageImpl(const Value& args) : ::mmdeploy::PrepareImageImpl(args){};
-  ~PrepareImageImpl() override = default;
-
- protected:
-  Result<Tensor> ConvertToBGR(const Mat& img) override {
-    auto src_mat = Mat2CVMat(img);
-    auto dst_mat = ColorTransfer(src_mat, img.pixel_format(), PixelFormat::kBGR);
-    if (arg_.to_float32) {
-      cv::Mat _dst_mat;
-      dst_mat.convertTo(_dst_mat, CV_32FC3);
-      dst_mat = _dst_mat;
-    }
-    return ::mmdeploy::cpu::CVMat2Tensor(dst_mat);
-  }
-
-  Result<Tensor> ConvertToGray(const Mat& img) override {
-    auto src_mat = Mat2CVMat(img);
-    auto dst_mat = ColorTransfer(src_mat, img.pixel_format(), PixelFormat::kGRAYSCALE);
-    if (arg_.to_float32) {
-      cv::Mat _dst_mat;
-      dst_mat.convertTo(_dst_mat, CV_32FC1);
-      dst_mat = _dst_mat;
-    }
-    return ::mmdeploy::cpu::CVMat2Tensor(dst_mat);
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::PrepareImageImpl, (cpu, 0), PrepareImageImpl);
-
-}  // namespace mmdeploy::cpu
--- a/csrc/mmdeploy/preprocess/cpu/normalize_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/normalize_impl.cpp
@ -1,39 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/registry.h"
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/normalize.h"
-#include "opencv_utils.h"
-
-using namespace std;
-
-namespace mmdeploy::cpu {
-
-class NormalizeImpl : public ::mmdeploy::NormalizeImpl {
- public:
-  NormalizeImpl(const Value& value) : ::mmdeploy::NormalizeImpl(value){};
-  ~NormalizeImpl() = default;
-
- protected:
-  Result<Tensor> NormalizeImage(const Tensor& tensor) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    auto mat = Tensor2CVMat(src_tensor);
-    auto dst_mat = Normalize(mat, arg_.mean, arg_.std, arg_.to_rgb, true);
-    return CVMat2Tensor(dst_mat);
-  }
-
-  Result<Tensor> ConvertToRGB(const Tensor& tensor) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-    SyncOnScopeExit(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-    auto src_mat = Tensor2CVMat(tensor);
-    auto dst_mat = ColorTransfer(src_mat, PixelFormat::kBGR, PixelFormat::kRGB);
-    return CVMat2Tensor(dst_mat);
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::NormalizeImpl, (cpu, 0), NormalizeImpl);
-
-}  // namespace mmdeploy::cpu
--- a/csrc/mmdeploy/preprocess/cpu/pad_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/pad_impl.cpp
@ -1,42 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/pad.h"
-#include "opencv_utils.h"
-
-using namespace std;
-
-namespace mmdeploy::cpu {
-
-class PadImpl : public ::mmdeploy::PadImpl {
- public:
-  PadImpl(const Value& args) : ::mmdeploy::PadImpl(args) {
-    static map<string, int> border_map{{"constant", cv::BORDER_CONSTANT},
-                                       {"edge", cv::BORDER_REPLICATE},
-                                       {"reflect", cv::BORDER_REFLECT_101},
-                                       {"symmetric", cv::BORDER_REFLECT}};
-    if (border_map.find(arg_.padding_mode) == border_map.end()) {
-      MMDEPLOY_ERROR("unsupported padding_mode '{}'", arg_.padding_mode);
-      throw std::invalid_argument("unsupported padding_mode");
-    }
-    border_type_ = border_map[arg_.padding_mode];
-  }
-
- protected:
-  Result<Tensor> PadImage(const Tensor& img, const std::array<int, 4>& padding) override {
-    OUTCOME_TRY(auto tensor, MakeAvailableOnDevice(img, device_, stream_));
-
-    SyncOnScopeExit(stream_, tensor.buffer() != img.buffer(), tensor);
-
-    cv::Mat dst_mat = Pad(Tensor2CVMat(tensor), padding[1], padding[0], padding[3], padding[2],
-                          border_type_, arg_.pad_val);
-    return CVMat2Tensor(dst_mat);
-  }
-
- private:
-  int border_type_;
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::PadImpl, (cpu, 0), PadImpl);
-
-}  // namespace mmdeploy::cpu
--- a/csrc/mmdeploy/preprocess/cpu/resize_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/resize_impl.cpp
@ -1,33 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/registry.h"
-#include "mmdeploy/core/tensor.h"
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/resize.h"
-#include "opencv_utils.h"
-
-using namespace std;
-
-namespace mmdeploy::cpu {
-
-class ResizeImpl final : public ::mmdeploy::ResizeImpl {
- public:
-  ResizeImpl(const Value& args) : ::mmdeploy::ResizeImpl(args) {}
-  ~ResizeImpl() = default;
-
- protected:
-  Result<Tensor> ResizeImage(const Tensor& img, int dst_h, int dst_w) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(img, device_, stream_));
-
-    SyncOnScopeExit(stream_, src_tensor.buffer() != img.buffer(), src_tensor);
-
-    auto src_mat = Tensor2CVMat(src_tensor);
-    auto dst_mat = Resize(src_mat, dst_h, dst_w, arg_.interpolation);
-
-    return CVMat2Tensor(dst_mat);
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::ResizeImpl, (cpu, 0), ResizeImpl);
-
-}  // namespace mmdeploy::cpu
--- a/csrc/mmdeploy/preprocess/cpu/ten_crop_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/ten_crop_impl.cpp
@ -1,38 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/ten_crop.h"
-#include "opencv_utils.h"
-
-using namespace std;
-
-namespace mmdeploy {
-namespace cpu {
-
-Result<Tensor> CropImage(Stream& stream, const Device& device, const Tensor& tensor, int top,
-                         int left, int bottom, int right);
-
-class TenCropImpl : public ::mmdeploy::TenCropImpl {
- public:
-  explicit TenCropImpl(const Value& args) : ::mmdeploy::TenCropImpl(args) {}
-
- protected:
-  Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
-                           int right) override {
-    return ::mmdeploy::cpu::CropImage(stream_, device_, tensor, top, left, bottom, right);
-  }
-
-  Result<Tensor> HorizontalFlip(const Tensor& tensor) {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-    SyncOnScopeExit(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-    cv::Mat mat = Tensor2CVMat(src_tensor);
-    cv::Mat flipped_mat;
-    cv::flip(mat, flipped_mat, 1);
-    return CVMat2Tensor(flipped_mat);
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::TenCropImpl, (cpu, 0), TenCropImpl);
-
-}  // namespace cpu
-}  // namespace mmdeploy
--- a/csrc/mmdeploy/preprocess/cpu/three_crop_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cpu/three_crop_impl.cpp
@ -1,29 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/three_crop.h"
-#include "opencv_utils.h"
-
-using namespace std;
-
-namespace mmdeploy {
-namespace cpu {
-
-Result<Tensor> CropImage(Stream& stream, const Device& device, const Tensor& tensor, int top,
-                         int left, int bottom, int right);
-
-class ThreeCropImpl : public ::mmdeploy::ThreeCropImpl {
- public:
-  explicit ThreeCropImpl(const Value& args) : ::mmdeploy::ThreeCropImpl(args) {}
-
- protected:
-  Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
-                           int right) override {
-    return ::mmdeploy::cpu::CropImage(stream_, device_, tensor, top, left, bottom, right);
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::ThreeCropImpl, (cpu, 0), ThreeCropImpl);
-
-}  // namespace cpu
-}  // namespace mmdeploy
--- a/csrc/mmdeploy/preprocess/cuda/CMakeLists.txt
+++ b/csrc/mmdeploy/preprocess/cuda/CMakeLists.txt
@ -1,28 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-project(mmdeploy_cuda_transform_impl CUDA CXX)
-
-find_package(pplcv REQUIRED)
-
-set(SRCS
-        collect_impl.cpp
-        crop_impl.cpp
-        three_crop_impl.cpp
-        ten_crop_impl.cpp
-        crop_utils.cpp
-        image2tensor_impl.cpp
-        default_format_bundle_impl.cpp
-        load_impl.cpp
-        normalize_impl.cpp
-        pad_impl.cpp
-        resize_impl.cpp
-        cast.cu
-        crop.cu
-        normalize.cu
-        transpose.cu)
-mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
-target_link_libraries(${PROJECT_NAME} PRIVATE
-        mmdeploy::transform ${PPLCV_LIBRARIES})
-target_include_directories(${PROJECT_NAME}
-        PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/include ${PPLCV_INCLUDE_DIRS})
-add_library(mmdeploy::transform_impl::cuda ALIAS ${PROJECT_NAME})
--- a/csrc/mmdeploy/preprocess/cuda/cast.cu
+++ b/csrc/mmdeploy/preprocess/cuda/cast.cu
@ -1,36 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include <stdint.h>
-
-namespace mmdeploy {
-namespace cuda {
-
-template <int channels>
-__global__ void cast(const uint8_t *src, int height, int width, float *dst) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-  if (x >= width || y >= height) return;
-
-  int loc = (y * width + x) * channels;
-  for (int i = 0; i < channels; ++i) {
-    dst[loc + i] = src[loc + i];
-  }
-}
-
-template <int channels>
-void CastToFloat(const uint8_t *src, int height, int width, float *dst, cudaStream_t stream) {
-  const dim3 thread_block(32, 8);
-  const dim3 block_num((width + thread_block.x - 1) / thread_block.x,
-                       (height + thread_block.y - 1) / thread_block.y);
-  cast<channels><<<block_num, thread_block, 0, stream>>>(src, height, width, dst);
-}
-
-template void CastToFloat<3>(const uint8_t *src, int height, int width, float *dst,
-                             cudaStream_t stream);
-
-template void CastToFloat<1>(const uint8_t *src, int height, int width, float *dst,
-                             cudaStream_t stream);
-
-}  // namespace cuda
-}  // namespace mmdeploy
--- a/csrc/mmdeploy/preprocess/cuda/collect_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/collect_impl.cpp
@ -1,15 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/preprocess/transform/collect.h"
-
-namespace mmdeploy::cuda {
-
-class CollectImpl : public ::mmdeploy::CollectImpl {
- public:
-  CollectImpl(const Value& args) : ::mmdeploy::CollectImpl(args) {}
-  ~CollectImpl() = default;
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::CollectImpl, (cuda, 0), CollectImpl);
-
-}  // namespace mmdeploy::cuda
--- a/csrc/mmdeploy/preprocess/cuda/crop_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/crop_impl.cpp
@ -1,71 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include <cuda_runtime.h>
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/crop.h"
-
-using namespace std;
-
-namespace mmdeploy::cuda {
-
-template <typename T, int channels>
-void Crop(const T* src, int src_w, T* dst, int dst_h, int dst_w, int offset_h, int offset_w,
-          cudaStream_t stream);
-
-class CenterCropImpl : public ::mmdeploy::CenterCropImpl {
- public:
-  explicit CenterCropImpl(const Value& args) : ::mmdeploy::CenterCropImpl(args) {}
-
- protected:
-  Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
-                           int right) override {
-    OUTCOME_TRY(auto device_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit sync(stream_, device_tensor.buffer() != tensor.buffer(), device_tensor);
-
-    auto stream = GetNative<cudaStream_t>(stream_);
-    auto desc = device_tensor.desc();
-
-    int h = bottom - top + 1;
-    int w = right - left + 1;
-    int c = desc.shape[3];
-    auto type = desc.data_type;
-
-    TensorShape shape{1, bottom - top + 1, right - left + 1, tensor.desc().shape[3]};
-    TensorDesc dst_desc{device_, tensor.desc().data_type, shape, desc.name};
-    Tensor dst_tensor{dst_desc};
-    assert(device_.is_device());
-    if (DataType::kINT8 == type) {
-      uint8_t* input = device_tensor.data<uint8_t>();
-      uint8_t* output = dst_tensor.data<uint8_t>();
-      if (3 == c) {
-        Crop<uint8_t, 3>(input, desc.shape[2], output, h, w, top, left, stream);
-      } else if (1 == c) {
-        Crop<uint8_t, 1>(input, desc.shape[2], output, h, w, top, left, stream);
-      } else {
-        MMDEPLOY_ERROR("unsupported channels {}", c);
-        return Status(eNotSupported);
-      }
-    } else if (DataType::kFLOAT == type) {
-      float* input = static_cast<float*>(device_tensor.buffer().GetNative());
-      float* output = static_cast<float*>(dst_tensor.buffer().GetNative());
-      if (3 == c) {
-        Crop<float, 3>(input, desc.shape[2], output, h, w, top, left, stream);
-      } else if (1 == c) {
-        Crop<float, 1>(input, desc.shape[2], output, h, w, top, left, stream);
-      } else {
-        MMDEPLOY_ERROR("unsupported channels {}", c);
-        return Status(eNotSupported);
-      }
-    } else {
-      MMDEPLOY_ERROR("unsupported channels {}", c);
-      return Status(eNotSupported);
-    }
-    return dst_tensor;
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::CenterCropImpl, (cuda, 0), CenterCropImpl);
-
-}  // namespace mmdeploy::cuda
--- a/csrc/mmdeploy/preprocess/cuda/crop_utils.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/crop_utils.cpp
@ -1,66 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include <cuda_runtime.h>
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/crop.h"
-
-using namespace std;
-
-namespace mmdeploy {
-namespace cuda {
-
-template <typename T, int channels>
-void Crop(const T* src, int src_w, T* dst, int dst_h, int dst_w, int offset_h, int offset_w,
-          cudaStream_t stream);
-
-Result<Tensor> CropImage(Stream& _stream, const Device& device, const Tensor& tensor, int top,
-                         int left, int bottom, int right) {
-  OUTCOME_TRY(auto device_tensor, MakeAvailableOnDevice(tensor, device, _stream));
-
-  SyncOnScopeExit sync(_stream, device_tensor.buffer() != tensor.buffer(), device_tensor);
-
-  auto stream = GetNative<cudaStream_t>(_stream);
-  auto desc = device_tensor.desc();
-
-  int h = bottom - top + 1;
-  int w = right - left + 1;
-  int c = desc.shape[3];
-  auto type = desc.data_type;
-
-  TensorShape shape{1, bottom - top + 1, right - left + 1, tensor.desc().shape[3]};
-  TensorDesc dst_desc{device, tensor.desc().data_type, shape, desc.name};
-  Tensor dst_tensor{dst_desc};
-  assert(device.is_device());
-  if (DataType::kINT8 == type) {
-    uint8_t* input = device_tensor.data<uint8_t>();
-    uint8_t* output = dst_tensor.data<uint8_t>();
-    if (3 == c) {
-      Crop<uint8_t, 3>(input, desc.shape[2], output, h, w, top, left, stream);
-    } else if (1 == c) {
-      Crop<uint8_t, 1>(input, desc.shape[2], output, h, w, top, left, stream);
-    } else {
-      MMDEPLOY_ERROR("unsupported channels {}", c);
-      return Status(eNotSupported);
-    }
-  } else if (DataType::kFLOAT == type) {
-    float* input = static_cast<float*>(device_tensor.buffer().GetNative());
-    float* output = static_cast<float*>(dst_tensor.buffer().GetNative());
-    if (3 == c) {
-      Crop<float, 3>(input, desc.shape[2], output, h, w, top, left, stream);
-    } else if (1 == c) {
-      Crop<float, 1>(input, desc.shape[2], output, h, w, top, left, stream);
-    } else {
-      MMDEPLOY_ERROR("unsupported channels {}", c);
-      return Status(eNotSupported);
-    }
-  } else {
-    MMDEPLOY_ERROR("unsupported channels {}", c);
-    return Status(eNotSupported);
-  }
-  return dst_tensor;
-}
-
-}  // namespace cuda
-
-}  // namespace mmdeploy
--- a/csrc/mmdeploy/preprocess/cuda/default_format_bundle_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/default_format_bundle_impl.cpp
@ -1,81 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include <cuda_runtime.h>
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/default_format_bundle.h"
-
-namespace mmdeploy::cuda {
-
-template <int channels>
-void CastToFloat(const uint8_t* src, int height, int width, float* dst, cudaStream_t stream);
-
-template <typename T>
-void Transpose(const T* src, int height, int width, int channels, T* dst, cudaStream_t stream);
-
-class DefaultFormatBundleImpl final : public ::mmdeploy::DefaultFormatBundleImpl {
- public:
-  explicit DefaultFormatBundleImpl(const Value& args) : ::mmdeploy::DefaultFormatBundleImpl(args) {}
-
- protected:
-  Result<Tensor> ToFloat32(const Tensor& tensor, const bool& img_to_float) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit sync(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    auto data_type = src_tensor.data_type();
-    auto h = tensor.shape(1);
-    auto w = tensor.shape(2);
-    auto c = tensor.shape(3);
-    auto stream = ::mmdeploy::GetNative<cudaStream_t>(stream_);
-
-    if (img_to_float && data_type == DataType::kINT8) {
-      TensorDesc desc{device_, DataType::kFLOAT, tensor.shape(), ""};
-      Tensor dst_tensor{desc};
-      if (c == 3) {
-        CastToFloat<3>(src_tensor.data<uint8_t>(), h, w, dst_tensor.data<float>(), stream);
-      } else if (c == 1) {
-        CastToFloat<1>(src_tensor.data<uint8_t>(), h, w, dst_tensor.data<float>(), stream);
-      } else {
-        MMDEPLOY_ERROR("channel num: unsupported channel num {}", c);
-        return Status(eNotSupported);
-      }
-      return dst_tensor;
-    }
-    return src_tensor;
-  }
-
-  Result<Tensor> HWC2CHW(const Tensor& tensor) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit sync(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    auto h = tensor.shape(1);
-    auto w = tensor.shape(2);
-    auto c = tensor.shape(3);
-    auto hw = h * w;
-
-    Tensor dst_tensor(src_tensor.desc());
-    dst_tensor.Reshape({1, c, h, w});
-
-    auto stream = ::mmdeploy::GetNative<cudaStream_t>(stream_);
-
-    if (DataType::kINT8 == tensor.data_type()) {
-      auto input = src_tensor.data<uint8_t>();
-      auto output = dst_tensor.data<uint8_t>();
-      Transpose(input, (int)h, (int)w, (int)c, output, stream);
-    } else if (DataType::kFLOAT == tensor.data_type()) {
-      auto input = src_tensor.data<float>();
-      auto output = dst_tensor.data<float>();
-      Transpose(input, (int)h, (int)w, (int)c, output, stream);
-    } else {
-      assert(0);
-    }
-    return dst_tensor;
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::DefaultFormatBundleImpl, (cuda, 0),
-                                 DefaultFormatBundleImpl);
-
-}  // namespace mmdeploy::cuda
--- a/csrc/mmdeploy/preprocess/cuda/image2tensor_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/image2tensor_impl.cpp
@ -1,50 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include <cuda_runtime.h>
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/image2tensor.h"
-
-namespace mmdeploy::cuda {
-
-template <typename T>
-void Transpose(const T* src, int height, int width, int channels, T* dst, cudaStream_t stream);
-
-class ImageToTensorImpl final : public ::mmdeploy::ImageToTensorImpl {
- public:
-  explicit ImageToTensorImpl(const Value& args) : ::mmdeploy::ImageToTensorImpl(args) {}
-
- protected:
-  Result<Tensor> HWC2CHW(const Tensor& tensor) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit sync(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    auto h = tensor.shape(1);
-    auto w = tensor.shape(2);
-    auto c = tensor.shape(3);
-    auto hw = h * w;
-
-    Tensor dst_tensor(src_tensor.desc());
-    dst_tensor.Reshape({1, c, h, w});
-
-    auto stream = ::mmdeploy::GetNative<cudaStream_t>(stream_);
-
-    if (DataType::kINT8 == tensor.data_type()) {
-      auto input = src_tensor.data<uint8_t>();
-      auto output = dst_tensor.data<uint8_t>();
-      Transpose(input, (int)h, (int)w, (int)c, output, stream);
-    } else if (DataType::kFLOAT == tensor.data_type()) {
-      auto input = src_tensor.data<float>();
-      auto output = dst_tensor.data<float>();
-      Transpose(input, (int)h, (int)w, (int)c, output, stream);
-    } else {
-      assert(0);
-    }
-    return dst_tensor;
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::ImageToTensorImpl, (cuda, 0), ImageToTensorImpl);
-
-}  // namespace mmdeploy::cuda
--- a/csrc/mmdeploy/preprocess/cuda/load_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/load_impl.cpp
@ -1,167 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include <cuda_runtime.h>
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/core/utils/formatter.h"
-#include "mmdeploy/preprocess/transform/load.h"
-#include "ppl/cv/cuda/cvtcolor.h"
-
-using namespace std;
-using namespace ppl::cv::cuda;
-
-namespace mmdeploy::cuda {
-
-template <int channels>
-void CastToFloat(const uint8_t* src, int height, int width, float* dst, cudaStream_t stream);
-
-class PrepareImageImpl : public ::mmdeploy::PrepareImageImpl {
- public:
-  explicit PrepareImageImpl(const Value& args) : ::mmdeploy::PrepareImageImpl(args){};
-  ~PrepareImageImpl() override = default;
-
- protected:
-  Tensor Mat2Tensor(const mmdeploy::Mat& mat) {
-    TensorDesc desc{
-        mat.buffer().GetDevice(), mat.type(), {1, mat.height(), mat.width(), mat.channel()}, ""};
-    return Tensor(std::move(desc), mat.buffer());
-  }
-
- protected:
-  Result<Tensor> ConvertToBGR(const Mat& img) override {
-    auto _img = MakeAvailableOnDevice(img, device_, stream_);
-    auto src_mat = _img.value();
-    if (img.pixel_format() == PixelFormat::kBGR) {
-      return Mat2Tensor(src_mat);
-    }
-
-    cudaStream_t stream = ::mmdeploy::GetNative<cudaStream_t>(stream_);
-    Mat dst_mat(src_mat.height(), src_mat.width(), PixelFormat::kBGR, src_mat.type(), device_);
-
-    SyncOnScopeExit sync(stream_, true, src_mat, dst_mat);
-
-    ppl::common::RetCode ret = 0;
-
-    int src_h = src_mat.height();
-    int src_w = src_mat.width();
-    int src_c = src_mat.channel();
-    int src_stride = src_w * src_mat.channel();
-    uint8_t* src_ptr = src_mat.data<uint8_t>();
-
-    int dst_w = dst_mat.width();
-    int dst_stride = dst_w * dst_mat.channel();
-    uint8_t* dst_ptr = dst_mat.data<uint8_t>();
-
-    switch (img.pixel_format()) {
-      case PixelFormat::kRGB:
-        ret = RGB2BGR<uint8_t>(stream, src_h, src_w, src_stride, src_ptr, dst_stride, dst_ptr);
-        break;
-      case PixelFormat::kGRAYSCALE:
-        ret = GRAY2BGR<uint8_t>(stream, src_h, src_w, src_stride, src_ptr, dst_stride, dst_ptr);
-        break;
-      case PixelFormat::kNV12:
-        assert(src_c == 1);
-        NV122BGR<uint8_t>(stream, src_h, src_w, src_stride, src_ptr, dst_stride, dst_ptr);
-        break;
-      case PixelFormat::kNV21:
-        assert(src_c == 1);
-        NV212BGR<uint8_t>(stream, src_h, src_w, src_stride, src_ptr, dst_stride, dst_ptr);
-        break;
-      case PixelFormat::kBGRA:
-        BGRA2BGR<uint8_t>(stream, src_h, src_w, src_stride, src_ptr, dst_stride, dst_ptr);
-        break;
-      default:
-        MMDEPLOY_ERROR("src type: unknown type {}", img.pixel_format());
-        return Status(eNotSupported);
-    }
-    if (ret != 0) {
-      MMDEPLOY_ERROR("color transfer from {} to BGR failed, ret {}", img.pixel_format(), ret);
-      return Status(eFail);
-    }
-    if (arg_.to_float32) {
-      TensorDesc desc{device_, DataType::kFLOAT, {1, src_h, src_w, dst_mat.channel()}, ""};
-      Tensor dst_tensor{desc};
-      CastToFloat<3>(dst_ptr, src_h, src_w, dst_tensor.data<float>(), stream);
-      return dst_tensor;
-    } else {
-      return Mat2Tensor(dst_mat);
-    }
-  }
-
-  Result<Tensor> ConvertToGray(const Mat& img) override {
-    OUTCOME_TRY(auto src_mat, MakeAvailableOnDevice(img, device_, stream_));
-
-    if (img.pixel_format() == PixelFormat::kGRAYSCALE) {
-      return Mat2Tensor(src_mat);
-    }
-
-    cudaStream_t stream = ::mmdeploy::GetNative<cudaStream_t>(stream_);
-    Mat dst_mat(src_mat.height(), src_mat.width(), PixelFormat::kGRAYSCALE, src_mat.type(),
-                device_);
-
-    SyncOnScopeExit sync(stream_, true, src_mat, dst_mat);
-
-    ppl::common::RetCode ret = 0;
-
-    int src_h = src_mat.height();
-    int src_w = src_mat.width();
-    int src_c = src_mat.channel();
-    int src_stride = src_w * src_mat.channel();
-    uint8_t* src_ptr = src_mat.data<uint8_t>();
-
-    int dst_w = dst_mat.width();
-    int dst_stride = dst_w * dst_mat.channel();
-    uint8_t* dst_ptr = dst_mat.data<uint8_t>();
-
-    switch (img.pixel_format()) {
-      case PixelFormat::kRGB:
-        ret = RGB2GRAY<uint8_t>(stream, src_h, src_w, src_stride, src_ptr, dst_stride, dst_ptr);
-        break;
-      case PixelFormat::kBGR:
-        ret = BGR2GRAY<uint8_t>(stream, src_h, src_w, src_stride, src_ptr, dst_stride, dst_ptr);
-        break;
-      case PixelFormat::kNV12: {
-        assert(src_c == 1);
-        Mat rgb_mat(src_mat.height(), src_mat.width(), PixelFormat::kRGB, src_mat.type(), device_);
-        NV122RGB<uint8_t>(stream, src_h, src_w, src_stride, src_ptr,
-                          rgb_mat.width() * rgb_mat.channel(), rgb_mat.data<uint8_t>());
-        RGB2GRAY<uint8_t>(stream, rgb_mat.height(), rgb_mat.width(),
-                          rgb_mat.width() * rgb_mat.channel(), rgb_mat.data<uint8_t>(), dst_stride,
-                          dst_mat.data<uint8_t>());
-        break;
-      }
-      case PixelFormat::kNV21: {
-        assert(src_c == 1);
-        Mat rgb_mat(src_mat.height(), src_mat.width(), PixelFormat::kRGB, src_mat.type(), device_);
-        NV212RGB<uint8_t>(stream, src_h, src_w, src_stride, src_ptr,
-                          rgb_mat.width() * rgb_mat.channel(), rgb_mat.data<uint8_t>());
-        RGB2GRAY<uint8_t>(stream, rgb_mat.height(), rgb_mat.width(),
-                          rgb_mat.width() * rgb_mat.channel(), rgb_mat.data<uint8_t>(), dst_stride,
-                          dst_mat.data<uint8_t>());
-        break;
-      }
-      case PixelFormat::kBGRA:
-        BGRA2GRAY<uint8_t>(stream, src_h, src_w, src_stride, src_ptr, dst_stride, dst_ptr);
-        break;
-      default:
-        MMDEPLOY_ERROR("src type: unknown type {}", img.pixel_format());
-        throw Status(eNotSupported);
-    }
-    if (ret != 0) {
-      MMDEPLOY_ERROR("color transfer from {} to Gray failed", img.pixel_format());
-      throw Status(eFail);
-    }
-    if (arg_.to_float32) {
-      TensorDesc desc{device_, DataType::kFLOAT, {1, src_h, src_w, dst_mat.channel()}, ""};
-      Tensor dst_tensor{desc};
-      CastToFloat<1>(dst_ptr, src_h, src_w, dst_tensor.data<float>(), stream);
-      return dst_tensor;
-    } else {
-      return Mat2Tensor(dst_mat);
-    }
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::PrepareImageImpl, (cuda, 0), PrepareImageImpl);
-
-}  // namespace mmdeploy::cuda
--- a/csrc/mmdeploy/preprocess/cuda/normalize_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/normalize_impl.cpp
@ -1,94 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include <cuda_runtime.h>
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/core/utils/formatter.h"
-#include "mmdeploy/preprocess/transform/normalize.h"
-#include "ppl/cv/cuda/cvtcolor.h"
-
-using namespace std;
-using namespace ppl::cv::cuda;
-
-namespace mmdeploy::cuda {
-
-template <typename T, int channels>
-void Normalize(const T* src, int height, int width, int stride, float* output, const float* mean,
-               const float* std, bool to_rgb, cudaStream_t stream);
-
-class NormalizeImpl : public ::mmdeploy::NormalizeImpl {
- public:
-  explicit NormalizeImpl(const Value& args) : ::mmdeploy::NormalizeImpl(args) {}
-
- protected:
-  Result<Tensor> NormalizeImage(const Tensor& tensor) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit sync(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    auto src_desc = src_tensor.desc();
-    int h = (int)src_desc.shape[1];
-    int w = (int)src_desc.shape[2];
-    int c = (int)src_desc.shape[3];
-    int stride = w * c;
-
-    TensorDesc dst_desc{device_, DataType::kFLOAT, src_desc.shape, src_desc.name};
-    Tensor dst_tensor{dst_desc};
-    auto output = dst_tensor.data<float>();
-    auto stream = ::mmdeploy::GetNative<cudaStream_t>(stream_);
-
-    if (DataType::kINT8 == src_desc.data_type) {
-      auto input = src_tensor.data<uint8_t>();
-      if (3 == c) {
-        Normalize<uint8_t, 3>(input, h, w, stride, output, arg_.mean.data(), arg_.std.data(),
-                              arg_.to_rgb, stream);
-      } else if (1 == c) {
-        Normalize<uint8_t, 1>(input, h, w, stride, output, arg_.mean.data(), arg_.std.data(),
-                              arg_.to_rgb, stream);
-      } else {
-        MMDEPLOY_ERROR("unsupported channels {}", c);
-        return Status(eNotSupported);
-      }
-    } else if (DataType::kFLOAT == src_desc.data_type) {
-      auto input = src_tensor.data<float>();
-      if (3 == c) {
-        Normalize<float, 3>(input, h, w, stride, output, arg_.mean.data(), arg_.std.data(),
-                            arg_.to_rgb, stream);
-      } else if (1 == c) {
-        Normalize<float, 1>(input, h, w, stride, output, arg_.mean.data(), arg_.std.data(),
-                            arg_.to_rgb, stream);
-      } else {
-        MMDEPLOY_ERROR("unsupported channels {}", c);
-        return Status(eNotSupported);
-      }
-    } else {
-      MMDEPLOY_ERROR("unsupported data type {}", src_desc.data_type);
-      assert(0);
-      return Status(eNotSupported);
-    }
-    return dst_tensor;
-  }
-
-  Result<Tensor> ConvertToRGB(const Tensor& tensor) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit sync(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    auto src_desc = src_tensor.desc();
-    int h = (int)src_desc.shape[1];
-    int w = (int)src_desc.shape[2];
-    int c = (int)src_desc.shape[3];
-    int stride = w * c;
-    auto stream = ::mmdeploy::GetNative<cudaStream_t>(stream_);
-
-    TensorDesc dst_desc{device_, DataType::kINT8, src_desc.shape, src_desc.name};
-    Tensor dst_tensor{dst_desc};
-    RGB2BGR<uint8_t>(stream, h, w, stride, tensor.data<uint8_t>(), stride,
-                     dst_tensor.data<uint8_t>());
-    return dst_tensor;
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::NormalizeImpl, (cuda, 0), NormalizeImpl);
-
-}  // namespace mmdeploy::cuda
--- a/csrc/mmdeploy/preprocess/cuda/pad_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/pad_impl.cpp
@ -1,98 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/core/utils/formatter.h"
-#include "mmdeploy/preprocess/transform/pad.h"
-#include "ppl/cv/cuda/copymakeborder.h"
-
-using namespace std;
-using namespace ppl::cv::cuda;
-
-namespace mmdeploy::cuda {
-
-class PadImpl : public ::mmdeploy::PadImpl {
- public:
-  explicit PadImpl(const Value& args) : ::mmdeploy::PadImpl(args) {
-    map<string, ppl::cv::BorderType> border_map{{"constant", ppl::cv::BORDER_CONSTANT},
-                                                {"edge", ppl::cv::BORDER_REPLICATE},
-                                                {"reflect", ppl::cv::BORDER_REFLECT_101},
-                                                {"symmetric", ppl::cv::BORDER_REFLECT}};
-    if (border_map.find(arg_.padding_mode) == border_map.end()) {
-      MMDEPLOY_ERROR("unsupported padding_mode '{}'", arg_.padding_mode);
-      throw_exception(eNotSupported);
-    }
-    padding_mode_ = border_map[arg_.padding_mode];
-  }
-
- protected:
-  Result<Tensor> PadImage(const Tensor& img, const array<int, 4>& padding) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(img, device_, stream_));
-
-    SyncOnScopeExit sync(stream_, src_tensor.buffer() != img.buffer(), src_tensor);
-
-    auto desc = src_tensor.desc();
-    int height = desc.shape[1];
-    int width = desc.shape[2];
-    int c = desc.shape[3];
-
-    auto dst_height = height + padding[1] + padding[3];
-    auto dst_width = width + padding[0] + padding[2];
-    TensorShape dst_shape{1, dst_height, dst_width, c};
-    TensorDesc dst_desc{device_, desc.data_type, dst_shape, ""};
-    Tensor dst_tensor(dst_desc);
-
-    ppl::common::RetCode ret = 0;
-    cudaStream_t stream = ::mmdeploy::GetNative<cudaStream_t>(stream_);
-
-    if (desc.data_type == DataType::kFLOAT) {
-      auto src_buffer = src_tensor.data<float>();
-      auto dst_buffer = dst_tensor.data<float>();
-      if (3 == c) {
-        ret = CopyMakeBorder<float, 3>(stream, height, width, width * c, src_buffer, dst_width * c,
-                                       dst_buffer, padding[1], padding[3], padding[0], padding[2],
-                                       padding_mode_, arg_.pad_val);
-      } else if (1 == c) {
-        ret = CopyMakeBorder<float, 1>(stream, height, width, width * c, src_buffer, dst_width * c,
-                                       dst_buffer, padding[1], padding[3], padding[0], padding[2],
-                                       padding_mode_, arg_.pad_val);
-      } else {
-        MMDEPLOY_ERROR("unsupported channels {}", c);
-        assert(0);
-        return Status(eNotSupported);
-      }
-    } else if (desc.data_type == DataType::kINT8) {
-      auto src_buffer = src_tensor.data<uint8_t>();
-      auto dst_buffer = dst_tensor.data<uint8_t>();
-      if (3 == c) {
-        ret = CopyMakeBorder<ppl::cv::uchar, 3>(
-            stream, height, width, width * c, src_buffer, dst_width * c, dst_buffer, padding[1],
-            padding[3], padding[0], padding[2], padding_mode_, (ppl::cv::uchar)arg_.pad_val);
-      } else if (1 == c) {
-        ret = CopyMakeBorder<ppl::cv::uchar, 1>(
-            stream, height, width, width * c, src_buffer, dst_width * c, dst_buffer, padding[1],
-            padding[3], padding[0], padding[2], padding_mode_, (ppl::cv::uchar)arg_.pad_val);
-      } else {
-        MMDEPLOY_ERROR("unsupported channels {}", c);
-        assert(0);
-        return Status(eNotSupported);
-      }
-    } else {
-      MMDEPLOY_ERROR("unsupported data type {}", desc.data_type);
-      assert(0);
-      return Status(eNotSupported);
-    }
-    if (ret != 0) {
-      MMDEPLOY_ERROR("unexpected exception happened");
-      assert(0);
-      return Status(eNotSupported);
-    }
-    return dst_tensor;
-  }
-
- private:
-  ppl::cv::BorderType padding_mode_;
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::PadImpl, (cuda, 0), PadImpl);
-
-}  // namespace mmdeploy::cuda
--- a/csrc/mmdeploy/preprocess/cuda/resize_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/resize_impl.cpp
@ -1,85 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/core/utils/formatter.h"
-#include "mmdeploy/preprocess/transform/resize.h"
-#include "ppl/cv/cuda/resize.h"
-
-using namespace std;
-
-namespace mmdeploy::cuda {
-
-class ResizeImpl final : public ::mmdeploy::ResizeImpl {
- public:
-  explicit ResizeImpl(const Value& args) : ::mmdeploy::ResizeImpl(args) {
-    if (arg_.interpolation != "bilinear" && arg_.interpolation != "nearest") {
-      MMDEPLOY_ERROR("{} interpolation is not supported", arg_.interpolation);
-      throw_exception(eNotSupported);
-    }
-  }
-  ~ResizeImpl() override = default;
-
- protected:
-  Result<Tensor> ResizeImage(const Tensor& tensor, int dst_h, int dst_w) override {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit sync(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    TensorDesc dst_desc{
-        device_, src_tensor.data_type(), {1, dst_h, dst_w, src_tensor.shape(3)}, src_tensor.name()};
-    Tensor dst_tensor(dst_desc);
-
-    auto stream = GetNative<cudaStream_t>(stream_);
-    if (tensor.data_type() == DataType::kINT8) {
-      OUTCOME_TRY(ResizeDispatch<uint8_t>(src_tensor, dst_tensor, stream));
-    } else if (tensor.data_type() == DataType::kFLOAT) {
-      OUTCOME_TRY(ResizeDispatch<float>(src_tensor, dst_tensor, stream));
-    } else {
-      MMDEPLOY_ERROR("unsupported data type {}", tensor.data_type());
-      return Status(eNotSupported);
-    }
-    return dst_tensor;
-  }
-
- private:
-  template <class T, int C, class... Args>
-  ppl::common::RetCode DispatchImpl(Args&&... args) {
-    if (arg_.interpolation == "bilinear") {
-      return ppl::cv::cuda::Resize<T, C>(std::forward<Args>(args)...,
-                                         ppl::cv::INTERPOLATION_LINEAR);
-    }
-    if (arg_.interpolation == "nearest") {
-      return ppl::cv::cuda::Resize<T, C>(std::forward<Args>(args)...,
-                                         ppl::cv::INTERPOLATION_NEAREST_POINT);
-    }
-    return ppl::common::RC_UNSUPPORTED;
-  }
-
-  template <class T>
-  Result<void> ResizeDispatch(const Tensor& src, Tensor& dst, cudaStream_t stream) {
-    int h = (int)src.shape(1);
-    int w = (int)src.shape(2);
-    int c = (int)src.shape(3);
-    int dst_h = (int)dst.shape(1);
-    int dst_w = (int)dst.shape(2);
-    ppl::common::RetCode ret = 0;
-
-    auto input = src.data<T>();
-    auto output = dst.data<T>();
-    if (1 == c) {
-      ret = DispatchImpl<T, 1>(stream, h, w, w * c, input, dst_h, dst_w, dst_w * c, output);
-    } else if (3 == c) {
-      ret = DispatchImpl<T, 3>(stream, h, w, w * c, input, dst_h, dst_w, dst_w * c, output);
-    } else if (4 == c) {
-      ret = DispatchImpl<T, 4>(stream, h, w, w * c, input, dst_h, dst_w, dst_w * c, output);
-    } else {
-      MMDEPLOY_ERROR("unsupported channels {}", c);
-      return Status(eNotSupported);
-    }
-    return ret == 0 ? success() : Result<void>(Status(eFail));
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::ResizeImpl, (cuda, 0), ResizeImpl);
-
-}  // namespace mmdeploy::cuda
--- a/csrc/mmdeploy/preprocess/cuda/ten_crop_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/ten_crop_impl.cpp
@ -1,77 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include <cuda_runtime.h>
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/core/utils/formatter.h"
-#include "mmdeploy/preprocess/transform/ten_crop.h"
-#include "ppl/cv/cuda/flip.h"
-
-using namespace std;
-
-namespace mmdeploy {
-namespace cuda {
-
-Result<Tensor> CropImage(Stream& stream, const Device& device, const Tensor& tensor, int top,
-                         int left, int bottom, int right);
-
-class TenCropImpl : public ::mmdeploy::TenCropImpl {
- public:
-  explicit TenCropImpl(const Value& args) : ::mmdeploy::TenCropImpl(args) {}
-
- protected:
-  Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
-                           int right) override {
-    return ::mmdeploy::cuda::CropImage(stream_, device_, tensor, top, left, bottom, right);
-  }
-
-  Result<Tensor> HorizontalFlip(const Tensor& tensor) {
-    OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
-
-    SyncOnScopeExit sync(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
-
-    TensorDesc dst_desc = tensor.desc();
-    dst_desc.device = device_;
-    Tensor dst_tensor(dst_desc);
-    auto stream = GetNative<cudaStream_t>(stream_);
-    int h = (int)tensor.shape(1);
-    int w = (int)tensor.shape(2);
-    int c = (int)tensor.shape(3);
-    ppl::common::RetCode ret;
-    if (tensor.data_type() == DataType::kINT8) {
-      auto input = tensor.data<uint8_t>();
-      auto output = dst_tensor.data<uint8_t>();
-      if (c == 1) {
-        ret = ppl::cv::cuda::Flip<uint8_t, 1>(stream, h, w, w * c, input, w * c, output, 1);
-      } else if (c == 3) {
-        ret = ppl::cv::cuda::Flip<uint8_t, 3>(stream, h, w, w * c, input, w * c, output, 1);
-      } else {
-        ret = ppl::common::RC_UNSUPPORTED;
-      }
-    } else if (tensor.data_type() == DataType::kFLOAT) {
-      auto input = tensor.data<float>();
-      auto output = dst_tensor.data<float>();
-      if (c == 1) {
-        ret = ppl::cv::cuda::Flip<float, 1>(stream, h, w, w * c, input, w * c, output, 1);
-      } else if (c == 3) {
-        ret = ppl::cv::cuda::Flip<float, 3>(stream, h, w, w * c, input, w * c, output, 1);
-      } else {
-        ret = ppl::common::RC_UNSUPPORTED;
-      }
-    } else {
-      MMDEPLOY_ERROR("unsupported data type {}", tensor.data_type());
-      return Status(eNotSupported);
-    }
-
-    if (ret != 0) {
-      return Status(eFail);
-    }
-
-    return dst_tensor;
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::TenCropImpl, (cuda, 0), TenCropImpl);
-
-}  // namespace cuda
-}  // namespace mmdeploy
--- a/csrc/mmdeploy/preprocess/cuda/three_crop_impl.cpp
+++ b/csrc/mmdeploy/preprocess/cuda/three_crop_impl.cpp
@ -1,30 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include <cuda_runtime.h>
-
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/preprocess/transform/three_crop.h"
-
-using namespace std;
-
-namespace mmdeploy {
-namespace cuda {
-
-Result<Tensor> CropImage(Stream& stream, const Device& device, const Tensor& tensor, int top,
-                         int left, int bottom, int right);
-
-class ThreeCropImpl : public ::mmdeploy::ThreeCropImpl {
- public:
-  explicit ThreeCropImpl(const Value& args) : ::mmdeploy::ThreeCropImpl(args) {}
-
- protected:
-  Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
-                           int right) override {
-    return ::mmdeploy::cuda::CropImage(stream_, device_, tensor, top, left, bottom, right);
-  }
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::ThreeCropImpl, (cuda, 0), ThreeCropImpl);
-
-}  // namespace cuda
-}  // namespace mmdeploy
--- a/csrc/mmdeploy/preprocess/elena/CMakeLists.txt
+++ b/csrc/mmdeploy/preprocess/elena/CMakeLists.txt
@ -3,29 +3,21 @@
 project(mmdeploy_elena_transform_impl)

 set(SRCS
-        crop_impl.cpp
-        collect_impl.cpp
-        image2tensor_impl.cpp
-        default_format_bundle_impl.cpp
-        load_impl.cpp
-        normalize_impl.cpp
-        pad_impl.cpp
-        resize_impl.cpp
+        fused.cpp
        elena_registry.cpp)

 file(GLOB CPU_KERNEL_SRCS "cpu_kernel/*.cpp")

 set(ALL_SRCS ${SRCS} ${CPU_KERNEL_SRCS})
 if ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES)
-        file(GLOB CUDA_KERNEL_SRCS "cuda_kernel/*.cu")
-        set(ALL_SRCS ${ALL_SRCS} ${CUDA_KERNEL_SRCS})
+    file(GLOB CUDA_KERNEL_SRCS "cuda_kernel/*.cu")
+    set(ALL_SRCS ${ALL_SRCS} ${CUDA_KERNEL_SRCS})
 endif ()

 mmdeploy_add_module(${PROJECT_NAME} "${ALL_SRCS}")
 target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(${PROJECT_NAME}
-        PRIVATE mmdeploy::transform)
+target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy::transform)
 if ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES)
-        target_link_libraries(${PROJECT_NAME} PRIVATE cuda)
+    target_link_libraries(${PROJECT_NAME} PRIVATE cuda)
 endif ()
 add_library(mmdeploy::transform_impl::elena ALIAS ${PROJECT_NAME})
--- a/csrc/mmdeploy/preprocess/elena/collect_impl.cpp
+++ b/csrc/mmdeploy/preprocess/elena/collect_impl.cpp
@ -1,132 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-#include <iostream>
-#include <set>
-#include <string>
-
-#include "elena_registry.h"
-#include "mmdeploy/archive/json_archive.h"
-#include "mmdeploy/core/mat.h"
-#include "mmdeploy/core/tensor.h"
-#include "mmdeploy/core/utils/device_utils.h"
-#include "mmdeploy/core/utils/formatter.h"
-#include "mmdeploy/preprocess/transform/collect.h"
-#include "mmdeploy/preprocess/transform/tracer.h"
-
-namespace mmdeploy::elena {
-
-using namespace trace;
-
-struct ExtractTransParamVisitor {
-  bool valid{true};
-  std::set<std::string> st;
-
-  std::array<float, 3> mean;
-  std::array<float, 3> std;
-  std::array<int, 2> resize_hw;
-  std::string resize_mode;
-  float pad_val;
-  std::array<int, 4> pad_tlbr;
-  std::array<int, 2> pad_hw;
-  std::array<int, 4> crop_tlbr;
-  std::array<int, 2> crop_hw;
-
-  void CheckValid(const std::string& name) {
-    if (st.count(name)) {
-      valid = false;
-      return;
-    }
-    st.insert(name);
-  }
-
-  void operator()(CvtColorParam&) {}
-  void operator()(CastParam&) {}
-  void operator()(HWC2CHWParam&) {}
-
-  void operator()(ResizeParam& param) {
-    CheckValid("Resize");
-    resize_hw = {param.size[0], param.size[1]};
-    resize_mode = param.mode;
-  }
-  void operator()(PadParam& param) {
-    CheckValid("Pad");
-    pad_val = param.pad_val;
-    std::copy_n(param.tlbr.begin(), 4, pad_tlbr.begin());
-    std::copy_n(param.size.begin(), 2, pad_hw.begin());
-  }
-  void operator()(NormParam& param) {
-    CheckValid("Normalize");
-    std::copy(param.mean.begin(), param.mean.end(), mean.begin());
-    std::copy(param.std.begin(), param.std.end(), std.begin());
-  }
-  void operator()(CropParam& param) {
-    CheckValid("CenterCrop");
-    std::copy_n(param.tlbr.begin(), 4, crop_tlbr.begin());
-    std::copy_n(param.size.begin(), 2, crop_hw.begin());
-  }
-};
-
-class CollectImpl : public ::mmdeploy::CollectImpl {
- public:
-  CollectImpl(const Value& args) : ::mmdeploy::CollectImpl(args) {
-    Platform platform(device_.platform_id());
-    device_name_ = platform.GetPlatformName();
-    sha256_ = args["context"].value("sha256", std::string(""));
-  }
-
-  ~CollectImpl() = default;
-
-  Result<Value> Process(const Value& input) override {
-    auto tracer = input["__tracer__"].get<Tracer>();
-    Mat _src_mat = input["ori_img"].get<Mat>();
-    OUTCOME_TRY(auto src_mat, MakeAvailableOnDevice(_src_mat, device_, stream_));
-    OUTCOME_TRY(stream_.Wait());
-
-    ExtractTransParamVisitor visitor{};
-    for (auto&& trans : tracer.trans_) {
-      std::visit(visitor, trans);
-    }
-    std::string tag = sha256_ + "_" + device_name_;
-    FuseFunc func = FuseKernel::Get().GetFunc(tag);
-
-    if (!visitor.valid) {
-      MMDEPLOY_ERROR("unsupported fuse transform");
-      throw std::invalid_argument("");
-    }
-    if (src_mat.type() != DataType::kINT8) {
-      MMDEPLOY_ERROR("unsupported data type in fuse transform");
-      throw std::invalid_argument("");
-    }
-    if (!func) {
-      MMDEPLOY_ERROR("can't find fuse function with tag: {}", tag);
-      throw std::invalid_argument("");
-    }
-
-    Value output = input;
-    auto img_fields = GetImageFields(input);
-    for (auto& key : img_fields) {
-      assert(input.contains(key));
-      auto src_tensor = input[key].get<Tensor>();
-      auto desc = src_tensor.desc();
-      desc.device = device_;
-      Tensor dst_tensor{desc};
-
-      func(stream_.GetNative(), src_mat.data<uint8_t>(), src_mat.height(), src_mat.width(),
-           to_string(src_mat.pixel_format()).c_str(), visitor.resize_hw[0], visitor.resize_hw[1],
-           visitor.resize_mode.c_str(), visitor.crop_tlbr[0], visitor.crop_tlbr[1],
-           visitor.crop_hw[0], visitor.crop_hw[1], visitor.mean[0], visitor.mean[1],
-           visitor.mean[2], visitor.std[0], visitor.std[1], visitor.std[2], visitor.pad_tlbr[0],
-           visitor.pad_tlbr[1], visitor.pad_tlbr[2], visitor.pad_tlbr[3], visitor.pad_hw[0],
-           visitor.pad_hw[1], visitor.pad_val, dst_tensor.data<float>(), dst_tensor.shape(2),
-           dst_tensor.shape(3));
-      output[key] = std::move(dst_tensor);
-    }
-    return ::mmdeploy::CollectImpl::Process(output);
-  }
-
-  std::string sha256_;
-  std::string device_name_;
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::CollectImpl, (elena, 0), CollectImpl);
-
-}  // namespace mmdeploy::elena
--- a/csrc/mmdeploy/preprocess/elena/crop_impl.cpp
+++ b/csrc/mmdeploy/preprocess/elena/crop_impl.cpp
@ -1,32 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/preprocess/transform/crop.h"
-
-using namespace std;
-
-namespace mmdeploy::elena {
-
-class CenterCropImpl : public ::mmdeploy::CenterCropImpl {
- public:
-  explicit CenterCropImpl(const Value& args) : ::mmdeploy::CenterCropImpl(args) {}
-
- protected:
-  Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
-                           int right) override {
-    auto& src_desc = tensor.desc();
-    auto data_type = src_desc.data_type;
-    auto shape = src_desc.shape;
-    shape[1] = bottom - top + 1;  // h
-    shape[2] = right - left + 1;  // w
-
-    TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
-    Tensor dummy(dummy_desc, dummy_buffer_);
-
-    return dummy;
-  }
-  Buffer dummy_buffer_{Device{"cpu"}, 0, nullptr};
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::CenterCropImpl, (elena, 0), CenterCropImpl);
-
-}  // namespace mmdeploy::elena
--- a/csrc/mmdeploy/preprocess/elena/default_format_bundle_impl.cpp
+++ b/csrc/mmdeploy/preprocess/elena/default_format_bundle_impl.cpp
@ -1,44 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/preprocess/transform/default_format_bundle.h"
-
-namespace mmdeploy::elena {
-
-class DefaultFormatBundleImpl : public ::mmdeploy::DefaultFormatBundleImpl {
- public:
-  explicit DefaultFormatBundleImpl(const Value& args) : ::mmdeploy::DefaultFormatBundleImpl(args) {}
-
- protected:
-  Result<Tensor> ToFloat32(const Tensor& tensor, const bool& img_to_float) override {
-    auto& src_desc = tensor.desc();
-    auto data_type = src_desc.data_type;
-    auto shape = src_desc.shape;
-
-    if (img_to_float && data_type == DataType::kINT8) {
-      data_type = DataType::kFLOAT;
-    }
-
-    TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
-    Tensor dummy(dummy_desc, dummy_buffer_);
-
-    return dummy;
-  }
-
-  Result<Tensor> HWC2CHW(const Tensor& tensor) override {
-    auto& src_desc = tensor.desc();
-    auto data_type = src_desc.data_type;
-    auto shape = src_desc.shape;
-    shape = {shape[0], shape[3], shape[1], shape[2]};
-
-    TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
-    Tensor dummy(dummy_desc, dummy_buffer_);
-
-    return dummy;
-  }
-  Buffer dummy_buffer_{Device{"cpu"}, 0, nullptr};
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::DefaultFormatBundleImpl, (elena, 0),
-                                 DefaultFormatBundleImpl);
-
-}  // namespace mmdeploy::elena
--- a/csrc/mmdeploy/preprocess/elena/fused.cpp
+++ b/csrc/mmdeploy/preprocess/elena/fused.cpp
@ -0,0 +1,138 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include <set>
+
+#include "mmdeploy/archive/value_archive.h"
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/operation/managed.h"
+#include "mmdeploy/preprocess/elena/elena_registry.h"
+#include "mmdeploy/preprocess/transform/tracer.h"
+#include "mmdeploy/preprocess/transform/transform.h"
+
+namespace mmdeploy::transform {
+
+using namespace trace;
+using namespace elena;
+
+struct ExtractTransParamVisitor {  // Visitor that copies the parameters of traced transforms into flat fields.
+  bool valid{true};  // cleared by CheckValid() when the same transform name is recorded twice
+  std::set<std::string> st;  // transform names already recorded by CheckValid()
+
+  std::array<float, 3> mean;  // filled from NormParam::mean
+  std::array<float, 3> std;  // filled from NormParam::std
+  std::array<int, 2> resize_hw;  // {ResizeParam::size[0], ResizeParam::size[1]}
+  std::string resize_mode;  // copy of ResizeParam::mode
+  float pad_val;  // copy of PadParam::pad_val
+  std::array<int, 4> pad_tlbr;  // first 4 entries of PadParam::tlbr
+  std::array<int, 2> pad_hw;  // first 2 entries of PadParam::size
+  std::array<int, 4> crop_tlbr;  // first 4 entries of CropParam::tlbr
+  std::array<int, 2> crop_hw;  // first 2 entries of CropParam::size
+
+  void CheckValid(const std::string& name) {  // records `name`; a repeated name marks the visitor invalid
+    if (st.count(name)) {
+      valid = false;  // second occurrence of the same transform name
+      return;
+    }
+    st.insert(name);
+  }
+
+  void operator()(CvtColorParam&) {}  // no parameters are extracted for this alternative
+  void operator()(CastParam&) {}  // no parameters are extracted for this alternative
+  void operator()(HWC2CHWParam&) {}  // no parameters are extracted for this alternative
+
+  void operator()(ResizeParam& param) {  // records target size and interpolation mode
+    CheckValid("Resize");
+    resize_hw = {param.size[0], param.size[1]};
+    resize_mode = param.mode;
+  }
+  void operator()(PadParam& param) {  // records pad value, tlbr offsets and size
+    CheckValid("Pad");
+    pad_val = param.pad_val;
+    std::copy_n(param.tlbr.begin(), 4, pad_tlbr.begin());
+    std::copy_n(param.size.begin(), 2, pad_hw.begin());
+  }
+  void operator()(NormParam& param) {  // records mean / std
+    CheckValid("Normalize");
+    std::copy(param.mean.begin(), param.mean.end(), mean.begin());  // NOTE(review): copies the whole range into a 3-element array; assumes param.mean has at most 3 entries - TODO confirm
+    std::copy(param.std.begin(), param.std.end(), std.begin());  // NOTE(review): same assumption for param.std - TODO confirm
+  }
+  void operator()(CropParam& param) {  // records crop tlbr offsets and size
+    CheckValid("CenterCrop");  // crop parameters are tracked under the name "CenterCrop"
+    std::copy_n(param.tlbr.begin(), 4, crop_tlbr.begin());
+    std::copy_n(param.size.begin(), 2, crop_hw.begin());
+  }
+};
+
+class Fused : public Transform {  // Transform that calls a fuse function looked up in FuseKernel for each image field.
+ public:
+  explicit Fused(const Value& args) {  // reads args["hash_code"]; throws via throw_exception(eNotSupported) when no function matches the tag
+    device_ = operation::gContext().device();
+    tag_ = args["hash_code"].get<std::string>();
+    tag_.append("_").append(GetPlatformName(device_));  // tag format: "<hash_code>_<platform name>"
+    func_ = FuseKernel::Get().GetFunc(tag_);
+    if (!func_) {
+      MMDEPLOY_ERROR("can't find fuse function with tag: {}", tag_);
+      throw_exception(eNotSupported);
+    }
+  }
+
+  struct Context {  // scope guard around the global context's `use_dummy` flag
+    Context() { operation::gContext().set_use_dummy(false); }
+    ~Context() { operation::gContext().set_use_dummy(true); }  // NOTE(review): sets true unconditionally rather than restoring the previous value - confirm this is intended
+  };
+
+  Result<void> Apply(Value& data) override {  // replaces every image field in `data`; returns eNotSupported on an invalid trace or a non-kINT8 source
+    auto tracer = data["__tracer__"].get<Tracer>();  // copy of the tracer stored under "__tracer__"
+    Mat _src_mat = data["ori_img"].get<Mat>();  // source image stored under "ori_img"
+
+    auto& stream = operation::gContext().stream();
+
+    // Create a scope in which `use_dummy` is false
+    Context context;
+    OUTCOME_TRY(auto src_mat, operation::Secure(_src_mat, device_, stream));  // presumably makes the Mat accessible on device_ - confirm against operation::Secure
+
+    ExtractTransParamVisitor visitor{};
+    for (auto&& trans : tracer.trans_) {
+      std::visit(visitor, trans);  // collect parameters from each traced transform
+    }
+
+    if (!visitor.valid) {  // a transform name was recorded more than once
+      MMDEPLOY_ERROR("unsupported fuse transform");
+      return Status(eNotSupported);
+    }
+    if (src_mat.type() != DataType::kINT8) {  // NOTE(review): this check runs after Secure(); it could be done earlier - confirm
+      MMDEPLOY_ERROR("unsupported data type in fuse transform");
+      return Status(eNotSupported);
+    }
+
+    auto img_fields = GetImageFields(data);
+    for (auto& key : img_fields) {
+      assert(data.contains(key));
+      auto src_tensor = data[key].get<Tensor>();  // existing tensor for this field; only its descriptor is used below
+      auto desc = src_tensor.desc();
+      desc.device = device_;  // same descriptor, retargeted to this transform's device
+      Tensor dst_tensor{desc};
+
+      func_(stream.GetNative(), src_mat.data<uint8_t>(), src_mat.height(), src_mat.width(),  // source image: native stream, data pointer, height, width
+            to_string(src_mat.pixel_format()).c_str(), visitor.resize_hw[0], visitor.resize_hw[1],
+            visitor.resize_mode.c_str(), visitor.crop_tlbr[0], visitor.crop_tlbr[1],
+            visitor.crop_hw[0], visitor.crop_hw[1], visitor.mean[0], visitor.mean[1],
+            visitor.mean[2], visitor.std[0], visitor.std[1], visitor.std[2], visitor.pad_tlbr[0],
+            visitor.pad_tlbr[1], visitor.pad_tlbr[2], visitor.pad_tlbr[3], visitor.pad_hw[0],
+            visitor.pad_hw[1], visitor.pad_val, dst_tensor.data<float>(), dst_tensor.shape(2),  // output is written through a float pointer; assumes the descriptor's data type is float - TODO confirm
+            dst_tensor.shape(3));
+      operation::gContext().Track(dst_tensor);  // register the new tensor with the global context before it is moved
+      data[key] = std::move(dst_tensor);
+    }
+    return success();
+  }
+
+ private:
+  Device device_;  // device taken from operation::gContext() at construction
+  std::string tag_;  // "<hash_code>_<platform name>" key used for the FuseKernel lookup
+  FuseFunc func_;  // fuse function resolved in the constructor; non-empty after successful construction
+};
+
+MMDEPLOY_REGISTER_TRANSFORM(Fused);
+
+}  // namespace mmdeploy::transform
--- a/csrc/mmdeploy/preprocess/elena/image2tensor_impl.cpp
+++ b/csrc/mmdeploy/preprocess/elena/image2tensor_impl.cpp
@ -1,28 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/preprocess/transform/image2tensor.h"
-
-namespace mmdeploy::elena {
-
-class ImageToTensorImpl : public ::mmdeploy::ImageToTensorImpl {
- public:
-  explicit ImageToTensorImpl(const Value& args) : ::mmdeploy::ImageToTensorImpl(args) {}
-
- protected:
-  Result<Tensor> HWC2CHW(const Tensor& tensor) override {
-    auto& src_desc = tensor.desc();
-    auto data_type = src_desc.data_type;
-    auto shape = src_desc.shape;
-    shape = {shape[0], shape[3], shape[1], shape[2]};
-
-    TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
-    Tensor dummy(dummy_desc, dummy_buffer_);
-
-    return dummy;
-  }
-  Buffer dummy_buffer_{Device{"cpu"}, 0, nullptr};
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::ImageToTensorImpl, (elena, 0), ImageToTensorImpl);
-
-}  // namespace mmdeploy::elena
--- a/csrc/mmdeploy/preprocess/elena/load_impl.cpp
+++ b/csrc/mmdeploy/preprocess/elena/load_impl.cpp
@ -1,57 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/preprocess/transform/load.h"
-
-using namespace std;
-
-namespace mmdeploy::elena {
-
-class PrepareImageImpl : public ::mmdeploy::PrepareImageImpl {
- public:
-  explicit PrepareImageImpl(const Value& args) : ::mmdeploy::PrepareImageImpl(args){};
-  ~PrepareImageImpl() override = default;
-
- protected:
-  Result<Tensor> ConvertToBGR(const Mat& img) override {
-    auto data_type = img.type();
-    auto format = img.pixel_format();
-    TensorShape shape = {1, img.height(), img.width(), 3};
-
-    if (format == PixelFormat::kNV12 || format == PixelFormat::kNV21) {
-      shape[1] = shape[1] / 3 * 2;
-    }
-
-    if (arg_.to_float32) {
-      data_type = DataType::kFLOAT;
-    }
-
-    TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
-    Tensor dummy(dummy_desc, dummy_buffer_);
-
-    return dummy;
-  }
-
-  Result<Tensor> ConvertToGray(const Mat& img) override {
-    auto data_type = img.type();
-    auto format = img.pixel_format();
-    TensorShape shape = {1, img.height(), img.width(), 1};
-
-    if (format == PixelFormat::kNV12 || format == PixelFormat::kNV21) {
-      shape[1] = shape[1] / 3 * 2;
-    }
-
-    if (arg_.to_float32) {
-      data_type = DataType::kFLOAT;
-    }
-
-    TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
-    Tensor dummy(dummy_desc, dummy_buffer_);
-
-    return dummy;
-  }
-  Buffer dummy_buffer_{Device{"cpu"}, 0, nullptr};
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::PrepareImageImpl, (elena, 0), PrepareImageImpl);
-
-}  // namespace mmdeploy::elena
--- a/csrc/mmdeploy/preprocess/elena/normalize_impl.cpp
+++ b/csrc/mmdeploy/preprocess/elena/normalize_impl.cpp
@ -1,30 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/preprocess/transform/normalize.h"
-
-using namespace std;
-
-namespace mmdeploy::elena {
-
-class NormalizeImpl : public ::mmdeploy::NormalizeImpl {
- public:
-  NormalizeImpl(const Value& value) : ::mmdeploy::NormalizeImpl(value){};
-  ~NormalizeImpl() = default;
-
- protected:
-  Result<Tensor> NormalizeImage(const Tensor& tensor) override {
-    auto& src_desc = tensor.desc();
-    auto data_type = DataType::kFLOAT;
-    auto shape = src_desc.shape;
-
-    TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
-    Tensor dummy(dummy_desc, dummy_buffer_);
-
-    return dummy;
-  }
-  Buffer dummy_buffer_{Device{"cpu"}, 0, nullptr};
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::NormalizeImpl, (elena, 0), NormalizeImpl);
-
-}  // namespace mmdeploy::elena
--- a/csrc/mmdeploy/preprocess/elena/pad_impl.cpp
+++ b/csrc/mmdeploy/preprocess/elena/pad_impl.cpp
@ -1,31 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/preprocess/transform/pad.h"
-
-using namespace std;
-
-namespace mmdeploy::elena {
-
-class PadImpl : public ::mmdeploy::PadImpl {
- public:
-  PadImpl(const Value& args) : ::mmdeploy::PadImpl(args) {}
-
- protected:
-  Result<Tensor> PadImage(const Tensor& img, const std::array<int, 4>& padding) override {
-    auto& src_desc = img.desc();
-    auto data_type = src_desc.data_type;
-    auto shape = src_desc.shape;  // 1 x h x w x c
-    shape[1] += padding[1] + padding[3];
-    shape[2] += padding[0] + padding[2];
-
-    TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
-    Tensor dummy(dummy_desc, dummy_buffer_);
-
-    return dummy;
-  }
-  Buffer dummy_buffer_{Device{"cpu"}, 0, nullptr};
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::PadImpl, (elena, 0), PadImpl);
-
-}  // namespace mmdeploy::elena
--- a/csrc/mmdeploy/preprocess/elena/resize_impl.cpp
+++ b/csrc/mmdeploy/preprocess/elena/resize_impl.cpp
@ -1,30 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "mmdeploy/preprocess/transform/resize.h"
-
-using namespace std;
-
-namespace mmdeploy::elena {
-
-class ResizeImpl final : public ::mmdeploy::ResizeImpl {
- public:
-  ResizeImpl(const Value& args) : ::mmdeploy::ResizeImpl(args) {}
-  ~ResizeImpl() = default;
-
- protected:
-  Result<Tensor> ResizeImage(const Tensor& img, int dst_h, int dst_w) override {
-    auto& src_desc = img.desc();
-    auto data_type = src_desc.data_type;
-    TensorShape shape = {1, dst_h, dst_w, img.shape().back()};
-
-    TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
-    Tensor dummy(dummy_desc, dummy_buffer_);
-
-    return dummy;
-  }
-  Buffer dummy_buffer_{Device{"cpu"}, 0, nullptr};
-};
-
-MMDEPLOY_REGISTER_TRANSFORM_IMPL(::mmdeploy::ResizeImpl, (elena, 0), ResizeImpl);
-
-}  // namespace mmdeploy::elena
--- a/csrc/mmdeploy/preprocess/transform/CMakeLists.txt
+++ b/csrc/mmdeploy/preprocess/transform/CMakeLists.txt
@ -5,7 +5,7 @@ project(mmdeploy_transform)
 set(SRCS
        collect.cpp
        compose.cpp
-        crop.cpp
+        center_crop.cpp
        three_crop.cpp
        ten_crop.cpp
        image2tensor.cpp
@ -18,6 +18,7 @@ set(SRCS
        tracer.cpp
        lift.cpp)
 mmdeploy_add_module(${PROJECT_NAME} LIBRARY "${SRCS}")
+target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy_operation)
 target_include_directories(
        ${PROJECT_NAME} PUBLIC $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/preprocess>)
 add_library(mmdeploy::transform ALIAS ${PROJECT_NAME})
--- a/csrc/mmdeploy/preprocess/transform/center_crop.cpp
+++ b/csrc/mmdeploy/preprocess/transform/center_crop.cpp
@ -0,0 +1,93 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include <array>
+
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/operation/managed.h"
+#include "mmdeploy/operation/vision.h"
+#include "mmdeploy/preprocess/transform/tracer.h"
+#include "mmdeploy/preprocess/transform/transform.h"
+
+using namespace std;
+
+namespace mmdeploy::transform {
+
+/**
+ * Crops every image field of the pipeline data around its center.
+ *
+ * Config: "crop_size" (required) is either a single integer, used for both height and width,
+ * or an array of two integers read as {height, width}.
+ *
+ * Fields written by Apply: each image field (replaced by the cropped tensor), "img_shape"
+ * and "offset" (two values are appended per image field).
+ */
+class CenterCrop : public Transform {
+ public:
+  /// Parses "crop_size" and creates the crop operation; throws eInvalidArgument on a bad config.
+  explicit CenterCrop(const Value& args) {
+    if (!args.contains("crop_size")) {
+      MMDEPLOY_ERROR("'crop_size' is expected");
+      throw_exception(eInvalidArgument);
+    }
+    const auto& crop_size = args["crop_size"];
+    if (crop_size.is_number_integer()) {
+      crop_size_[0] = crop_size_[1] = crop_size.get<int>();
+    } else if (crop_size.is_array() && crop_size.size() == 2) {
+      crop_size_[0] = crop_size[0].get<int>();
+      crop_size_[1] = crop_size[1].get<int>();
+    } else {
+      MMDEPLOY_ERROR("'crop_size' should be integer or an int array of size 2");
+      throw_exception(eInvalidArgument);
+    }
+
+    crop_ = operation::Managed<operation::Crop>::Create();
+  }
+
+  /**
+   * Crops each image field in place and updates the related meta fields.
+   * Image tensors are read with shape[1] as height and shape[2] as width.
+   *
+   * @return success; the error of the crop operation; or eInvalidArgument when "scale_factor"
+   *         is present but is not an array with at least two elements.
+   */
+  Result<void> Apply(Value& data) override {
+    MMDEPLOY_DEBUG("input: {}", data);
+    auto img_fields = GetImageFields(data);
+
+    for (auto& key : img_fields) {
+      // Checked at runtime rather than with `assert`, so release builds reject a malformed
+      // "scale_factor" instead of indexing past its end. The check runs before anything is
+      // written, and the loop never modifies "scale_factor", so `data` is untouched on error.
+      const bool has_scale = data.contains("scale_factor");
+      if (has_scale && !(data["scale_factor"].is_array() && data["scale_factor"].size() >= 2)) {
+        MMDEPLOY_ERROR("'scale_factor' should be an array with at least 2 elements");
+        return Status(eInvalidArgument);
+      }
+
+      auto tensor = data[key].get<Tensor>();
+      auto desc = tensor.desc();
+      int h = desc.shape[1];
+      int w = desc.shape[2];
+      int crop_height = crop_size_[0];
+      int crop_width = crop_size_[1];
+
+      // (y1, x1): top-left corner of the centered window, clamped to 0 when the crop is larger
+      // than the image. (y2, x2): index of the last row / column inside the window, clamped to
+      // the image border.
+      int y1 = std::max(0, static_cast<int>(std::round((h - crop_height) / 2.0)));
+      int x1 = std::max(0, static_cast<int>(std::round((w - crop_width) / 2.0)));
+      int y2 = std::min(h, y1 + crop_height) - 1;
+      int x2 = std::min(w, x1 + crop_width) - 1;
+
+      Tensor dst_tensor;
+      OUTCOME_TRY(crop_.Apply(tensor, dst_tensor, y1, x1, y2, x2));
+
+      auto& shape = dst_tensor.desc().shape;
+      const int out_h = static_cast<int>(shape[1]);
+      const int out_w = static_cast<int>(shape[2]);
+
+      // trace static info & runtime args: the margins cut off at top, left, bottom and right,
+      // followed by the output height and width
+      if (data.contains("__tracer__")) {
+        data["__tracer__"].get_ref<Tracer&>().CenterCrop(
+            {y1, x1, h - out_h - y1, w - out_w - x1}, {out_h, out_w}, tensor.data_type());
+      }
+
+      data["img_shape"] = {shape[0], shape[1], shape[2], shape[3]};
+      if (has_scale) {
+        // image has been processed by `Resize` transform before.
+        // Compute cropped image's offset against the original image
+        float w_scale = data["scale_factor"][0].get<float>();
+        float h_scale = data["scale_factor"][1].get<float>();
+        data["offset"].push_back(x1 / w_scale);
+        data["offset"].push_back(y1 / h_scale);
+      } else {
+        data["offset"].push_back(x1);
+        data["offset"].push_back(y1);
+      }
+
+      data[key] = std::move(dst_tensor);
+    }
+
+    MMDEPLOY_DEBUG("output: {}", data);
+    return success();
+  }
+
+ private:
+  operation::Managed<operation::Crop> crop_;  // crop operation created in the constructor
+  std::array<int, 2> crop_size_{};            // {height, width} of the requested crop
+};
+
+MMDEPLOY_REGISTER_TRANSFORM(CenterCrop);
+
+}  // namespace mmdeploy::transform
--- a/csrc/mmdeploy/preprocess/transform/collect.cpp
+++ b/csrc/mmdeploy/preprocess/transform/collect.cpp
@ -1,76 +1,69 @@
 // Copyright (c) OpenMMLab. All rights reserved.

-#include "collect.h"
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/preprocess/transform/transform.h"

-#include "mmdeploy/archive/json_archive.h"
-#include "mmdeploy/core/logger.h"
+namespace mmdeploy::transform {

-namespace mmdeploy {
-
-CollectImpl::CollectImpl(const Value &args) : TransformImpl(args) {
-  if (!args.contains("keys") || !args["keys"].is_array()) {
-    throw std::invalid_argument("'keys' is missed in arguments, or it is not an array as expected");
-  }
-  if (args.contains("meta_keys") && !args["meta_keys"].is_array()) {
-    throw std::invalid_argument("'meta_keys' has to be an array");
-  }
-
-  for (auto &v : args["keys"]) {
-    arg_.keys.push_back(v.get<std::string>());
-  }
-  if (args.contains("meta_keys")) {
-    for (auto &v : args["meta_keys"]) {
-      arg_.meta_keys.push_back(v.get<std::string>());
+/**
+ * Transform that replaces the pipeline data with a selected subset of its fields.
+ *
+ * Config:
+ *  - "keys" (required, array of strings): fields that must exist in the data; each one is
+ *    copied to the output under the same name.
+ *  - "meta_keys" (optional, array of strings): fields that are copied under "img_metas" when
+ *    they exist; missing ones are silently ignored.
+ */
+class Collect : public Transform {
+ public:
+  /// Validates and stores "keys" / "meta_keys"; throws eInvalidArgument on a malformed config.
+  explicit Collect(const Value& args) {
+    if (!args.contains("keys") || !args["keys"].is_array()) {
+      MMDEPLOY_ERROR("'keys' is missed in arguments, or it is not an array as expected");
+      throw_exception(eInvalidArgument);
     }
-  }
-}
-
-Result<Value> CollectImpl::Process(const Value &input) {
-  MMDEPLOY_DEBUG("input: {}", to_json(input).dump(2));
-  Value output;
-
-  // collect 'ori_img' and 'attribute' from `input`, because those two fields
-  // are given by users, not generated by transform ops
-  if (input.contains("ori_img")) {
-    output["ori_img"] = input["ori_img"];
-  }
-  if (input.contains("attribute")) {
-    output["attribute"] = input["attribute"];
-  }
-
-  for (auto &meta_key : arg_.meta_keys) {
-    if (input.contains(meta_key)) {
-      output["img_metas"][meta_key] = input[meta_key];
+    if (args.contains("meta_keys") && !args["meta_keys"].is_array()) {
+      MMDEPLOY_ERROR("'meta_keys' has to be an array");
+      throw_exception(eInvalidArgument);
     }
-  }
-  for (auto &key : arg_.keys) {
-    if (!input.contains(key)) {
-      MMDEPLOY_INFO("missed key '{}' in input", key);
-      return Status(eInvalidArgument);
-    } else {
-      output[key] = input[key];
+
+    for (auto& v : args["keys"]) {
+      keys_.push_back(v.get<std::string>());
+    }
+    if (args.contains("meta_keys")) {
+      for (auto& v : args["meta_keys"]) {
+        meta_keys_.push_back(v.get<std::string>());
+      }
     }
   }
 
-  MMDEPLOY_DEBUG("output: {}", to_json(output).dump(2));
-  return output;
-}
+  /**
+   * Builds a new object from the selected fields and assigns it to `data`, so every field
+   * that was not collected is dropped.
+   *
+   * @return success, or eInvalidArgument when one of the required "keys" is missing; in that
+   *         case `data` is left unmodified because the assignment happens last.
+   */
+  Result<void> Apply(Value& data) override {
+    MMDEPLOY_DEBUG("input: {}", data);
+    Value::Object output;
 
-Collect::Collect(const Value &args, int version) : Transform(args) {
-  auto impl_creator = gRegistry<CollectImpl>().Get(specified_platform_, version);
-  if (nullptr == impl_creator) {
-    MMDEPLOY_ERROR("'Collect' is not supported on '{}' platform", specified_platform_);
-    throw_exception(eEntryNotFound);
+    // collect 'ori_img' and 'attribute' from `data`, because those two fields
+    // are given by users, not generated by transform ops
+    if (data.contains("ori_img")) {
+      output["ori_img"] = data["ori_img"];
+    }
+    if (data.contains("attribute")) {
+      output["attribute"] = data["attribute"];
+    }
+
+    // meta keys are optional: copy the ones that exist, skip the rest
+    for (auto& meta_key : meta_keys_) {
+      if (data.contains(meta_key)) {
+        output["img_metas"][meta_key] = data[meta_key];
+      }
+    }
+    // regular keys are mandatory: a missing one aborts the transform
+    for (auto& key : keys_) {
+      if (!data.contains(key)) {
+        MMDEPLOY_INFO("missed key '{}' in input", key);
+        return Status(eInvalidArgument);
+      } else {
+        output[key] = data[key];
+      }
+    }
+
+    data = std::move(output);
+    MMDEPLOY_DEBUG("output: {}", data);
+    return success();
   }
-  impl_ = impl_creator->Create(args);
-}
 
-Result<Value> Collect::Process(const Value &input) { return impl_->Process(input); }
+ private:
+  std::vector<std::string> keys_;       // required fields, from config "keys"
+  std::vector<std::string> meta_keys_;  // optional fields stored under "img_metas"
+};

-MMDEPLOY_REGISTER_FACTORY_FUNC(Transform, (Collect, 0), [](const Value &config) {
-  return std::make_unique<Collect>(config, 0);
-});
+MMDEPLOY_REGISTER_TRANSFORM(Collect);

-MMDEPLOY_DEFINE_REGISTRY(CollectImpl);
-
-}  // namespace mmdeploy
+}  // namespace mmdeploy::transform
--- a/csrc/mmdeploy/preprocess/transform/collect.h
+++ b/csrc/mmdeploy/preprocess/transform/collect.h
@ -1,42 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#ifndef MMDEPLOY_COLLECT_H
-#define MMDEPLOY_COLLECT_H
-
-#include "transform.h"
-namespace mmdeploy {
-
-class MMDEPLOY_API CollectImpl : public TransformImpl {
- public:
-  explicit CollectImpl(const Value& args);
-  ~CollectImpl() override = default;
-
-  Result<Value> Process(const Value& input) override;
-
- protected:
-  struct collect_arg_t {
-    std::vector<std::string> keys;
-    std::vector<std::string> meta_keys;
-  };
-  using ArgType = collect_arg_t;
-
- protected:
-  ArgType arg_;
-};
-
-class MMDEPLOY_API Collect : public Transform {
- public:
-  explicit Collect(const Value& args, int version = 0);
-  ~Collect() override = default;
-
-  Result<Value> Process(const Value& input) override;
-
- private:
-  std::unique_ptr<CollectImpl> impl_;
-};
-
-MMDEPLOY_DECLARE_REGISTRY(CollectImpl, std::unique_ptr<CollectImpl>(const Value& config));
-
-}  // namespace mmdeploy
-
-#endif  // MMDEPLOY_COLLECT_H
--- a/csrc/mmdeploy/preprocess/transform/compose.cpp
+++ b/csrc/mmdeploy/preprocess/transform/compose.cpp
@ -1,94 +1,98 @@
 // Copyright (c) OpenMMLab. All rights reserved.

-#include "compose.h"
-
-#include "mmdeploy/archive/json_archive.h"
 #include "mmdeploy/archive/value_archive.h"
+#include "mmdeploy/core/profiler.h"
 #include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/preprocess/transform/transform.h"

-namespace mmdeploy {
+namespace mmdeploy::transform {

-void SaveIntermediates(Value& value, Value::Array& intermediates) {
-  if (value.is_array()) {
-    for (auto& inner : value) {
-      if (auto it = inner.find("__data__"); it != inner.end()) {
-        std::move(it->begin(), it->end(), std::back_inserter(intermediates));
-        it->array().clear();
+/**
+ * Transform that owns an ordered list of child transforms and applies them one after another
+ * to the same data.
+ *
+ * Config: "context" (must provide "device" and "stream", may provide a profiler "scope"),
+ * "transforms" (array of child configs, each selected by its "type"), and the optional
+ * "fuse_transform" / "sha256" pair handled by EnableTransformFusion.
+ */
+class Compose : public Transform {
+ public:
+  /**
+   * Creates every child transform through the Transform registry.
+   * Throws eEntryNotFound when a child "type" is not registered and eFail when a creator
+   * returns a null transform.
+   */
+  explicit Compose(const Value& args) {
+    assert(args.contains("context"));
+
+    Value context;
+    context = args["context"];
+    context["device"].get_to(device_);
+    context["stream"].get_to(stream_);
+
+    // When profiling is enabled, nest a "Compose" scope under the parent scope and hand it to
+    // the children through the context copy.
+    if (auto parent = context.value<profiler::Scope*>("scope", nullptr)) {
+      scope_ = parent->CreateScope("Compose");
+      context["scope"] = scope_;
+    }
+
+    // `auto` makes a local copy of the list, so the insertion done by EnableTransformFusion
+    // does not touch `args`.
+    auto transforms = args["transforms"].array();
+    // NOTE(review): `ctx` is not referenced again; presumably it installs the device/stream as
+    // the active operation context while the children are created -- confirm against
+    // operation::Context.
+    operation::Context ctx(device_, stream_);
+
+    EnableTransformFusion(args, transforms);
+
+    // `cfg` is taken by value on purpose: each child gets its own config with "context" added.
+    for (auto cfg : transforms) {
+      cfg["context"] = context;
+      auto type = cfg.value("type", std::string{});
+      MMDEPLOY_DEBUG("creating transform: {} with cfg: {}", type, cfg);
+      auto creator = gRegistry<Transform>().Get(type);
+      if (!creator) {
+        MMDEPLOY_ERROR("Unable to find Transform creator: {}. Available transforms: {}", type,
+                       gRegistry<Transform>().List());
+        throw_exception(eEntryNotFound);
+      }
+      auto transform = creator->Create(cfg);
+      if (!transform) {
+        MMDEPLOY_ERROR("Failed to create transform: {}, config: {}", type, cfg);
+        throw_exception(eFail);
+      }
+      transforms_.push_back(std::move(transform));
+      // transform_scopes_ is filled only when profiling, and then stays index-aligned with
+      // transforms_.
+      if (scope_) {
+        transform_scopes_.push_back(scope_->CreateScope(type));
       }
     }
-  } else if (value.is_object()) {
-    if (auto it = value.find("__data__"); it != value.end()) {
-      std::move(it->begin(), it->end(), std::back_inserter(intermediates));
-      it->array().clear();
-    }
   }
-}
 
-Compose::Compose(const Value& args, int version) : Transform(args) {
-  assert(args.contains("context"));
-
-  Value context;
-  context = args["context"];
-  context["stream"].get_to(stream_);
-  bool fuse_transform = args.value("fuse_transform", false);
-  if (fuse_transform) {
-    std::string sha256 = args.value("sha256", std::string(""));
-    context["fuse_transform"] = true;
-    context["sha256"] = sha256;
-  }
-  if (context.contains("scope")) {
-    auto scope = context["scope"].get<profiler::Scope*>();
-    scope_ = scope->CreateScope("Compose");
-  }
-  for (auto cfg : args["transforms"]) {
-    cfg["context"] = context;
-    auto type = cfg.value("type", std::string{});
-    MMDEPLOY_DEBUG("creating transform: {} with cfg: {}", type, mmdeploy::to_json(cfg).dump(2));
-    auto creator = gRegistry<Transform>().Get(type, version);
-    if (!creator) {
-      MMDEPLOY_ERROR("Unable to find Transform creator: {}. Available transforms: {}", type,
-                     gRegistry<Transform>().List());
-      throw_exception(eEntryNotFound);
+  /**
+   * Applies the child transforms in order, stopping at the first one that fails.
+   * When a profiler scope is set, each child is timed and the stream is waited on after it.
+   * Without a scope this function never waits on the stream.
+   * NOTE(review): the previous implementation always waited once after the loop; presumably
+   * the caller synchronizes now -- confirm.
+   */
+  Result<void> Apply(Value& data) override {
+    profiler::ScopedCounter counter(scope_);
+    operation::Context context(device_, stream_);
+    // Fusion is active (a hash code was configured): switch the context to dummy mode.
+    // NOTE(review): presumably the individual operations then skip their real work and the
+    // "Fused" transform does it instead -- confirm.
+    if (!hash_code_.empty()) {
+      context.set_use_dummy(true);
     }
-    if (scope_) {
-      auto scope = scope_->CreateScope(type);
-      if (type == "Lift") {
-        cfg["context"]["scope"] = scope;
-        transform_scopes_.push_back(nullptr);
-      } else {
-        transform_scopes_.push_back(scope);
+    for (size_t i = 0; i < transforms_.size(); ++i) {
+      // Left empty when not profiling, because transform_scopes_ has no entries then.
+      std::optional<profiler::ScopedCounter> child_counter;
+      if (scope_) {
+        child_counter.emplace(transform_scopes_[i]);
+      }
+      OUTCOME_TRY(transforms_[i]->Apply(data));
+      if (scope_) {
+        OUTCOME_TRY(stream_.Wait());
       }
-    } else {
-      transform_scopes_.push_back(nullptr);
     }
-    auto transform = creator->Create(cfg);
-    if (!transform) {
-      MMDEPLOY_ERROR("Failed to create transform: {}, config: {}", type, cfg);
-      throw_exception(eFail);
-    }
-    transforms_.push_back(std::move(transform));
+    return success();
   }
-}
 
-Result<Value> Compose::Process(const Value& input) {
-  Value output = input;
-  Value::Array intermediates;
-  int idx = 0;
-  for (auto& transform : transforms_) {
-    profiler::ScopedCounter counter(transform_scopes_[idx++]);
-    OUTCOME_TRY(auto t, transform->Process(output));
-    SaveIntermediates(t, intermediates);
-    output = std::move(t);
-    if (transform_scopes_[idx - 1]) {
-      OUTCOME_TRY(stream_.Wait());
+ private:
+  /**
+   * Enables transform fusion when "fuse_transform" is true and "sha256" is non-empty:
+   * stores the hash, turns on dummy mode in the global operation context and inserts a
+   * {"type": "Fused", "hash_code": ...} entry into `transforms` right before the first
+   * "Collect" entry, or at the end when there is none.
+   */
+  void EnableTransformFusion(const Value& args, Value::Array& transforms) {
+    if (args.value("fuse_transform", false)) {
+      hash_code_ = args.value("sha256", hash_code_);
+      if (!hash_code_.empty()) {
+        operation::gContext().set_use_dummy(true);
+        auto it = transforms.begin();
+        for (; it != transforms.end(); ++it) {
+          if (it->value<std::string>("type", {}) == "Collect") {
+            break;
+          }
+        }
+        transforms.insert(it, Value::Object{{"type", "Fused"}, {"hash_code", hash_code_}});
+      }
     }
   }
-  OUTCOME_TRY(stream_.Wait());
-  return std::move(output);
-}
 
-MMDEPLOY_REGISTER_FACTORY_FUNC(Transform, (Compose, 0), [](const Value& config) {
-  return std::make_unique<Compose>(config, 0);
-});
+  std::vector<std::unique_ptr<Transform>> transforms_;  // children, in execution order
+  Device device_;                                       // from context "device"
+  Stream stream_;                                       // from context "stream"
+  std::vector<profiler::Scope*> transform_scopes_;      // per-child scopes; empty w/o profiling
+  profiler::Scope* scope_{};                            // "Compose" scope; null w/o profiling
+  std::string hash_code_;                               // config "sha256"; non-empty => fusion
+};

-}  // namespace mmdeploy
+MMDEPLOY_REGISTER_TRANSFORM(Compose);
+
+}  // namespace mmdeploy::transform
--- a/csrc/mmdeploy/preprocess/transform/compose.h
+++ b/csrc/mmdeploy/preprocess/transform/compose.h
@ -1,27 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#ifndef MMDEPLOY_SRC_PREPROCESS_TRANSFORM_COMPOSE_H_
-#define MMDEPLOY_SRC_PREPROCESS_TRANSFORM_COMPOSE_H_
-
-#include "mmdeploy/core/profiler.h"
-#include "transform.h"
-
-namespace mmdeploy {
-
-class MMDEPLOY_API Compose : public Transform {
- public:
-  explicit Compose(const Value& args, int version = 0);
-  ~Compose() override = default;
-
-  Result<Value> Process(const Value& input) override;
-
- private:
-  std::vector<std::unique_ptr<Transform>> transforms_;
-  Stream stream_;
-  std::vector<profiler::Scope*> transform_scopes_;
-  profiler::Scope* scope_{nullptr};
-};
-
-}  // namespace mmdeploy
-
-#endif  // MMDEPLOY_SRC_PREPROCESS_TRANSFORM_COMPOSE_H_
--- a/csrc/mmdeploy/preprocess/transform/crop.cpp
+++ b/csrc/mmdeploy/preprocess/transform/crop.cpp
@ -1,93 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "crop.h"
-
-#include "mmdeploy/archive/json_archive.h"
-#include "mmdeploy/preprocess/transform/tracer.h"
-
-using namespace std;
-
-namespace mmdeploy {
-
-CenterCropImpl::CenterCropImpl(const Value& args) : TransformImpl(args) {
-  if (!args.contains(("crop_size"))) {
-    throw std::invalid_argument("'crop_size' is expected");
-  }
-  if (args["crop_size"].is_number_integer()) {
-    int crop_size = args["crop_size"].get<int>();
-    arg_.crop_size[0] = arg_.crop_size[1] = crop_size;
-  } else if (args["crop_size"].is_array() && args["crop_size"].size() == 2) {
-    arg_.crop_size[0] = args["crop_size"][0].get<int>();
-    arg_.crop_size[1] = args["crop_size"][1].get<int>();
-  } else {
-    throw std::invalid_argument("'crop_size' should be integer or an int array of size 2");
-  }
-}
-
-Result<Value> CenterCropImpl::Process(const Value& input) {
-  MMDEPLOY_DEBUG("input: {}", to_json(input).dump(2));
-  auto img_fields = GetImageFields(input);
-
-  // copy input data, and update its properties
-  Value output = input;
-
-  for (auto& key : img_fields) {
-    auto tensor = input[key].get<Tensor>();
-    auto desc = tensor.desc();
-    int h = desc.shape[1];
-    int w = desc.shape[2];
-    int crop_height = arg_.crop_size[0];
-    int crop_width = arg_.crop_size[1];
-
-    int y1 = std::max(0, int(std::round((h - crop_height) / 2.0)));
-    int x1 = std::max(0, int(std::round((w - crop_width) / 2.0)));
-    int y2 = std::min(h, y1 + crop_height) - 1;
-    int x2 = std::min(w, x1 + crop_width) - 1;
-
-    OUTCOME_TRY(auto dst_tensor, CropImage(tensor, y1, x1, y2, x2));
-
-    auto& shape = dst_tensor.desc().shape;
-
-    // trace static info & runtime args
-    if (output.contains("__tracer__")) {
-      output["__tracer__"].get_ref<Tracer&>().CenterCrop(
-          {y1, x1, h - (int)shape[1] - y1, w - (int)shape[2] - x1}, {(int)shape[1], (int)shape[2]},
-          tensor.data_type());
-    }
-
-    output["img_shape"] = {shape[0], shape[1], shape[2], shape[3]};
-    if (input.contains("scale_factor")) {
-      // image has been processed by `Resize` transform before.
-      // Compute cropped image's offset against the original image
-      assert(input["scale_factor"].is_array() && input["scale_factor"].size() >= 2);
-      float w_scale = input["scale_factor"][0].get<float>();
-      float h_scale = input["scale_factor"][1].get<float>();
-      output["offset"].push_back(x1 / w_scale);
-      output["offset"].push_back(y1 / h_scale);
-    } else {
-      output["offset"].push_back(x1);
-      output["offset"].push_back(y1);
-    }
-
-    SetTransformData(output, key, std::move(dst_tensor));
-  }
-
-  MMDEPLOY_DEBUG("output: {}", to_json(output).dump(2));
-  return output;
-}
-
-CenterCrop::CenterCrop(const Value& args, int version) : Transform(args) {
-  auto impl_creator = gRegistry<CenterCropImpl>().Get(specified_platform_, version);
-  if (nullptr == impl_creator) {
-    MMDEPLOY_ERROR("'CenterCrop' is not supported on '{}' platform", specified_platform_);
-    throw std::domain_error("'Resize' is not supported on specified platform");
-  }
-  impl_ = impl_creator->Create(args);
-}
-
-MMDEPLOY_REGISTER_FACTORY_FUNC(Transform, (CenterCrop, 0), [](const Value& config) {
-  return std::make_unique<CenterCrop>(config, 0);
-});
-
-MMDEPLOY_DEFINE_REGISTRY(CenterCropImpl);
-}  // namespace mmdeploy
--- a/csrc/mmdeploy/preprocess/transform/crop.h
+++ b/csrc/mmdeploy/preprocess/transform/crop.h
@ -1,49 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#ifndef MMDEPLOY_CROP_H
-#define MMDEPLOY_CROP_H
-
-#include <array>
-
-#include "mmdeploy/core/tensor.h"
-#include "transform.h"
-
-namespace mmdeploy {
-
-class MMDEPLOY_API CenterCropImpl : public TransformImpl {
- public:
-  explicit CenterCropImpl(const Value& args);
-  ~CenterCropImpl() override = default;
-
-  Result<Value> Process(const Value& input) override;
-
- protected:
-  virtual Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
-                                   int right) = 0;
-
- protected:
-  struct center_crop_arg_t {
-    std::array<int, 2> crop_size;
-  };
-  using ArgType = struct center_crop_arg_t;
-
- protected:
-  ArgType arg_;
-};
-
-class MMDEPLOY_API CenterCrop : public Transform {
- public:
-  explicit CenterCrop(const Value& args, int version = 0);
-  ~CenterCrop() override = default;
-
-  Result<Value> Process(const Value& input) override { return impl_->Process(input); }
-
- protected:
-  std::unique_ptr<CenterCropImpl> impl_;
-};
-
-MMDEPLOY_DECLARE_REGISTRY(CenterCropImpl, std::unique_ptr<CenterCropImpl>(const Value& config));
-
-}  // namespace mmdeploy
-
-#endif  // MMDEPLOY_CROP_H
--- a/csrc/mmdeploy/preprocess/transform/default_format_bundle.cpp
+++ b/csrc/mmdeploy/preprocess/transform/default_format_bundle.cpp
@ -1,73 +1,72 @@
 // Copyright (c) OpenMMLab. All rights reserved.

-#include "default_format_bundle.h"
-
-#include <cassert>
-
-#include "mmdeploy/archive/json_archive.h"
 #include "mmdeploy/core/tensor.h"
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/operation/managed.h"
+#include "mmdeploy/operation/vision.h"
 #include "mmdeploy/preprocess/transform/tracer.h"
+#include "mmdeploy/preprocess/transform/transform.h"

-namespace mmdeploy {
+namespace mmdeploy::transform {

-DefaultFormatBundleImpl::DefaultFormatBundleImpl(const Value& args) : TransformImpl(args) {
-  if (args.contains("img_to_float") && args["img_to_float"].is_boolean()) {
-    arg_.img_to_float = args["img_to_float"].get<bool>();
+/**
+ * It simplifies the pipeline of formatting common fields: optionally converts "img" to
+ * float, fills in default meta fields that are still missing and transposes "img" with the
+ * HWC2CHW operation.
+ *
+ * Config: "img_to_float" (optional, boolean, defaults to true).
+ */
+class DefaultFormatBundle : public Transform {
+ public:
+  /// Reads "img_to_float" (ignored unless it is a boolean) and creates the two operations.
+  explicit DefaultFormatBundle(const Value& args) {
+    if (args.contains("img_to_float") && args["img_to_float"].is_boolean()) {
+      img_to_float_ = args["img_to_float"].get<bool>();
+    }
+    to_float_ = operation::Managed<operation::ToFloat>::Create();
+    hwc2chw_ = operation::Managed<operation::HWC2CHW>::Create();
   }
-}
 
-Result<Value> DefaultFormatBundleImpl::Process(const Value& input) {
-  MMDEPLOY_DEBUG("DefaultFormatBundle input: {}", to_json(input).dump(2));
-  Value output = input;
-  if (input.contains("img")) {
-    Tensor in_tensor = input["img"].get<Tensor>();
-    OUTCOME_TRY(auto tensor, ToFloat32(in_tensor, arg_.img_to_float));
+  /**
+   * Formats "img" in place; does nothing besides logging when the data has no "img" field.
+   * @return success, or the error of the ToFloat / HWC2CHW operation.
+   */
+  Result<void> Apply(Value& data) override {
+    MMDEPLOY_DEBUG("DefaultFormatBundle input: {}", data);
 
-    // set default meta keys
-    if (!output.contains("pad_shape")) {
-      for (auto v : tensor.shape()) {
-        output["pad_shape"].push_back(v);
+    if (data.contains("img")) {
+      Tensor tensor = data["img"].get<Tensor>();
+      // remember the type before conversion; the tracer records the original input type
+      auto input_data_type = tensor.data_type();
+      if (img_to_float_) {
+        OUTCOME_TRY(to_float_.Apply(tensor, tensor));
       }
-    }
-    if (!output.contains("scale_factor")) {
-      output["scale_factor"].push_back(1.0);
-    }
-    if (!output.contains("img_norm_cfg")) {
-      int channel = tensor.shape()[3];
-      for (int i = 0; i < channel; i++) {
-        output["img_norm_cfg"]["mean"].push_back(0.0);
-        output["img_norm_cfg"]["std"].push_back(1.0);
+
+      // set default meta keys
+      // (taken from the tensor before the transpose below, so shape()[3] is the last
+      // dimension of the incoming layout)
+      if (!data.contains("pad_shape")) {
+        for (auto v : tensor.shape()) {
+          data["pad_shape"].push_back(v);
+        }
       }
-      output["img_norm_cfg"]["to_rgb"] = false;
+      // note: the default "scale_factor" is an array with a single element
+      if (!data.contains("scale_factor")) {
+        data["scale_factor"].push_back(1.0);
+      }
+      // identity normalization: mean 0 and std 1 for every channel
+      if (!data.contains("img_norm_cfg")) {
+        int channel = tensor.shape()[3];
+        for (int i = 0; i < channel; i++) {
+          data["img_norm_cfg"]["mean"].push_back(0.0);
+          data["img_norm_cfg"]["std"].push_back(1.0);
+        }
+        data["img_norm_cfg"]["to_rgb"] = false;
+      }
+
+      // trace static info & runtime args
+      if (data.contains("__tracer__")) {
+        data["__tracer__"].get_ref<Tracer&>().DefaultFormatBundle(img_to_float_, input_data_type);
+      }
+
+      // transpose
+      OUTCOME_TRY(hwc2chw_.Apply(tensor, tensor));
+      data["img"] = std::move(tensor);
     }
 
-    // trace static info & runtime args
-    if (output.contains("__tracer__")) {
-      output["__tracer__"].get_ref<Tracer&>().DefaultFormatBundle(arg_.img_to_float,
-                                                                  in_tensor.data_type());
-    }
-
-    // transpose
-    OUTCOME_TRY(tensor, HWC2CHW(tensor));
-    SetTransformData(output, "img", std::move(tensor));
+    MMDEPLOY_DEBUG("DefaultFormatBundle output: {}", data);
+    return success();
   }
 
-  MMDEPLOY_DEBUG("DefaultFormatBundle output: {}", to_json(output).dump(2));
-  return output;
-}
+ private:
+  operation::Managed<operation::ToFloat> to_float_;  // used only when img_to_float_ is true
+  operation::Managed<operation::HWC2CHW> hwc2chw_;   // layout transpose applied to "img"
+  bool img_to_float_ = true;                         // config "img_to_float"
+};

-DefaultFormatBundle::DefaultFormatBundle(const Value& args, int version) : Transform(args) {
-  auto impl_creator = gRegistry<DefaultFormatBundleImpl>().Get(specified_platform_, version);
-  if (nullptr == impl_creator) {
-    MMDEPLOY_ERROR("'DefaultFormatBundle' is not supported on '{}' platform", specified_platform_);
-    throw std::domain_error("'DefaultFormatBundle' is not supported on specified platform");
-  }
-  impl_ = impl_creator->Create(args);
-}
+MMDEPLOY_REGISTER_TRANSFORM(DefaultFormatBundle);

-MMDEPLOY_REGISTER_FACTORY_FUNC(Transform, (DefaultFormatBundle, 0), [](const Value& config) {
-  return std::make_unique<DefaultFormatBundle>(config, 0);
-});
-
-MMDEPLOY_DEFINE_REGISTRY(DefaultFormatBundleImpl);
-}  // namespace mmdeploy
+}  // namespace mmdeploy::transform
--- a/csrc/mmdeploy/preprocess/transform/default_format_bundle.h
+++ b/csrc/mmdeploy/preprocess/transform/default_format_bundle.h
@ -1,50 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#ifndef MMDEPLOY_DEFAULT_FORMAT_BUNDLE_H
-#define MMDEPLOY_DEFAULT_FORMAT_BUNDLE_H
-
-#include "mmdeploy/core/tensor.h"
-#include "transform.h"
-
-namespace mmdeploy {
-/**
- * It simplifies the pipeline of formatting common fields
- */
-class MMDEPLOY_API DefaultFormatBundleImpl : public TransformImpl {
- public:
-  DefaultFormatBundleImpl(const Value& args);
-  ~DefaultFormatBundleImpl() override = default;
-
-  Result<Value> Process(const Value& input) override;
-
- protected:
-  virtual Result<Tensor> ToFloat32(const Tensor& tensor, const bool& img_to_float) = 0;
-  virtual Result<Tensor> HWC2CHW(const Tensor& tensor) = 0;
-
- protected:
-  struct default_format_bundle_arg_t {
-    bool img_to_float = true;
-  };
-  using ArgType = struct default_format_bundle_arg_t;
-
- protected:
-  ArgType arg_;
-};
-
-class MMDEPLOY_API DefaultFormatBundle : public Transform {
- public:
-  explicit DefaultFormatBundle(const Value& args, int version = 0);
-  ~DefaultFormatBundle() override = default;
-
-  Result<Value> Process(const Value& input) override { return impl_->Process(input); }
-
- private:
-  std::unique_ptr<DefaultFormatBundleImpl> impl_;
-};
-
-MMDEPLOY_DECLARE_REGISTRY(DefaultFormatBundleImpl,
-                          std::unique_ptr<DefaultFormatBundleImpl>(const Value& config));
-
-}  // namespace mmdeploy
-
-#endif  // MMDEPLOY_DEFAULT_FORMAT_BUNDLE_H
--- a/csrc/mmdeploy/preprocess/transform/image2tensor.cpp
+++ b/csrc/mmdeploy/preprocess/transform/image2tensor.cpp
@ -1,56 +1,52 @@
 // Copyright (c) OpenMMLab. All rights reserved.

-#include "image2tensor.h"
-
 #include <cassert>

-#include "mmdeploy/archive/json_archive.h"
 #include "mmdeploy/core/tensor.h"
+#include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/operation/managed.h"
+#include "mmdeploy/operation/vision.h"
 #include "mmdeploy/preprocess/transform/tracer.h"
+#include "mmdeploy/preprocess/transform/transform.h"

-namespace mmdeploy {
+namespace mmdeploy::transform {

-ImageToTensorImpl::ImageToTensorImpl(const Value& args) : TransformImpl(args) {
-  for (auto& key : args["keys"]) {
-    arg_.keys.push_back(key.get<std::string>());
-  }
-}
-
-Result<Value> ImageToTensorImpl::Process(const Value& input) {
-  MMDEPLOY_DEBUG("input: {}", to_json(input).dump(2));
-  Value output = input;
-  for (auto& key : arg_.keys) {
-    assert(input.contains(key));
-    Tensor src_tensor = input[key].get<Tensor>();
-    auto& shape = src_tensor.desc().shape;
-
-    assert(shape.size() == 4);
-    assert(shape[3] == 1 || shape[3] == 3);
-
-    OUTCOME_TRY(auto dst, HWC2CHW(src_tensor));
-    SetTransformData(output, key, std::move(dst));
-
-    if (output.contains("__tracer__")) {
-      output["__tracer__"].get_ref<Tracer&>().ImageToTensor(src_tensor.data_type());
+/**
+ * Applies the HWC2CHW operation to the image stored under each configured key.
+ *
+ * Config: "keys" (array of strings) names the fields to convert.
+ * Each input is expected to be a 4-dimensional tensor whose last dimension is 1 or 3; this is
+ * only verified by `assert`, i.e. not in builds that define NDEBUG.
+ */
+class ImageToTensor : public Transform {
+ public:
+  /// Stores the configured keys and creates the HWC2CHW operation.
+  /// NOTE(review): "keys" is read without checking that it exists or is an array.
+  explicit ImageToTensor(const Value& args) {
+    for (auto& key : args["keys"]) {
+      keys_.push_back(key.get<std::string>());
     }
-  }  // for key
-  MMDEPLOY_DEBUG("output: {}", to_json(output).dump(2));
-  return output;
-}
-
-ImageToTensor::ImageToTensor(const Value& args, int version) : Transform(args) {
-  auto impl_creator = gRegistry<ImageToTensorImpl>().Get(specified_platform_, version);
-  if (nullptr == impl_creator) {
-    MMDEPLOY_ERROR("'ImageToTensor' is not supported on '{}' platform", specified_platform_);
-    throw std::domain_error("'ImageToTensor' is not supported on specified platform");
+    hwc2chw_ = operation::Managed<operation::HWC2CHW>::Create();
   }
-  impl_ = impl_creator->Create(args);
-}
 
-MMDEPLOY_REGISTER_FACTORY_FUNC(Transform, (ImageToTensor, 0), [](const Value& config) {
-  return std::make_unique<ImageToTensor>(config, 0);
-});
+  /**
+   * Replaces every configured field with its transposed tensor.
+   * @return success, or the error of the HWC2CHW operation.
+   */
+  Result<void> Apply(Value& data) override {
+    MMDEPLOY_DEBUG("input: {}", data);
+    for (auto& key : keys_) {
+      assert(data.contains(key));
+      Tensor src_tensor = data[key].get<Tensor>();
+      // `shape` is only used by the assertions below
+      auto& shape = src_tensor.desc().shape;
 
-MMDEPLOY_DEFINE_REGISTRY(ImageToTensorImpl);
+      assert(shape.size() == 4);
+      assert(shape[3] == 1 || shape[3] == 3);
 
-}  // namespace mmdeploy
+      Tensor dst;
+      OUTCOME_TRY(hwc2chw_.Apply(src_tensor, dst));
+      data[key] = std::move(dst);
+
+      // the tracer is given the data type of the source tensor
+      if (data.contains("__tracer__")) {
+        data["__tracer__"].get_ref<Tracer&>().ImageToTensor(src_tensor.data_type());
+      }
+    }  // for key
+    MMDEPLOY_DEBUG("output: {}", data);
+    return success();
+  }
+
+ private:
+  operation::Managed<operation::HWC2CHW> hwc2chw_;  // layout transpose operation
+  std::vector<std::string> keys_;                   // fields to convert, from config "keys"
+};
+
+MMDEPLOY_REGISTER_TRANSFORM(ImageToTensor);
+
+}  // namespace mmdeploy::transform
--- a/csrc/mmdeploy/preprocess/transform/image2tensor.h
+++ b/csrc/mmdeploy/preprocess/transform/image2tensor.h
@ -1,53 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#ifndef MMDEPLOY_IMAGE2TENSOR_H
-#define MMDEPLOY_IMAGE2TENSOR_H
-
-#include "mmdeploy/core/tensor.h"
-#include "transform.h"
-
-namespace mmdeploy {
-/**
- * Convert image to `Tensor` by given keys.
- *
- * The dimension order of input image is (1, H, W, C). The pipeline will convert
- * it to (1, C, H, W).
- *
- */
-class MMDEPLOY_API ImageToTensorImpl : public TransformImpl {
- public:
-  ImageToTensorImpl(const Value& args);
-  ~ImageToTensorImpl() override = default;
-
-  Result<Value> Process(const Value& input) override;
-
- protected:
-  virtual Result<Tensor> HWC2CHW(const Tensor& tensor) = 0;
-
- protected:
-  struct to_img_tensor_arg_t {
-    std::vector<std::string> keys;
-  };
-  using ArgType = struct to_img_tensor_arg_t;
-
- protected:
-  ArgType arg_;
-};
-
-class MMDEPLOY_API ImageToTensor : public Transform {
- public:
-  explicit ImageToTensor(const Value& args, int version = 0);
-  ~ImageToTensor() override = default;
-
-  Result<Value> Process(const Value& input) override { return impl_->Process(input); }
-
- private:
-  std::unique_ptr<ImageToTensorImpl> impl_;
-};
-
-MMDEPLOY_DECLARE_REGISTRY(ImageToTensorImpl,
-                          std::unique_ptr<ImageToTensorImpl>(const Value& config));
-
-}  // namespace mmdeploy
-
-#endif  // MMDEPLOY_IMAGE2TENSOR_H
--- a/csrc/mmdeploy/preprocess/transform/lift.cpp
+++ b/csrc/mmdeploy/preprocess/transform/lift.cpp
@ -1,35 +1,35 @@
 // Copyright (c) OpenMMLab. All rights reserved.

-#include "mmdeploy/preprocess/transform/lift.h"
-
-#include "mmdeploy/archive/json_archive.h"
 #include "mmdeploy/archive/value_archive.h"
 #include "mmdeploy/core/utils/formatter.h"
+#include "mmdeploy/preprocess/transform/transform.h"

-namespace mmdeploy {
-Lift::Lift(const Value& args, int version) : Transform(args) {
-  std::string type = "Compose";
-  auto creator = gRegistry<Transform>().Get(type, version);
-  if (!creator) {
-    MMDEPLOY_ERROR("Unable to find Transform creator: {}. Available transforms: {}", type,
-                   gRegistry<Transform>().List());
-    throw_exception(eEntryNotFound);
+namespace mmdeploy::transform {
+
+/**
+ * Transform that applies one composed pipeline to every element of an array-valued input.
+ * The inner pipeline is created from the same config that is passed to Lift.
+ */
+class Lift : public Transform {
+ public:
+  /// Creates the inner pipeline through the Transform registry; throws eEntryNotFound when
+  /// no creator is found.
+  explicit Lift(const Value& args) {
+    // NOTE(review): the lookup key is lower-case "compose", while the code this replaces
+    // looked up "Compose" -- confirm that the registry resolves this spelling.
+    const char* type = "compose";
+    if (auto creator = gRegistry<Transform>().Get(type)) {
+      compose_ = creator->Create(args);
+    } else {
+      MMDEPLOY_ERROR("Unable to find Transform creator: {}. Available transforms: {}", type,
+                     gRegistry<Transform>().List());
+      throw_exception(eEntryNotFound);
+    }
   }
-  compose_ = creator->Create(args);
-}
 
-Result<Value> Lift::Process(const Value& input) {
-  Value output;
-  for (int i = 0; i < input.size(); i++) {
-    Value single = input[i];
-    OUTCOME_TRY(auto t, compose_->Process(single));
-    output.push_back(std::move(t));
+  /**
+   * Applies the inner pipeline to each element of `data` in place, in order.
+   * Stops at the first failing element and returns its error; elements processed before it
+   * keep their modifications.
+   */
+  Result<void> Apply(Value& data) override {
+    for (auto& item : data.array()) {
+      OUTCOME_TRY(compose_->Apply(item));
+    }
+    return success();
   }
-  return std::move(output);
-}
 
-MMDEPLOY_REGISTER_FACTORY_FUNC(Transform, (Lift, 0), [](const Value& config) {
-  return std::make_unique<Lift>(config, 0);
-});
+ private:
+  std::unique_ptr<Transform> compose_;  // inner pipeline applied to each element
+};

-}  // namespace mmdeploy
+MMDEPLOY_REGISTER_TRANSFORM(Lift);
+
+}  // namespace mmdeploy::transform
--- a/Show More
+++ b/Show More