[Feature] Sync mmaction2-sdk(master) to dev1.x ()

* sync mmaction sdk

* pipeline.json

* fix docs

* replace topk when make regression for mmaction2

* add python api

* add missing file

* add missing test file

* remove cudnn dep for formatshape

* add sample arg for input
pull/1362/head
Chen Xin 2022-11-10 15:13:24 +08:00 committed by GitHub
parent 180500d76d
commit ccc21289d1
46 changed files with 2678 additions and 35 deletions

View File

@ -0,0 +1,190 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "video_recognizer.h"
#include <numeric>
#include <vector>
#include "common_internal.h"
#include "executor_internal.h"
#include "mmdeploy/archive/value_archive.h"
#include "mmdeploy/codebase/mmaction/mmaction.h"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/mat.h"
#include "mmdeploy/core/model.h"
#include "mmdeploy/core/status_code.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/core/value.h"
#include "model.h"
#include "pipeline.h"
using namespace mmdeploy;
namespace {
Value config_template(const Model& model) {
// clang-format off
return {
{"type", "Pipeline"},
{"input", {"video"}},
{
"tasks", {
{
{"name", "Video Recognizer"},
{"type", "Inference"},
{"input", "video"},
{"output", "label"},
{"params", {{"model", std::move(model)}}},
}
}
},
{"output", "label"},
};
// clang-format on
}
} // namespace
int mmdeploy_video_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id,
mmdeploy_video_recognizer_t* recognizer) {
mmdeploy_context_t context{};
auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
if (ec != MMDEPLOY_SUCCESS) {
return ec;
}
ec = mmdeploy_video_recognizer_create_v2(model, context, recognizer);
mmdeploy_context_destroy(context);
return ec;
}
int mmdeploy_video_recognizer_create_by_path(const char* model_path, const char* device_name,
int device_id,
mmdeploy_video_recognizer_t* recognizer) {
mmdeploy_model_t model{};
if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) {
return ec;
}
auto ec = mmdeploy_video_recognizer_create(model, device_name, device_id, recognizer);
mmdeploy_model_destroy(model);
return ec;
}
int mmdeploy_video_recognizer_apply(mmdeploy_video_recognizer_t recognizer,
const mmdeploy_mat_t* images,
const mmdeploy_video_sample_info_t* video_info, int video_count,
mmdeploy_video_recognition_t** results, int** result_count) {
wrapped<mmdeploy_value_t> input;
if (auto ec =
mmdeploy_video_recognizer_create_input(images, video_info, video_count, input.ptr())) {
return ec;
}
wrapped<mmdeploy_value_t> output;
if (auto ec = mmdeploy_video_recognizer_apply_v2(recognizer, input, output.ptr())) {
return ec;
}
if (auto ec = mmdeploy_video_recognizer_get_result(output, results, result_count)) {
return ec;
}
return MMDEPLOY_SUCCESS;
}
void mmdeploy_video_recognizer_release_result(mmdeploy_video_recognition_t* results,
int* result_count, int video_count) {
delete[] results;
delete[] result_count;
}
void mmdeploy_video_recognizer_destroy(mmdeploy_video_recognizer_t recognizer) {
mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)recognizer);
}
int mmdeploy_video_recognizer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context,
mmdeploy_video_recognizer_t* recognizer) {
auto config = config_template(*Cast(model));
return mmdeploy_pipeline_create_v3(Cast(&config), context, (mmdeploy_pipeline_t*)recognizer);
}
int mmdeploy_video_recognizer_create_input(const mmdeploy_mat_t* images,
const mmdeploy_video_sample_info_t* video_info,
int video_count, mmdeploy_value_t* value) {
if (video_count && (images == nullptr || video_info == nullptr)) {
return MMDEPLOY_E_INVALID_ARG;
}
try {
auto input = std::make_unique<Value>(Value{Value::kArray});
auto sample = std::make_unique<Value>(Value::kArray);
for (int i = 0; i < video_count; ++i) {
int clip_len = video_info[i].clip_len;
int num_clips = video_info[i].num_clips;
int n_mat = clip_len * num_clips;
for (int j = 0; j < n_mat; j++) {
mmdeploy::Mat _mat{images[j].height,
images[j].width,
PixelFormat(images[j].format),
DataType(images[j].type),
images[j].data,
images[j].device ? *(const Device*)(images[j].device) : Device{0}};
sample->push_back({{"ori_img", _mat}, {"clip_len", clip_len}, {"num_clips", num_clips}});
}
input->front().push_back(std::move(*sample.release()));
}
*value = Cast(input.release());
return MMDEPLOY_SUCCESS;
} catch (const std::exception& e) {
MMDEPLOY_ERROR("unhandled exception: {}", e.what());
} catch (...) {
MMDEPLOY_ERROR("unknown exception caught");
}
return MMDEPLOY_E_FAIL;
}
int mmdeploy_video_recognizer_apply_v2(mmdeploy_video_recognizer_t recognizer,
mmdeploy_value_t input, mmdeploy_value_t* output) {
return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)recognizer, input, output);
}
int mmdeploy_video_recognizer_get_result(mmdeploy_value_t output,
mmdeploy_video_recognition_t** results,
int** result_count) {
if (!output || !results || !result_count) {
return MMDEPLOY_E_INVALID_ARG;
}
try {
Value& value = Cast(output)->front();
auto classify_outputs = from_value<std::vector<mmaction::Labels>>(value);
std::vector<int> _result_count;
_result_count.reserve(classify_outputs.size());
for (const auto& cls_output : classify_outputs) {
_result_count.push_back((int)cls_output.size());
}
auto total = std::accumulate(begin(_result_count), end(_result_count), 0);
std::unique_ptr<int[]> result_count_data(new int[_result_count.size()]{});
std::copy(_result_count.begin(), _result_count.end(), result_count_data.get());
std::unique_ptr<mmdeploy_video_recognition_t[]> result_data(
new mmdeploy_video_recognition_t[total]{});
auto result_ptr = result_data.get();
for (const auto& cls_output : classify_outputs) {
for (const auto& label : cls_output) {
result_ptr->label_id = label.label_id;
result_ptr->score = label.score;
++result_ptr;
}
}
*result_count = result_count_data.release();
*results = result_data.release();
return MMDEPLOY_SUCCESS;
} catch (const std::exception& e) {
MMDEPLOY_ERROR("unhandled exception: {}", e.what());
} catch (...) {
MMDEPLOY_ERROR("unknown exception caught");
}
return MMDEPLOY_E_FAIL;
}

View File

@ -0,0 +1,139 @@
// Copyright (c) OpenMMLab. All rights reserved.
/**
* @file video_recognizer.h
* @brief Interface to MMACTION video recognition task
*/
#ifndef MMDEPLOY_VIDEO_RECOGNIZER_H
#define MMDEPLOY_VIDEO_RECOGNIZER_H
#include "common.h"
#include "executor.h"
#include "model.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct mmdeploy_video_recognition_t {
int label_id;
float score;
} mmdeploy_video_recognition_t;
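/** @brief Frame layout of one input video: the recognizer expects clip_len * num_clips frames per video */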
typedef struct mmdeploy_video_sample_info_t {
int clip_len;
int num_clips;
} mmdeploy_video_sample_info_t;
typedef struct mmdeploy_video_recognizer* mmdeploy_video_recognizer_t;
/**
* @brief Create video recognizer's handle
* @param[in] model an instance of mmaction sdk model created by
* \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h
* @param[in] device_name name of device, such as "cpu", "cuda", etc.
* @param[in] device_id id of device.
* @param[out] recognizer handle of the created video recognizer, which must be destroyed
* by \ref mmdeploy_video_recognizer_destroy
* @return status of creating video recognizer's handle
*/
MMDEPLOY_API int mmdeploy_video_recognizer_create(mmdeploy_model_t model, const char* device_name,
int device_id,
mmdeploy_video_recognizer_t* recognizer);
/**
* @brief Create a video recognizer instance
* @param[in] model_path path to video recognition model
* @param[in] device_name name of device, such as "cpu", "cuda", etc.
* @param[in] device_id id of device.
* @param[out] recognizer handle of the created video recognizer, which must be destroyed
* by \ref mmdeploy_video_recognizer_destroy
* @return status code of the operation
*/
MMDEPLOY_API int mmdeploy_video_recognizer_create_by_path(const char* model_path,
const char* device_name, int device_id,
mmdeploy_video_recognizer_t* recognizer);
/**
* @brief Apply video recognizer to a batch of videos
* @param[in] recognizer video recognizer's handle created by \ref
* mmdeploy_video_recognizer_create_by_path
* @param[in] images a batch of videos
* @param[in] video_info video information of each video
* @param[in] video_count number of videos
* @param[out] results a linear buffer that holds the recognition results. It must be released
* by \ref mmdeploy_video_recognizer_release_result
* @param[out] result_count a linear buffer of length \p video_count that saves the number of
* recognition results of each video. It must be released by \ref
* mmdeploy_video_recognizer_release_result
* @return status code of the operation
*/
MMDEPLOY_API int mmdeploy_video_recognizer_apply(mmdeploy_video_recognizer_t recognizer,
const mmdeploy_mat_t* images,
const mmdeploy_video_sample_info_t* video_info,
int video_count,
mmdeploy_video_recognition_t** results,
int** result_count);
/** @brief Release result buffer returned by \ref mmdeploy_video_recognizer_apply
* @param[in] results result buffer returned by the video recognizer
* @param[in] result_count buffer holding the number of results for each video
* @param[in] video_count length of \p result_count
*/
MMDEPLOY_API void mmdeploy_video_recognizer_release_result(mmdeploy_video_recognition_t* results,
int* result_count, int video_count);
/**
* @brief destroy video recognizer
* @param[in] recognizer handle of video recognizer created by \ref
* mmdeploy_video_recognizer_create_by_path or \ref mmdeploy_video_recognizer_create
*/
MMDEPLOY_API void mmdeploy_video_recognizer_destroy(mmdeploy_video_recognizer_t recognizer);
/**
* @brief Same as \ref mmdeploy_video_recognizer_create, but allows to control execution context of
* tasks via context
*/
MMDEPLOY_API int mmdeploy_video_recognizer_create_v2(mmdeploy_model_t model,
mmdeploy_context_t context,
mmdeploy_video_recognizer_t* recognizer);
/**
* @brief Pack video recognizer inputs into mmdeploy_value_t
* @param[in] images a batch of videos
* @param[in] video_info video information of each video
* @param[in] video_count number of videos in the batch
* @param[out] value created value
* @return status code of the operation
*/
MMDEPLOY_API int mmdeploy_video_recognizer_create_input(
const mmdeploy_mat_t* images, const mmdeploy_video_sample_info_t* video_info, int video_count,
mmdeploy_value_t* value);
/**
* @brief Apply video recognizer to packed inputs created by \ref mmdeploy_video_recognizer_create_input
* @param[in] recognizer video recognizer's handle
* @param[in] input packed input
* @param[out] output inference output
* @return status code of the operation
*/
MMDEPLOY_API int mmdeploy_video_recognizer_apply_v2(mmdeploy_video_recognizer_t recognizer,
mmdeploy_value_t input,
mmdeploy_value_t* output);
/**
* @brief Unpack video recognizer output into structured results
* @param[in] output inference output
* @param[out] results structured recognition results
* @param[out] result_count number of recognition results for each video
* @return status code of the operation
*/
MMDEPLOY_API int mmdeploy_video_recognizer_get_result(mmdeploy_value_t output,
mmdeploy_video_recognition_t** results,
int** result_count);
#ifdef __cplusplus
}
#endif
#endif // MMDEPLOY_VIDEO_RECOGNIZER_H

View File

@ -0,0 +1,91 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CSRC_MMDEPLOY_APIS_CXX_VIDEO_RECOGNIZER_HPP_
#define MMDEPLOY_CSRC_MMDEPLOY_APIS_CXX_VIDEO_RECOGNIZER_HPP_
#include "mmdeploy/common.hpp"
#include "mmdeploy/video_recognizer.h"
namespace mmdeploy {
namespace cxx {
using VideoRecognition = mmdeploy_video_recognition_t;
using VideoSampleInfo = mmdeploy_video_sample_info_t;
class VideoRecognizer : public NonMovable {
public:
VideoRecognizer(const Model& model, const Context& context) {
auto ec = mmdeploy_video_recognizer_create_v2(model, context, &recognizer_);
if (ec != MMDEPLOY_SUCCESS) {
throw_exception(static_cast<ErrorCode>(ec));
}
}
~VideoRecognizer() {
if (recognizer_) {
mmdeploy_video_recognizer_destroy(recognizer_);
recognizer_ = {};
}
}
using Result = Result_<VideoRecognition>;
std::vector<Result> Apply(Span<const std::vector<Mat>> videos,
Span<const VideoSampleInfo> infos) {
if (videos.empty()) {
return {};
}
int video_count = videos.size();
VideoRecognition* results{};
int* result_count{};
std::vector<Mat> images;
std::vector<VideoSampleInfo> video_info;
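// Flatten the frames of all videos into one contiguous array; the C API reports how many
// results belong to each video in result_count, which is used below to split them back.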
for (int i = 0; i < videos.size(); i++) {
for (auto& mat : videos[i]) {
images.push_back(mat);
}
video_info.push_back(infos[i]);
}
auto ec =
mmdeploy_video_recognizer_apply(recognizer_, reinterpret(images.data()), video_info.data(),
video_count, &results, &result_count);
if (ec != MMDEPLOY_SUCCESS) {
throw_exception(static_cast<ErrorCode>(ec));
}
std::vector<Result> rets;
rets.reserve(video_count);
std::shared_ptr<VideoRecognition> data(results, [result_count, count = video_count](auto p) {
mmdeploy_video_recognizer_release_result(p, result_count, count);
});
size_t offset = 0;
for (size_t i = 0; i < video_count; ++i) {
offset += rets.emplace_back(offset, result_count[i], data).size();
}
return rets;
}
Result Apply(const std::vector<Mat>& video, const VideoSampleInfo info) {
return Apply(Span{video}, Span{info})[0];
}
private:
mmdeploy_video_recognizer_t recognizer_{};
};
} // namespace cxx
using cxx::VideoRecognition;
using cxx::VideoRecognizer;
using cxx::VideoSampleInfo;
} // namespace mmdeploy
#endif // MMDEPLOY_CSRC_MMDEPLOY_APIS_CXX_VIDEO_RECOGNIZER_HPP_

View File

@ -0,0 +1,88 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/video_recognizer.h"
#include "common.h"
namespace mmdeploy::python {
class PyVideoRecognizer {
public:
PyVideoRecognizer(const char* model_path, const char* device_name, int device_id) {
auto status =
mmdeploy_video_recognizer_create_by_path(model_path, device_name, device_id, &recognizer_);
if (status != MMDEPLOY_SUCCESS) {
throw std::runtime_error("failed to create video_recognizer");
}
}
std::vector<std::vector<std::tuple<int, float>>> Apply(
const std::vector<std::vector<PyImage>>& imgs, const std::vector<std::pair<int, int>>& info) {
if (info.size() != imgs.size()) {
throw std::invalid_argument("the length of info is not equal with imgs");
}
for (int i = 0; i < info.size(); i++) {
if (imgs[i].size() != info[i].first * info[i].second) {
throw std::invalid_argument("invalid info");
}
}
int total = 0;
for (int i = 0; i < imgs.size(); i++) {
total += imgs[i].size();
}
std::vector<mmdeploy_mat_t> clips;
std::vector<mmdeploy_video_sample_info_t> clip_info;
clips.reserve(total);
clip_info.reserve(total);
for (int i = 0; i < imgs.size(); i++) {
for (const auto& img : imgs[i]) {
auto mat = GetMat(img);
clips.push_back(mat);
}
clip_info.push_back({info[i].first, info[i].second});
}
mmdeploy_video_recognition_t* results{};
int* result_count{};
auto status = mmdeploy_video_recognizer_apply(recognizer_, clips.data(), clip_info.data(), 1,
&results, &result_count);
if (status != MMDEPLOY_SUCCESS) {
throw std::runtime_error("failed to apply video_recognizer, code: " + std::to_string(status));
}
auto output = std::vector<std::vector<std::tuple<int, float>>>{};
output.reserve(imgs.size());
auto result_ptr = results;
for (int i = 0; i < imgs.size(); ++i) {
std::vector<std::tuple<int, float>> label_score;
for (int j = 0; j < result_count[i]; ++j) {
label_score.emplace_back(result_ptr[j].label_id, result_ptr[j].score);
}
output.push_back(std::move(label_score));
result_ptr += result_count[i];
}
mmdeploy_video_recognizer_release_result(results, result_count, (int)imgs.size());
return output;
}
~PyVideoRecognizer() {
mmdeploy_video_recognizer_destroy(recognizer_);
recognizer_ = {};
}
private:
mmdeploy_video_recognizer_t recognizer_{};
};
static PythonBindingRegisterer register_video_recognizer{[](py::module& m) {
py::class_<PyVideoRecognizer>(m, "VideoRecognizer")
.def(py::init([](const char* model_path, const char* device_name, int device_id) {
return std::make_unique<PyVideoRecognizer>(model_path, device_name, device_id);
}),
py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0)
.def("__call__",
[](PyVideoRecognizer* self, const std::vector<PyImage>& imgs,
const std::pair<int, int>& info) { return self->Apply({imgs}, {info})[0]; })
.def("batch", &PyVideoRecognizer::Apply);
}};
} // namespace mmdeploy::python
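A minimal Python usage sketch for the binding registered above (a sketch only: the model directory and the frame list are placeholder inputs):

from mmdeploy_python import VideoRecognizer

# 'work_dir/tsn' is a placeholder model directory; frames is assumed to be a list of
# clip_len * num_clips BGR frames (numpy uint8 arrays) sampled from one video
recognizer = VideoRecognizer(model_path='work_dir/tsn', device_name='cpu', device_id=0)
results = recognizer(frames, (clip_len, num_clips))
for label_id, score in results:
    print(label_id, score)
# recognizer.batch(frame_lists, info_pairs) handles several videos at once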

View File

@ -11,6 +11,7 @@ if ("all" IN_LIST MMDEPLOY_CODEBASES)
list(APPEND CODEBASES "mmedit")
list(APPEND CODEBASES "mmpose")
list(APPEND CODEBASES "mmrotate")
list(APPEND CODEBASES "mmaction")
else ()
set(CODEBASES ${MMDEPLOY_CODEBASES})
endif ()

View File

@ -0,0 +1,15 @@
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_mmaction)
file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp")
mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
add_subdirectory(cpu)
add_subdirectory(cuda)
target_link_libraries(${PROJECT_NAME} PRIVATE
mmdeploy::transform
mmdeploy_opencv_utils)
add_library(mmdeploy::mmaction ALIAS ${PROJECT_NAME})
set(MMDEPLOY_TASKS ${MMDEPLOY_TASKS} video_recognizer CACHE INTERNAL "")

View File

@ -0,0 +1,70 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include <algorithm>
#include <numeric>
#include "mmdeploy/codebase/mmaction/mmaction.h"
#include "mmdeploy/core/tensor.h"
#include "mmdeploy/core/utils/device_utils.h"
namespace mmdeploy::mmaction {
class BaseHead : public MMAction {
public:
explicit BaseHead(const Value& cfg) : MMAction(cfg) {
if (cfg.contains("params")) {
topk_ = cfg["params"].value("topk", 1);
if (topk_ <= 0) {
MMDEPLOY_ERROR("'topk' should be greater than 0, but got '{}'", topk_);
throw_exception(eInvalidArgument);
}
}
}
Result<Value> operator()(const Value& infer_res) {
MMDEPLOY_DEBUG("infer_res: {}", infer_res);
auto output = infer_res["output"].get<Tensor>();
if (!(output.shape().size() >= 2 && output.data_type() == DataType::kFLOAT)) {
MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", output.shape(),
(int)output.data_type());
return Status(eNotSupported);
}
auto class_num = (int)output.shape(1);
OUTCOME_TRY(auto _scores, MakeAvailableOnDevice(output, kHost, stream()));
OUTCOME_TRY(stream().Wait());
return GetLabels(_scores, class_num);
}
private:
Value GetLabels(const Tensor& scores, int class_num) const {
auto scores_data = scores.data<float>();
Labels output;
output.reserve(topk_);
std::vector<int> idx(class_num);
iota(begin(idx), end(idx), 0);
partial_sort(begin(idx), begin(idx) + topk_, end(idx),
[&](int i, int j) { return scores_data[i] > scores_data[j]; });
for (int i = 0; i < topk_; ++i) {
auto label = Label{idx[i], scores_data[idx[i]]};
MMDEPLOY_DEBUG("label_id: {}, score: {}", label.label_id, label.score);
output.push_back(label);
}
return to_value(std::move(output));
}
private:
static constexpr const auto kHost = Device{0};
int topk_{1};
};
REGISTER_CODEBASE_COMPONENT(MMAction, BaseHead);
using SlowFastHead = BaseHead;
REGISTER_CODEBASE_COMPONENT(MMAction, SlowFastHead);
using TSNHead = BaseHead;
REGISTER_CODEBASE_COMPONENT(MMAction, TSNHead);
} // namespace mmdeploy::mmaction

View File

@ -0,0 +1,15 @@
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_mmaction_cpu_impl CXX)
if ("cpu" IN_LIST MMDEPLOY_TARGET_DEVICES)
add_library(${PROJECT_NAME} OBJECT format_shape_impl.cpp)
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE 1)
if (NOT (MMDEPLOY_SHARED_LIBS OR MSVC))
target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-fvisibility=hidden>)
endif ()
target_link_libraries(${PROJECT_NAME} PRIVATE
mmdeploy::core)
target_link_libraries(mmdeploy_mmaction PRIVATE ${PROJECT_NAME})
mmdeploy_export(${PROJECT_NAME})
endif ()

View File

@ -0,0 +1,138 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmaction/format_shape.h"
#include "mmdeploy/core/utils/device_utils.h"
using namespace std;
namespace mmdeploy {
namespace cpu {
class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
public:
explicit FormatShapeImpl(const Value& args) : ::mmdeploy::FormatShapeImpl(args) {}
protected:
Result<Tensor> Format(const std::vector<Tensor>& tensors, int clip_len, int num_clips) {
int N = tensors.size();
int H = tensors[0].shape(1);
int W = tensors[0].shape(2);
int C = tensors[0].shape(3);
std::vector<Tensor> host_tensors;
host_tensors.reserve(N);
for (int i = 0; i < N; i++) {
OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensors[i], kHost, stream_));
host_tensors.push_back(std::move(src_tensor));
}
OUTCOME_TRY(stream_.Wait());
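// Pack the N HWC frames into one contiguous NHWC tensor on the host before permuting it below.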
TensorDesc desc = {kHost, DataType::kFLOAT, {N, H, W, C}};
Tensor imgs(desc);
int offset = 0;
int n_item = H * W * C;
int copy_size = n_item * sizeof(float);
for (int i = 0; i < N; i++) {
auto src_buffer = host_tensors[i].buffer();
auto dst_buffer = imgs.buffer();
OUTCOME_TRY(stream_.Copy(src_buffer, dst_buffer, copy_size, 0, offset));
offset += copy_size;
}
OUTCOME_TRY(stream_.Wait());
Tensor dst;
if (arg_.input_format == "NCHW") {
OUTCOME_TRY(dst, FormatNCHW(imgs, clip_len, num_clips));
}
if (arg_.input_format == "NCTHW") {
OUTCOME_TRY(dst, FormatNCTHW(imgs, clip_len, num_clips));
}
TensorShape expand_dim = dst.shape();
expand_dim.insert(expand_dim.begin(), 1);
dst.Reshape(expand_dim);
return dst;
}
Result<Tensor> FormatNCHW(Tensor& src, int clip_len, int num_clips) {
int N = src.shape(0);
int H = src.shape(1);
int W = src.shape(2);
int C = src.shape(3);
return Transpose(src, {N, H, W, C}, {0, 3, 1, 2});
};
Result<Tensor> FormatNCTHW(Tensor& src, int clip_len, int num_clips) {
int N = src.shape(0);
int H = src.shape(1);
int W = src.shape(2);
int C = src.shape(3);
int L = clip_len;
if (N % L != 0) {
return Status(eInvalidArgument);
}
int M = N / L;
src.Reshape({M, L, H, W, C});
return Transpose(src, {M, L, H, W, C}, {0, 4, 1, 2, 3});
};
Result<Tensor> Transpose(Tensor& src, const std::vector<int>& src_dims,
const std::vector<int>& permutation) {
Tensor dst(src.desc());
TensorShape shape(src.shape().size());
for (int i = 0; i < shape.size(); i++) {
shape[i] = src.shape(permutation[i]);
}
dst.Reshape(shape);
int ndim = shape.size();
std::vector<int> dst_strides(ndim);
std::vector<int> src_strides(ndim);
dst_strides[ndim - 1] = src_strides[ndim - 1] = 1;
for (int i = ndim - 2; i >= 0; i--) {
dst_strides[i] = dst_strides[i + 1] * shape[i + 1];
src_strides[i] = src_strides[i + 1] * src_dims[i + 1];
}
std::vector<int> tmp(ndim);
for (int i = 0; i < ndim; i++) {
tmp[i] = src_strides[permutation[i]];
}
src_strides.swap(tmp);
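// Walk the destination tensor in row-major order: `coord` acts like an odometer over the output
// shape, and both data pointers advance by their per-dimension strides (rewinding when a digit wraps).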
std::vector<int> coord(ndim, 0);
auto dst_data = dst.data<float>();
auto src_data = src.data<float>();
int i;
do {
dst_data[0] = src_data[0];
for (i = ndim - 1; i >= 0; i--) {
if (++coord[i] == shape[i]) {
coord[i] = 0;
dst_data -= (shape[i] - 1) * dst_strides[i];
src_data -= (shape[i] - 1) * src_strides[i];
} else {
dst_data += dst_strides[i];
src_data += src_strides[i];
break;
}
}
} while (i >= 0);
return dst;
}
constexpr static Device kHost{0, 0};
};
class FormatShapeImplCreator : public Creator<::mmdeploy::FormatShapeImpl> {
public:
const char* GetName() const override { return "cpu"; }
int GetVersion() const override { return 1; }
ReturnType Create(const Value& args) override { return make_unique<FormatShapeImpl>(args); }
};
} // namespace cpu
} // namespace mmdeploy
using ::mmdeploy::FormatShapeImpl;
using ::mmdeploy::cpu::FormatShapeImplCreator;
REGISTER_MODULE(FormatShapeImpl, FormatShapeImplCreator);

View File

@ -0,0 +1,18 @@
# Copyright (c) OpenMMLab. All rights reserved.
if (NOT "cuda" IN_LIST MMDEPLOY_TARGET_DEVICES)
return()
endif ()
project(mmdeploy_mmaction_cuda_impl CXX)
add_library(${PROJECT_NAME} OBJECT format_shape_impl.cpp transpose.cu)
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE 1)
if (NOT (MMDEPLOY_SHARED_LIBS OR MSVC))
target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-fvisibility=hidden>)
endif ()
target_include_directories(${PROJECT_NAME} PRIVATE
${CUDA_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} PRIVATE
mmdeploy::core)
target_link_libraries(mmdeploy_mmaction PRIVATE ${PROJECT_NAME})
mmdeploy_export(${PROJECT_NAME})

View File

@ -0,0 +1,129 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "cuda_runtime.h"
#include "mmdeploy/codebase/mmaction/format_shape.h"
#include "mmdeploy/core/utils/device_utils.h"
using namespace std;
namespace mmdeploy {
namespace cuda {
template <typename T>
void Transpose(const T* src, const int* src_strides, T* dst, const int* dst_strides, int ndim,
int total, cudaStream_t stream);
class FormatShapeImpl : public ::mmdeploy::FormatShapeImpl {
public:
explicit FormatShapeImpl(const Value& args) : ::mmdeploy::FormatShapeImpl(args) {}
protected:
Result<Tensor> Format(const std::vector<Tensor>& tensors, int clip_len, int num_clips) {
int N = tensors.size();
int H = tensors[0].shape(1);
int W = tensors[0].shape(2);
int C = tensors[0].shape(3);
TensorDesc desc = {device_, DataType::kFLOAT, {N, H, W, C}};
Tensor imgs(desc);
int offset = 0;
int n_item = H * W * C;
int copy_size = n_item * sizeof(float);
for (int i = 0; i < N; i++) {
auto src_buffer = tensors[i].buffer();
auto dst_buffer = imgs.buffer();
OUTCOME_TRY(stream_.Copy(src_buffer, dst_buffer, copy_size, 0, offset));
offset += copy_size;
}
Tensor dst;
if (arg_.input_format == "NCHW") {
OUTCOME_TRY(dst, FormatNCHW(imgs, clip_len, num_clips));
}
if (arg_.input_format == "NCTHW") {
OUTCOME_TRY(dst, FormatNCTHW(imgs, clip_len, num_clips));
}
TensorShape expand_dim = dst.shape();
expand_dim.insert(expand_dim.begin(), 1);
dst.Reshape(expand_dim);
return dst;
}
Result<Tensor> FormatNCHW(Tensor& src, int clip_len, int num_clips) {
int N = src.shape(0);
int H = src.shape(1);
int W = src.shape(2);
int C = src.shape(3);
return Transpose(src, {N, H, W, C}, {0, 3, 1, 2});
};
Result<Tensor> FormatNCTHW(Tensor& src, int clip_len, int num_clips) {
int N = src.shape(0);
int H = src.shape(1);
int W = src.shape(2);
int C = src.shape(3);
int L = clip_len;
if (N % L != 0) {
return Status(eInvalidArgument);
}
int M = N / L;
src.Reshape({M, L, H, W, C});
return Transpose(src, {M, L, H, W, C}, {0, 4, 1, 2, 3});
};
Result<Tensor> Transpose(Tensor& src, const std::vector<int>& src_dims,
const std::vector<int>& permutation) {
Tensor dst(src.desc());
TensorShape shape(src.shape().size());
for (int i = 0; i < shape.size(); i++) {
shape[i] = src.shape(permutation[i]);
}
dst.Reshape(shape);
int ndim = src_dims.size();
std::vector<int> dst_dims(ndim);
for (int i = 0; i < ndim; i++) {
dst_dims[i] = src_dims[permutation[i]];
}
std::vector<int> src_strides(ndim);
std::vector<int> dst_strides(ndim);
std::vector<int> buffer(ndim);
buffer.back() = 1;
dst_strides.back() = 1;
for (int i = ndim - 1; i > 0; i--) {
buffer[i - 1] = buffer[i] * src_dims[i];
dst_strides[i - 1] = dst_strides[i] * dst_dims[i];
}
for (int i = 0; i < ndim; ++i) {
src_strides[i] = buffer[permutation[i]];
}
Buffer _src_strides(Device("cuda"), sizeof(int) * ndim);
Buffer _dst_strides(Device("cuda"), sizeof(int) * ndim);
OUTCOME_TRY(stream_.Copy(src_strides.data(), _src_strides));
OUTCOME_TRY(stream_.Copy(dst_strides.data(), _dst_strides));
::mmdeploy::cuda::Transpose(src.data<float>(), GetNative<int*>(_src_strides), dst.data<float>(),
GetNative<int*>(_dst_strides), ndim, src.size(),
(cudaStream_t)stream_.GetNative());
return dst;
}
};
class FormatShapeImplCreator : public Creator<::mmdeploy::FormatShapeImpl> {
public:
const char* GetName() const override { return "cuda"; }
int GetVersion() const override { return 1; }
ReturnType Create(const Value& args) override { return make_unique<FormatShapeImpl>(args); }
};
} // namespace cuda
} // namespace mmdeploy
using ::mmdeploy::FormatShapeImpl;
using ::mmdeploy::cuda::FormatShapeImplCreator;
REGISTER_MODULE(FormatShapeImpl, FormatShapeImplCreator);

View File

@ -0,0 +1,38 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include <stdint.h>
#include <stdio.h>
namespace mmdeploy {
namespace cuda {
template <typename T>
__global__ void transpose(const T* src, const int* src_strides, T* dst, const int* dst_strides,
int ndim, int total) {
int u = blockIdx.x * blockDim.x + threadIdx.x;
if (u >= total) return;
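// Map the linear destination index u to its source index v: peel off one coordinate per
// dimension with dst_strides and re-accumulate it with src_strides.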
int remaining = u;
int v = 0;
for (int i = 0; i < ndim; i++) {
int p = remaining / dst_strides[i];
remaining -= p * dst_strides[i];
v += p * src_strides[i];
}
dst[u] = src[v];
}
template <typename T>
void Transpose(const T* src, const int* src_strides, T* dst, const int* dst_strides, int ndim,
int total, cudaStream_t stream) {
int thread_num = 256;
int block_num = (total + thread_num - 1) / thread_num;
transpose<T>
<<<block_num, thread_num, 0, stream>>>(src, src_strides, dst, dst_strides, ndim, total);
}
template void Transpose<float>(const float* src, const int* src_strides, float* dst,
const int* dst_strides, int ndim, int total, cudaStream_t stream);
} // namespace cuda
} // namespace mmdeploy

View File

@ -0,0 +1,89 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmaction/format_shape.h"
#include "mmdeploy/archive/json_archive.h"
#include "mmdeploy/core/utils/device_utils.h"
using namespace std;
namespace mmdeploy {
FormatShapeImpl::FormatShapeImpl(const Value& args) : TransformImpl(args) {
arg_.input_format = args.value("input_format", std::string(""));
if (arg_.input_format != "NCHW" && arg_.input_format != "NCTHW") {
throw std::domain_error("'input_format' should be 'NCHW' or 'NCTHW'");
}
}
Result<Value> FormatShapeImpl::Process(const Value& input) {
MMDEPLOY_DEBUG("input: {}", to_json(input).dump(2));
if (!input.is_array()) {
MMDEPLOY_ERROR("input of format shape should be array");
return Status(eInvalidArgument);
}
if (!(input[0].contains("img") || input[0].contains("img"))) {
MMDEPLOY_ERROR("input should contains imgs or img");
return Status(eInvalidArgument);
}
int n_image = input.size();
int clip_len = input[0]["clip_len"].get<int>();
int num_clips = input[0]["num_clips"].get<int>();
std::vector<Tensor> images;
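// Multi-crop transforms (ThreeCrop/TenCrop) store several crops per frame under "imgs";
// gather them crop-major so that all frames belonging to one crop stay contiguous.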
if (input[0].contains("imgs")) {
int n_crop = input[0]["imgs"].size();
int total = n_image * n_crop;
images.reserve(total);
for (int i = 0; i < n_crop; i++) {
for (int j = 0; j < n_image; j++) {
images.push_back(input[j]["imgs"][i].get<Tensor>());
}
}
} else if (input[0].contains("img")) {
images.reserve(n_image);
for (int i = 0; i < n_image; i++) {
images.push_back(input[i]["img"].get<Tensor>());
}
}
Value output;
OUTCOME_TRY(auto img, Format(images, clip_len, num_clips));
SetTransformData(output, "img", std::move(img));
return output;
}
class FormatShape : public Transform {
public:
explicit FormatShape(const Value& args, int version = 0) : Transform(args) {
auto impl_creator = Registry<FormatShapeImpl>::Get().GetCreator(specified_platform_, version);
if (nullptr == impl_creator) {
MMDEPLOY_ERROR("'FormatShape' is not supported on '{}' platform", specified_platform_);
throw std::domain_error("'FormatShape' is not supported on specified platform");
}
impl_ = impl_creator->Create(args);
}
~FormatShape() override = default;
Result<Value> Process(const Value& input) override { return impl_->Process(input); }
protected:
std::unique_ptr<FormatShapeImpl> impl_;
};
class FormatShapeCreator : public Creator<Transform> {
public:
const char* GetName(void) const override { return "FormatShape"; }
int GetVersion(void) const override { return version_; }
ReturnType Create(const Value& args) override { return make_unique<FormatShape>(args, version_); }
private:
int version_{1};
};
REGISTER_MODULE(Transform, FormatShapeCreator);
MMDEPLOY_DEFINE_REGISTRY(FormatShapeImpl);
} // namespace mmdeploy

View File

@ -0,0 +1,37 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_CODEBASE_MMACTION_FORMAT_SHAPE_H_
#define MMDEPLOY_SRC_CODEBASE_MMACTION_FORMAT_SHAPE_H_
#include <array>
#include <vector>
#include "mmdeploy/core/tensor.h"
#include "mmdeploy/preprocess/transform/transform.h"
namespace mmdeploy {
class FormatShapeImpl : public TransformImpl {
public:
explicit FormatShapeImpl(const Value& args);
~FormatShapeImpl() override = default;
Result<Value> Process(const Value& input) override;
protected:
virtual Result<Tensor> Format(const std::vector<Tensor>& tensors, int clip_len,
int num_clips) = 0;
protected:
struct format_shape_arg_t {
std::string input_format;
};
using ArgType = struct format_shape_arg_t;
ArgType arg_;
};
MMDEPLOY_DECLARE_REGISTRY(FormatShapeImpl);
} // namespace mmdeploy
#endif

View File

@ -0,0 +1,13 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/codebase/mmaction/mmaction.h"
namespace mmdeploy {
namespace mmaction {
REGISTER_CODEBASE(MMAction);
}
MMDEPLOY_DEFINE_REGISTRY(mmaction::MMAction);
} // namespace mmdeploy

View File

@ -0,0 +1,28 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_CODEBASE_MMACTION_MMACTION_H_
#define MMDEPLOY_SRC_CODEBASE_MMACTION_MMACTION_H_
#include "mmdeploy/codebase/common.h"
#include "mmdeploy/core/device.h"
#include "mmdeploy/core/module.h"
#include "mmdeploy/core/serialization.h"
namespace mmdeploy {
namespace mmaction {
struct Label {
int label_id;
float score;
MMDEPLOY_ARCHIVE_MEMBERS(label_id, score);
};
using Labels = std::vector<Label>;
DECLARE_CODEBASE(MMAction, mmaction);
} // namespace mmaction
MMDEPLOY_DECLARE_REGISTRY(mmaction::MMAction);
} // namespace mmdeploy
#endif // MMDEPLOY_SRC_CODEBASE_MMACTION_MMACTION_H_

View File

@ -5,6 +5,9 @@ project(mmdeploy_cpu_transform_impl)
set(SRCS
collect_impl.cpp
crop_impl.cpp
ten_crop_impl.cpp
three_crop_impl.cpp
crop_utils.cpp
image2tensor_impl.cpp
default_format_bundle_impl.cpp
load_impl.cpp

View File

@ -0,0 +1,24 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/preprocess/transform/crop.h"
#include "mmdeploy/utils/opencv/opencv_utils.h"
using namespace std;
namespace mmdeploy {
namespace cpu {
Result<Tensor> CropImage(Stream& stream, const Device& device, const Tensor& tensor, int top,
int left, int bottom, int right) {
OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device, stream));
SyncOnScopeExit(stream, src_tensor.buffer() != tensor.buffer(), src_tensor);
cv::Mat mat = Tensor2CVMat(src_tensor);
cv::Mat cropped_mat = Crop(mat, top, left, bottom, right);
return CVMat2Tensor(cropped_mat);
}
} // namespace cpu
} // namespace mmdeploy

View File

@ -0,0 +1,47 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/preprocess/transform/ten_crop.h"
#include "opencv_utils.h"
using namespace std;
namespace mmdeploy {
namespace cpu {
Result<Tensor> CropImage(Stream& stream, const Device& device, const Tensor& tensor, int top,
int left, int bottom, int right);
class TenCropImpl : public ::mmdeploy::TenCropImpl {
public:
explicit TenCropImpl(const Value& args) : ::mmdeploy::TenCropImpl(args) {}
protected:
Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
int right) override {
return ::mmdeploy::cpu::CropImage(stream_, device_, tensor, top, left, bottom, right);
}
Result<Tensor> HorizontalFlip(const Tensor& tensor) {
OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
SyncOnScopeExit(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
cv::Mat mat = Tensor2CVMat(src_tensor);
cv::Mat flipped_mat;
cv::flip(mat, flipped_mat, 1);
return CVMat2Tensor(flipped_mat);
}
};
class TenCropImplCreator : public Creator<::mmdeploy::TenCropImpl> {
public:
const char* GetName() const override { return "cpu"; }
int GetVersion() const override { return 1; }
ReturnType Create(const Value& args) override { return make_unique<TenCropImpl>(args); }
};
} // namespace cpu
} // namespace mmdeploy
using ::mmdeploy::TenCropImpl;
using ::mmdeploy::cpu::TenCropImplCreator;
REGISTER_MODULE(TenCropImpl, TenCropImplCreator);

View File

@ -0,0 +1,38 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/preprocess/transform/three_crop.h"
#include "opencv_utils.h"
using namespace std;
namespace mmdeploy {
namespace cpu {
Result<Tensor> CropImage(Stream& stream, const Device& device, const Tensor& tensor, int top,
int left, int bottom, int right);
class ThreeCropImpl : public ::mmdeploy::ThreeCropImpl {
public:
explicit ThreeCropImpl(const Value& args) : ::mmdeploy::ThreeCropImpl(args) {}
protected:
Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
int right) override {
return ::mmdeploy::cpu::CropImage(stream_, device_, tensor, top, left, bottom, right);
}
};
class ThreeCropImplCreator : public Creator<::mmdeploy::ThreeCropImpl> {
public:
const char* GetName() const override { return "cpu"; }
int GetVersion() const override { return 1; }
ReturnType Create(const Value& args) override { return make_unique<ThreeCropImpl>(args); }
};
} // namespace cpu
} // namespace mmdeploy
using ::mmdeploy::ThreeCropImpl;
using ::mmdeploy::cpu::ThreeCropImplCreator;
REGISTER_MODULE(ThreeCropImpl, ThreeCropImplCreator);

View File

@ -7,6 +7,9 @@ find_package(pplcv REQUIRED)
set(SRCS
collect_impl.cpp
crop_impl.cpp
three_crop_impl.cpp
ten_crop_impl.cpp
crop_utils.cpp
image2tensor_impl.cpp
default_format_bundle_impl.cpp
load_impl.cpp

View File

@ -0,0 +1,66 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include <cuda_runtime.h>
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/preprocess/transform/crop.h"
using namespace std;
namespace mmdeploy {
namespace cuda {
template <typename T, int channels>
void Crop(const T* src, int src_w, T* dst, int dst_h, int dst_w, int offset_h, int offset_w,
cudaStream_t stream);
Result<Tensor> CropImage(Stream& _stream, const Device& device, const Tensor& tensor, int top,
int left, int bottom, int right) {
OUTCOME_TRY(auto device_tensor, MakeAvailableOnDevice(tensor, device, _stream));
SyncOnScopeExit sync(_stream, device_tensor.buffer() != tensor.buffer(), device_tensor);
auto stream = GetNative<cudaStream_t>(_stream);
auto desc = device_tensor.desc();
int h = bottom - top + 1;
int w = right - left + 1;
int c = desc.shape[3];
auto type = desc.data_type;
TensorShape shape{1, bottom - top + 1, right - left + 1, tensor.desc().shape[3]};
TensorDesc dst_desc{device, tensor.desc().data_type, shape, desc.name};
Tensor dst_tensor{dst_desc};
assert(device.is_device());
if (DataType::kINT8 == type) {
uint8_t* input = device_tensor.data<uint8_t>();
uint8_t* output = dst_tensor.data<uint8_t>();
if (3 == c) {
Crop<uint8_t, 3>(input, desc.shape[2], output, h, w, top, left, stream);
} else if (1 == c) {
Crop<uint8_t, 1>(input, desc.shape[2], output, h, w, top, left, stream);
} else {
MMDEPLOY_ERROR("unsupported channels {}", c);
return Status(eNotSupported);
}
} else if (DataType::kFLOAT == type) {
float* input = static_cast<float*>(device_tensor.buffer().GetNative());
float* output = static_cast<float*>(dst_tensor.buffer().GetNative());
if (3 == c) {
Crop<float, 3>(input, desc.shape[2], output, h, w, top, left, stream);
} else if (1 == c) {
Crop<float, 1>(input, desc.shape[2], output, h, w, top, left, stream);
} else {
MMDEPLOY_ERROR("unsupported channels {}", c);
return Status(eNotSupported);
}
} else {
MMDEPLOY_ERROR("unsupported channels {}", c);
return Status(eNotSupported);
}
return dst_tensor;
}
} // namespace cuda
} // namespace mmdeploy

View File

@ -0,0 +1,89 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include <cuda_runtime.h>
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/preprocess/transform/ten_crop.h"
#include "ppl/cv/cuda/flip.h"
using namespace std;
namespace mmdeploy {
namespace cuda {
Result<Tensor> CropImage(Stream& stream, const Device& device, const Tensor& tensor, int top,
int left, int bottom, int right);
class TenCropImpl : public ::mmdeploy::TenCropImpl {
public:
explicit TenCropImpl(const Value& args) : ::mmdeploy::TenCropImpl(args) {}
protected:
Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
int right) override {
return ::mmdeploy::cuda::CropImage(stream_, device_, tensor, top, left, bottom, right);
}
Result<Tensor> HorizontalFlip(const Tensor& tensor) {
OUTCOME_TRY(auto src_tensor, MakeAvailableOnDevice(tensor, device_, stream_));
SyncOnScopeExit sync(stream_, src_tensor.buffer() != tensor.buffer(), src_tensor);
TensorDesc dst_desc = tensor.desc();
dst_desc.device = device_;
Tensor dst_tensor(dst_desc);
auto stream = GetNative<cudaStream_t>(stream_);
int h = (int)tensor.shape(1);
int w = (int)tensor.shape(2);
int c = (int)tensor.shape(3);
ppl::common::RetCode ret;
if (tensor.data_type() == DataType::kINT8) {
auto input = tensor.data<uint8_t>();
auto output = dst_tensor.data<uint8_t>();
if (c == 1) {
ret = ppl::cv::cuda::Flip<uint8_t, 1>(stream, h, w, w * c, input, w * c, output, 1);
} else if (c == 3) {
ret = ppl::cv::cuda::Flip<uint8_t, 3>(stream, h, w, w * c, input, w * c, output, 1);
} else {
ret = ppl::common::RC_UNSUPPORTED;
}
} else if (tensor.data_type() == DataType::kFLOAT) {
auto input = tensor.data<float>();
auto output = dst_tensor.data<float>();
if (c == 1) {
ret = ppl::cv::cuda::Flip<float, 1>(stream, h, w, w * c, input, w * c, output, 1);
} else if (c == 3) {
ret = ppl::cv::cuda::Flip<float, 3>(stream, h, w, w * c, input, w * c, output, 1);
} else {
ret = ppl::common::RC_UNSUPPORTED;
}
} else {
MMDEPLOY_ERROR("unsupported data type {}", tensor.data_type());
return Status(eNotSupported);
}
if (ret != 0) {
return Status(eFail);
}
return dst_tensor;
}
};
class TenCropImplCreator : public Creator<::mmdeploy::TenCropImpl> {
public:
const char* GetName() const override { return "cuda"; }
int GetVersion() const override { return 1; }
ReturnType Create(const Value& args) override { return make_unique<TenCropImpl>(args); }
private:
int version_{1};
};
} // namespace cuda
} // namespace mmdeploy
using ::mmdeploy::TenCropImpl;
using ::mmdeploy::cuda::TenCropImplCreator;
REGISTER_MODULE(TenCropImpl, TenCropImplCreator);

View File

@ -0,0 +1,42 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include <cuda_runtime.h>
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/preprocess/transform/three_crop.h"
using namespace std;
namespace mmdeploy {
namespace cuda {
Result<Tensor> CropImage(Stream& stream, const Device& device, const Tensor& tensor, int top,
int left, int bottom, int right);
class ThreeCropImpl : public ::mmdeploy::ThreeCropImpl {
public:
explicit ThreeCropImpl(const Value& args) : ::mmdeploy::ThreeCropImpl(args) {}
protected:
Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
int right) override {
return ::mmdeploy::cuda::CropImage(stream_, device_, tensor, top, left, bottom, right);
}
};
class ThreeCropImplCreator : public Creator<::mmdeploy::ThreeCropImpl> {
public:
const char* GetName() const override { return "cuda"; }
int GetVersion() const override { return 1; }
ReturnType Create(const Value& args) override { return make_unique<ThreeCropImpl>(args); }
private:
int version_{1};
};
} // namespace cuda
} // namespace mmdeploy
using ::mmdeploy::ThreeCropImpl;
using ::mmdeploy::cuda::ThreeCropImplCreator;
REGISTER_MODULE(ThreeCropImpl, ThreeCropImplCreator);

View File

@ -6,6 +6,8 @@ set(SRCS
collect.cpp
compose.cpp
crop.cpp
three_crop.cpp
ten_crop.cpp
image2tensor.cpp
default_format_bundle.cpp
load.cpp
@ -13,7 +15,8 @@ set(SRCS
pad.cpp
resize.cpp
transform.cpp
tracer.cpp)
tracer.cpp
lift.cpp)
mmdeploy_add_module(${PROJECT_NAME} LIBRARY "${SRCS}")
target_include_directories(
${PROJECT_NAME} PUBLIC $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/preprocess>)

View File

@ -8,6 +8,21 @@
namespace mmdeploy {
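// A transform step may now return either a single sample (an object) or, via the Lift transform,
// an array of samples; collect the intermediate "__data__" tensors from both layouts so that
// Compose keeps them alive until the stream is synchronized.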
void SaveIntermediates(Value& value, Value::Array& intermediates) {
if (value.is_array()) {
for (auto& inner : value) {
if (auto it = inner.find("__data__"); it != inner.end()) {
std::move(it->begin(), it->end(), std::back_inserter(intermediates));
it->array().clear();
}
}
} else if (value.is_object()) {
if (auto it = value.find("__data__"); it != value.end()) {
std::move(it->begin(), it->end(), std::back_inserter(intermediates));
it->array().clear();
}
}
}
Compose::Compose(const Value& args, int version) : Transform(args) {
assert(args.contains("context"));
@ -44,10 +59,7 @@ Result<Value> Compose::Process(const Value& input) {
Value::Array intermediates;
for (auto& transform : transforms_) {
OUTCOME_TRY(auto t, transform->Process(output));
if (auto it = t.find("__data__"); it != t.end()) {
std::move(it->begin(), it->end(), std::back_inserter(intermediates));
it->array().clear();
}
SaveIntermediates(t, intermediates);
output = std::move(t);
}
OUTCOME_TRY(stream_.Wait());

View File

@ -0,0 +1,42 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/preprocess/transform/lift.h"
#include "mmdeploy/archive/json_archive.h"
#include "mmdeploy/archive/value_archive.h"
#include "mmdeploy/core/utils/formatter.h"
namespace mmdeploy {
Lift::Lift(const Value& args, int version) : Transform(args) {
std::string type = "Compose";
auto creator = Registry<Transform>::Get().GetCreator(type, version);
if (!creator) {
MMDEPLOY_ERROR("Unable to find Transform creator: {}. Available transforms: {}", type,
Registry<Transform>::Get().List());
throw_exception(eEntryNotFound);
}
compose_ = creator->Create(args);
}
Result<Value> Lift::Process(const Value& input) {
Value output;
for (int i = 0; i < input.size(); i++) {
Value single = input[i];
OUTCOME_TRY(auto t, compose_->Process(single));
output.push_back(std::move(t));
}
return std::move(output);
}
class LiftCreator : public Creator<Transform> {
public:
const char* GetName() const override { return "Lift"; }
int GetVersion() const override { return version_; }
ReturnType Create(const Value& args) override { return std::make_unique<Lift>(args, version_); }
private:
int version_{1};
};
REGISTER_MODULE(Transform, LiftCreator);
} // namespace mmdeploy

View File

@ -0,0 +1,23 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_PREPROCESS_TRANSFORM_LIFT_H_
#define MMDEPLOY_SRC_PREPROCESS_TRANSFORM_LIFT_H_
#include "mmdeploy/preprocess/transform/transform.h"
namespace mmdeploy {
class MMDEPLOY_API Lift : public Transform {
public:
explicit Lift(const Value& args, int version = 0);
~Lift() override = default;
Result<Value> Process(const Value& input) override;
private:
std::unique_ptr<Transform> compose_;
};
} // namespace mmdeploy
#endif  // MMDEPLOY_SRC_PREPROCESS_TRANSFORM_LIFT_H_

View File

@ -0,0 +1,90 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/preprocess/transform/ten_crop.h"
#include "mmdeploy/archive/json_archive.h"
using namespace std;
namespace mmdeploy {
TenCropImpl::TenCropImpl(const Value& args) : TransformImpl(args) {
// (w, h) of crop size
if (!args.contains(("crop_size"))) {
throw std::invalid_argument("'crop_size' is expected");
}
if (args["crop_size"].is_number_integer()) {
int crop_size = args["crop_size"].get<int>();
arg_.crop_size[0] = arg_.crop_size[1] = crop_size;
} else if (args["crop_size"].is_array() && args["crop_size"].size() == 2) {
arg_.crop_size[0] = args["crop_size"][0].get<int>();
arg_.crop_size[1] = args["crop_size"][1].get<int>();
} else {
throw std::invalid_argument("'crop_size' should be integer or an int array of size 2");
}
}
Result<Value> TenCropImpl::Process(const Value& input) {
MMDEPLOY_DEBUG("input: {}", to_json(input).dump(2));
// copy input data, and update its properties
Value output = input;
auto tensor = input["img"].get<Tensor>();
int img_h = tensor.shape(1);
int img_w = tensor.shape(2);
int crop_w = arg_.crop_size[0];
int crop_h = arg_.crop_size[1];
int w_step = (img_w - crop_w) / 4;
int h_step = (img_h - crop_h) / 4;
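// Five crop origins: the four corners plus the center; each crop together with its horizontal
// flip yields the ten crops.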
std::array<std::pair<int, int>, 5> offsets = {{{0, 0},
{4 * w_step, 0},
{0, 4 * h_step},
{4 * w_step, 4 * h_step},
{2 * w_step, 2 * h_step}}};
vector<Tensor> cropped;
cropped.reserve(10);
for (const auto& [offx, offy] : offsets) {
int y1 = offy;
int y2 = offy + crop_h - 1;
int x1 = offx;
int x2 = offx + crop_w - 1;
OUTCOME_TRY(auto cropped_tensor, CropImage(tensor, y1, x1, y2, x2));
OUTCOME_TRY(auto flipped_tensor, HorizontalFlip(cropped_tensor));
cropped.push_back(std::move(cropped_tensor));
cropped.push_back(std::move(flipped_tensor));
}
output["imgs"] = Value{};
for (int i = 0; i < cropped.size(); i++) {
output["imgs"].push_back(cropped[i]);
output["__data__"].push_back(std::move(cropped[i]));
}
return output;
}
TenCrop::TenCrop(const Value& args, int version) : Transform(args) {
auto impl_creator = Registry<TenCropImpl>::Get().GetCreator(specified_platform_, version);
if (nullptr == impl_creator) {
MMDEPLOY_ERROR("'TenCrop' is not supported on '{}' platform", specified_platform_);
throw std::domain_error("'Resize' is not supported on specified platform");
}
impl_ = impl_creator->Create(args);
}
class TenCropCreator : public Creator<Transform> {
public:
const char* GetName(void) const override { return "TenCrop"; }
int GetVersion(void) const override { return version_; }
ReturnType Create(const Value& args) override {
return std::make_unique<TenCrop>(args, version_);
}
private:
int version_{1};
};
REGISTER_MODULE(Transform, TenCropCreator);
MMDEPLOY_DEFINE_REGISTRY(TenCropImpl);
} // namespace mmdeploy

View File

@ -0,0 +1,49 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_TEN_CROP_H
#define MMDEPLOY_TEN_CROP_H
#include <array>
#include "mmdeploy/core/tensor.h"
#include "transform.h"
namespace mmdeploy {
class MMDEPLOY_API TenCropImpl : public TransformImpl {
public:
explicit TenCropImpl(const Value& args);
~TenCropImpl() override = default;
Result<Value> Process(const Value& input) override;
protected:
virtual Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
int right) = 0;
virtual Result<Tensor> HorizontalFlip(const Tensor& tensor) = 0;
protected:
struct ten_crop_arg_t {
std::array<int, 2> crop_size;
};
using ArgType = struct ten_crop_arg_t;
protected:
ArgType arg_;
};
class MMDEPLOY_API TenCrop : public Transform {
public:
explicit TenCrop(const Value& args, int version = 0);
~TenCrop() override = default;
Result<Value> Process(const Value& input) override { return impl_->Process(input); }
protected:
std::unique_ptr<TenCropImpl> impl_;
};
MMDEPLOY_DECLARE_REGISTRY(TenCropImpl);
} // namespace mmdeploy
#endif

View File

@ -0,0 +1,101 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/preprocess/transform/three_crop.h"
#include "mmdeploy/archive/json_archive.h"
using namespace std;
namespace mmdeploy {
Result<void> check_input_shape(int img_h, int img_w, int crop_h, int crop_w) {
if (img_h == crop_h || img_w == crop_w) {
return success();
}
MMDEPLOY_ERROR("ThreeCrop error, img_h: {} != crop_h: {} && img_w: {} != crop_w {}", img_h,
crop_h, img_w, crop_w);
return Status(eInvalidArgument);
}
ThreeCropImpl::ThreeCropImpl(const Value& args) : TransformImpl(args) {
// (w, h) of crop size
if (!args.contains(("crop_size"))) {
throw std::invalid_argument("'crop_size' is expected");
}
if (args["crop_size"].is_number_integer()) {
int crop_size = args["crop_size"].get<int>();
arg_.crop_size[0] = arg_.crop_size[1] = crop_size;
} else if (args["crop_size"].is_array() && args["crop_size"].size() == 2) {
arg_.crop_size[0] = args["crop_size"][0].get<int>();
arg_.crop_size[1] = args["crop_size"][1].get<int>();
} else {
throw std::invalid_argument("'crop_size' should be integer or an int array of size 2");
}
}
Result<Value> ThreeCropImpl::Process(const Value& input) {
MMDEPLOY_DEBUG("input: {}", to_json(input).dump(2));
// copy input data, and update its properties
Value output = input;
auto tensor = input["img"].get<Tensor>();
auto desc = tensor.desc();
int img_h = desc.shape[1];
int img_w = desc.shape[2];
int crop_w = arg_.crop_size[0];
int crop_h = arg_.crop_size[1];
OUTCOME_TRY(check_input_shape(img_h, img_w, crop_h, crop_w));
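// Take three crops along the dimension that exceeds the crop size: both ends and the center.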
std::array<std::pair<int, int>, 3> offsets;
if (crop_h == img_h) {
int w_step = (img_w - crop_w) / 2;
offsets = {{{0, 0}, {2 * w_step, 0}, {w_step, 0}}};
} else if (crop_w == img_w) {
int h_step = (img_h - crop_h) / 2;
offsets = {{{0, 0}, {0, 2 * h_step}, {0, h_step}}};
}
vector<Tensor> cropped;
cropped.reserve(3);
for (const auto& [offx, offy] : offsets) {
int y1 = offy;
int y2 = offy + crop_h - 1;
int x1 = offx;
int x2 = offx + crop_w - 1;
OUTCOME_TRY(auto dst_tensor, CropImage(tensor, y1, x1, y2, x2));
cropped.push_back(std::move(dst_tensor));
}
output["imgs"] = Value{};
for (int i = 0; i < cropped.size(); i++) {
output["imgs"].push_back(cropped[i]);
output["__data__"].push_back(std::move(cropped[i]));
}
return output;
}
ThreeCrop::ThreeCrop(const Value& args, int version) : Transform(args) {
auto impl_creator = Registry<ThreeCropImpl>::Get().GetCreator(specified_platform_, version);
if (nullptr == impl_creator) {
MMDEPLOY_ERROR("'ThreeCrop' is not supported on '{}' platform", specified_platform_);
throw std::domain_error("'Resize' is not supported on specified platform");
}
impl_ = impl_creator->Create(args);
}
class ThreeCropCreator : public Creator<Transform> {
public:
const char* GetName(void) const override { return "ThreeCrop"; }
int GetVersion(void) const override { return version_; }
ReturnType Create(const Value& args) override {
return std::make_unique<ThreeCrop>(args, version_);
}
private:
int version_{1};
};
REGISTER_MODULE(Transform, ThreeCropCreator);
MMDEPLOY_DEFINE_REGISTRY(ThreeCropImpl);
} // namespace mmdeploy

View File

@ -0,0 +1,48 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_THREE_CROP_H
#define MMDEPLOY_THREE_CROP_H
#include <array>
#include "mmdeploy/core/tensor.h"
#include "transform.h"
namespace mmdeploy {
class MMDEPLOY_API ThreeCropImpl : public TransformImpl {
public:
explicit ThreeCropImpl(const Value& args);
~ThreeCropImpl() override = default;
Result<Value> Process(const Value& input) override;
protected:
virtual Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
int right) = 0;
protected:
struct three_crop_arg_t {
std::array<int, 2> crop_size;
};
using ArgType = struct three_crop_arg_t;
protected:
ArgType arg_;
};
class MMDEPLOY_API ThreeCrop : public Transform {
public:
explicit ThreeCrop(const Value& args, int version = 0);
~ThreeCrop() override = default;
Result<Value> Process(const Value& input) override { return impl_->Process(input); }
protected:
std::unique_ptr<ThreeCropImpl> impl_;
};
MMDEPLOY_DECLARE_REGISTRY(ThreeCropImpl);
} // namespace mmdeploy
#endif

View File

@ -38,6 +38,7 @@ add_example(restorer c image_restorer)
add_example(text_detector c ocr)
add_example(pose_detector c pose_detection)
add_example(rotated_detector c rotated_object_detection)
add_example(video_recognizer c video_recognition)
# TODO: figure out a better way
#add_example("" c det_cls)
#add_example("" c det_pose)
@ -52,4 +53,5 @@ if (MMDEPLOY_BUILD_SDK_CXX_API)
add_example(pose_detector cpp pose_detector)
add_example(rotated_detector cpp rotated_detector)
add_example(pose_detector cpp pose_tracker)
add_example(video_recognizer cpp video_cls)
endif ()

View File

@ -0,0 +1,111 @@
#include <algorithm>
#include <fstream>
#include <map>
#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <opencv2/videoio.hpp>
#include <set>
#include <string>
#include <vector>
#include "mmdeploy/video_recognizer.h"
void SampleFrames(const char* video_path, std::map<int, cv::Mat>& buffer,
std::vector<mmdeploy_mat_t>& clips, int clip_len, int frame_interval = 1,
int num_clips = 1) {
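// Roughly mirrors mmaction2's test-mode SampleFrames: split the video into num_clips segments,
// take clip_len frames per segment spaced by frame_interval, and decode only the unique frames.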
cv::VideoCapture cap = cv::VideoCapture(video_path);
if (!cap.isOpened()) {
fprintf(stderr, "failed to load video: %s\n", video_path);
exit(1);
}
int num_frames = cap.get(cv::CAP_PROP_FRAME_COUNT);
printf("num_frames %d\n", num_frames);
int ori_clip_len = clip_len * frame_interval;
float avg_interval = (num_frames - ori_clip_len + 1.f) / num_clips;
std::vector<int> frame_inds;
for (int i = 0; i < num_clips; i++) {
int clip_offset = i * avg_interval + avg_interval / 2.0;
for (int j = 0; j < clip_len; j++) {
int ind = (j * frame_interval + clip_offset) % num_frames;
if (num_frames <= ori_clip_len - 1) {
ind = j % num_frames;
}
frame_inds.push_back(ind);
}
}
std::vector<int> unique_inds(frame_inds.begin(), frame_inds.end());
std::sort(unique_inds.begin(), unique_inds.end());
auto last = std::unique(unique_inds.begin(), unique_inds.end());
unique_inds.erase(last, unique_inds.end());
int ind = 0;
for (int i = 0; i < unique_inds.size(); i++) {
int tid = unique_inds[i];
cv::Mat frame;
while (ind < tid) {
cap.read(frame);
ind++;
}
cap.read(frame);
buffer[tid] = frame;
ind++;
}
clips.resize(frame_inds.size());
for (int i = 0; i < frame_inds.size(); i++) {
auto& img = buffer[frame_inds[i]];
mmdeploy_mat_t mat{
img.data, img.rows, img.cols, 3, MMDEPLOY_PIXEL_FORMAT_BGR, MMDEPLOY_DATA_TYPE_UINT8};
clips[i] = mat;
}
}
int main(int argc, char* argv[]) {
if (argc != 7) {
fprintf(stderr,
"usage:\n video_recognition device_name dump_model_directory video_path clip_len "
"frame_interval num_clips \n");
return 1;
}
auto device_name = argv[1];
auto model_path = argv[2];
auto video_path = argv[3];
int clip_len = std::stoi(argv[4]);
int frame_interval = std::stoi(argv[5]);
int num_clips = std::stoi(argv[6]);
std::map<int, cv::Mat> buffer;
std::vector<mmdeploy_mat_t> clips;
std::vector<mmdeploy_video_sample_info_t> clip_info;
SampleFrames(video_path, buffer, clips, clip_len, frame_interval, num_clips);
clip_info.push_back({clip_len, num_clips});
mmdeploy_video_recognizer_t recognizer{};
int status{};
status = mmdeploy_video_recognizer_create_by_path(model_path, device_name, 0, &recognizer);
if (status != MMDEPLOY_SUCCESS) {
fprintf(stderr, "failed to create recognizer, code: %d\n", (int)status);
return 1;
}
mmdeploy_video_recognition_t* res{};
int* res_count{};
status = mmdeploy_video_recognizer_apply(recognizer, clips.data(), clip_info.data(), 1, &res,
&res_count);
if (status != MMDEPLOY_SUCCESS) {
fprintf(stderr, "failed to apply classifier, code: %d\n", (int)status);
return 1;
}
for (int i = 0; i < res_count[0]; ++i) {
fprintf(stderr, "label: %d, score: %.4f\n", res[i].label_id, res[i].score);
}
mmdeploy_video_recognizer_release_result(res, res_count, 1);
mmdeploy_video_recognizer_destroy(recognizer);
return 0;
}
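// Example invocation (illustrative paths), matching the usage string printed above;
// clip_len=1, frame_interval=1, num_clips=25 corresponds to the TSN test-time sampling
// used elsewhere in this change:
//   ./video_recognition cpu ./mmdeploy_models/mmaction/tsn/ort ./arm_wrestling.mp4 1 1 25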

View File

@ -0,0 +1,90 @@
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>
#include <vector>
#include "mmdeploy/video_recognizer.hpp"
#include "opencv2/imgcodecs/imgcodecs.hpp"
#include "opencv2/videoio.hpp"
void SampleFrames(const char* video_path, std::map<int, cv::Mat>& buffer,
std::vector<mmdeploy::Mat>& clips, int clip_len, int frame_interval = 1,
int num_clips = 1) {
cv::VideoCapture cap = cv::VideoCapture(video_path);
if (!cap.isOpened()) {
fprintf(stderr, "failed to load video: %s\n", video_path);
exit(1);
}
int num_frames = cap.get(cv::CAP_PROP_FRAME_COUNT);
printf("num_frames %d\n", num_frames);
int ori_clip_len = clip_len * frame_interval;
float avg_interval = (num_frames - ori_clip_len + 1.f) / num_clips;
std::vector<int> frame_inds;
for (int i = 0; i < num_clips; i++) {
int clip_offset = i * avg_interval + avg_interval / 2.0;
for (int j = 0; j < clip_len; j++) {
int ind = (j * frame_interval + clip_offset) % num_frames;
if (num_frames <= ori_clip_len - 1) {
ind = j % num_frames;
}
frame_inds.push_back(ind);
}
}
std::vector<int> unique_inds(frame_inds.begin(), frame_inds.end());
std::sort(unique_inds.begin(), unique_inds.end());
auto last = std::unique(unique_inds.begin(), unique_inds.end());
unique_inds.erase(last, unique_inds.end());
int ind = 0;
for (int i = 0; i < unique_inds.size(); i++) {
int tid = unique_inds[i];
cv::Mat frame;
while (ind < tid) {
cap.read(frame);
ind++;
}
cap.read(frame);
buffer[tid] = frame;
ind++;
}
clips.resize(frame_inds.size());
for (int i = 0; i < frame_inds.size(); i++) {
auto& img = buffer[frame_inds[i]];
clips[i] = img;
}
}
int main(int argc, char* argv[]) {
if (argc != 7) {
fprintf(stderr,
"usage:\n video_cls device_name model_path video_path video_path clip_len "
"frame_interval num_clips\n");
return 1;
}
auto device_name = argv[1];
auto model_path = argv[2];
auto video_path = argv[3];
int clip_len = std::stoi(argv[4]);
int frame_interval = std::stoi(argv[5]);
int num_clips = std::stoi(argv[6]);
std::map<int, cv::Mat> buffer;
std::vector<mmdeploy::Mat> clips;
mmdeploy::VideoSampleInfo clip_info = {clip_len, num_clips};
SampleFrames(video_path, buffer, clips, clip_len, frame_interval, num_clips);
mmdeploy::Model model(model_path);
mmdeploy::VideoRecognizer recognizer(model, mmdeploy::Device{device_name, 0});
auto res = recognizer.Apply(clips, clip_info);
for (const auto& cls : res) {
fprintf(stderr, "label: %d, score: %.4f\n", cls.label_id, cls.score);
}
return 0;
}

View File

@ -0,0 +1,79 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import cv2
from mmdeploy_python import VideoRecognizer
def parse_args():
parser = argparse.ArgumentParser(
description='show how to use sdk python api')
parser.add_argument('device_name', help='name of device, cuda or cpu')
parser.add_argument(
'model_path',
help='path of mmdeploy SDK model dumped by model converter')
parser.add_argument('video_path', help='path of a video')
parser.add_argument(
'--clip_len',
type=int,
help='Frames of each sampled output clip',
default=1)
parser.add_argument(
'--frame_interval',
type=int,
help='Temporal interval of adjacent sampled frames.',
default=1)
parser.add_argument(
'--num_clips',
type=int,
help='Number of clips to be sampled',
default=25)
args = parser.parse_args()
return args
def SampleFrames(cap, clip_len, frame_interval, num_clips):
if not cap.isOpened():
print('failed to load video')
exit(-1)
num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
ori_clip_len = clip_len * frame_interval
avg_interval = (num_frames - ori_clip_len + 1) / float(num_clips)
frame_inds = []
for i in range(num_clips):
clip_offset = int(i * avg_interval + avg_interval / 2.0)
for j in range(clip_len):
ind = (j * frame_interval + clip_offset) % num_frames
if num_frames <= ori_clip_len - 1:
ind = j % num_frames
frame_inds.append(ind)
unique_inds = sorted(list(set(frame_inds)))
buffer = {}
ind = 0
for i, tid in enumerate(unique_inds):
while ind < tid:
_, mat = cap.read()
ind += 1
_, mat = cap.read()
buffer[tid] = mat
ind += 1
clips = []
for tid in frame_inds:
clips.append(buffer[tid])
info = (clip_len, num_clips)
return clips, info
def main():
args = parse_args()
cap = cv2.VideoCapture(args.video_path)
recognizer = VideoRecognizer(
model_path=args.model_path, device_name=args.device_name, device_id=0)
clips, info = SampleFrames(cap, args.clip_len, args.frame_interval,
args.num_clips)
result = recognizer(clips, info)
for label_id, score in result:
print(label_id, score)
if __name__ == '__main__':
main()
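# Example invocation (illustrative paths); --clip_len/--frame_interval/--num_clips default
# to 1/1/25, the TSN test-time sampling used in the docs of this change:
#   python video_recognition.py cpu ./mmdeploy_models/mmaction/tsn/ort ./arm_wrestling.mp4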

View File

@ -1812,6 +1812,76 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](../
</table>
</div>
<div style="margin-left: 25px;">
<table class="docutils">
<thead>
<tr>
<th align="center" colspan="4">mmaction2</th>
<th align="center">Pytorch</th>
<th align="center">ONNXRuntime</th>
<th align="center" colspan="2">TensorRT</th>
<th align="center">PPLNN</th>
<th align="center">OpenVINO</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">model</td>
<td align="center">task</td>
<td align="center">dataset</td>
<td align="center">metrics</td>
<td align="center">fp32</td>
<td align="center">fp32</td>
<td align="center">fp32</td>
<td align="center">fp16</td>
<td align="center">fp16</td>
<td align="center">fp32</td>
</tr>
<tr>
<td align="center" rowspan="2"><a href="https://github.com/open-mmlab/mmaction2/blob/dev-1.x/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py">TSN</a></td>
<td align="center" rowspan="2">Recognition</td>
<td align="center" rowspan="2">Kinetics-400</td>
<td align="center">top-1</td>
<td align="center">69.71</td>
<td align="center">-</td>
<td align="center">69.71</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
</tr>
<tr>
<td align="center">top-5</td>
<td align="center">88.75</td>
<td align="center">-</td>
<td align="center">88.75</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
</tr>
<tr>
<td align="center" rowspan="2"><a href="https://github.com/open-mmlab/mmaction2/blob/dev-1.x/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py">SlowFast</a></td>
<td align="center" rowspan="2">Recognition</td>
<td align="center" rowspan="2">Kinetics-400</td>
<td align="center">top-1</td>
<td align="center">74.45</td>
<td align="center">-</td>
<td align="center">75.62</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
</tr>
<tr>
<td align="center">top-5</td>
<td align="center">91.55</td>
<td align="center">-</td>
<td align="center">92.10</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
</tr>
</tbody>
</table>
</div>
## Notes
- Some codebases, such as MMDet, contain datasets with images of various resolutions. For these, the speed benchmark is obtained with static configs in MMDeploy, while the performance benchmark is obtained with dynamic ones.

View File

@ -0,0 +1,190 @@
# MMAction2 Deployment
- [MMAction2 Deployment](#mmaction2-deployment)
- [Installation](#installation)
- [Install mmaction2](#install-mmaction2)
- [Install mmdeploy](#install-mmdeploy)
- [Convert model](#convert-model)
- [Convert video recognition model](#convert-video-recognition-model)
- [Model specification](#model-specification)
- [Model Inference](#model-inference)
- [Backend model inference](#backend-model-inference)
- [SDK model inference](#sdk-model-inference)
- [Video recognition SDK model inference](#video-recognition-sdk-model-inference)
- [Supported models](#supported-models)
______________________________________________________________________
[MMAction2](https://github.com/open-mmlab/mmaction2) is an open-source toolbox for video understanding based on PyTorch. It is a part of the [OpenMMLab](https://openmmlab.com) project.
## Installation
### Install mmaction2
Please follow the [installation guide](https://github.com/open-mmlab/mmaction2/tree/dev-1.x#installation) to install mmaction2.
### Install mmdeploy
There are several methods to install mmdeploy, among which you can choose an appropriate one according to your target platform and device.
**Method I:** Install precompiled package
You can download the latest release package from [here](https://github.com/open-mmlab/mmdeploy/releases)
**Method II:** Build using scripts
If your target platform is **Ubuntu 18.04 or a later version**, we encourage you to run the
[scripts](../01-how-to-build/build_from_script.md). For example, the following commands install mmdeploy together with the `ONNX Runtime` inference engine.
```shell
git clone --recursive -b dev-1.x https://github.com/open-mmlab/mmdeploy.git
cd mmdeploy
python3 tools/scripts/build_ubuntu_x64_ort.py $(nproc)
export PYTHONPATH=$(pwd)/build/lib:$PYTHONPATH
export LD_LIBRARY_PATH=$(pwd)/../mmdeploy-dep/onnxruntime-linux-x64-1.8.1/lib/:$LD_LIBRARY_PATH
```
**Method III:** Build from source
If neither **I** nor **II** meets your requirements, [building mmdeploy from source](../01-how-to-build/build_from_source.md) is the last option.
## Convert model
You can use [tools/deploy.py](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/tools/deploy.py) to convert mmaction2 models to the specified backend models. Its detailed usage can be learned from [here](https://github.com/open-mmlab/mmdeploy/blob/master/docs/en/02-how-to-run/convert_model.md#usage).
When using `tools/deploy.py`, it is crucial to specify the correct deployment config. We've already provided builtin deployment config [files](https://github.com/open-mmlab/mmdeploy/tree/dev-1.x/configs/mmaction) of all supported backends for mmaction2, under which the config file path follows the pattern:
```
{task}/{task}_{backend}-{precision}_{static | dynamic}_{shape}.py
```
where (a concrete example follows this list):
- **{task}:** task in mmaction2.
- **{backend}:** inference backend, such as onnxruntime, tensorrt, pplnn, ncnn, openvino, coreml etc.
- **{precision}:** fp16, int8. When it's empty, it means fp32
- **{static | dynamic}:** static shape or dynamic shape
- **{shape}:** input shape or shape range of a model
- **{2d/3d}:** model type
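For example, the config used later in this guide combines these fields: the `video-recognition` task, a `2d` model, the `onnxruntime` backend, fp32 precision (empty field) and a static shape:
```
video-recognition/video-recognition_2d_onnxruntime_static.py
```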
In the next part, we will take the `tsn` model from the `video recognition` task as an example, showing how to convert it to an ONNX model that can be inferred by ONNX Runtime.
### Convert video recognition model
```shell
cd mmdeploy
# download tsn model from mmaction2 model zoo
mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb --dest .
# convert mmaction2 model to onnxruntime model with static shape
python tools/deploy.py \
configs/mmaction/video-recognition/video-recognition_2d_onnxruntime_static.py \
tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb \
tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth \
tests/data/arm_wrestling.mp4 \
--work-dir mmdeploy_models/mmaction/tsn/ort \
--device cpu \
--show \
--dump-info
```
## Model specification
Before moving on to the model inference chapter, let's take a closer look at the structure of the converted model, which is essential for model inference.
The converted model is located in the working directory, which is `mmdeploy_models/mmaction/tsn/ort` in the previous example. It includes:
```
mmdeploy_models/mmaction/tsn/ort
├── deploy.json
├── detail.json
├── end2end.onnx
└── pipeline.json
```
in which,
- **end2end.onnx**: backend model which can be inferred by ONNX Runtime
- \***.json**: the necessary information for mmdeploy SDK
The whole package **mmdeploy_models/mmaction/tsn/ort** is defined as the **mmdeploy SDK model**, i.e., the **mmdeploy SDK model** includes both the backend model and the inference meta information.
## Model Inference
### Backend model inference
Taking the previously converted `end2end.onnx` model of `tsn` as an example, you can use the following code to run inference and get the prediction results.
```python
from mmdeploy.apis.utils import build_task_processor
from mmdeploy.utils import get_input_shape, load_config
import numpy as np
import torch
deploy_cfg = 'configs/mmaction/video-recognition/video-recognition_2d_onnxruntime_static.py'
model_cfg = 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb'
device = 'cpu'
backend_model = ['./mmdeploy_models/mmaction/tsn/ort/end2end.onnx']
image = 'tests/data/arm_wrestling.mp4'
# read deploy_cfg and model_cfg
deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg)
# build task and backend model
task_processor = build_task_processor(model_cfg, deploy_cfg, device)
model = task_processor.build_backend_model(backend_model)
# process input image
input_shape = get_input_shape(deploy_cfg)
model_inputs, _ = task_processor.create_input(image, input_shape)
# do model inference
with torch.no_grad():
result = model.test_step(model_inputs)
# show top-5 results
pred_scores = result[0].pred_scores.item.tolist()
top_index = np.argsort(pred_scores)[::-1]
for i in range(5):
index = top_index[i]
print(index, pred_scores[index])
```
### SDK model inference
Given the above SDK model of `tsn`, you can also perform SDK model inference as follows.
#### Video recognition SDK model inference
```python
from mmdeploy_python import VideoRecognizer
import cv2
# refer to demo/python/video_recognition.py
# def SampleFrames(cap, clip_len, frame_interval, num_clips):
# ...
cap = cv2.VideoCapture('tests/data/arm_wrestling.mp4')
clips, info = SampleFrames(cap, 1, 1, 25)
# create a recognizer
recognizer = VideoRecognizer(model_path='./mmdeploy_models/mmaction/tsn/ort', device_name='cpu', device_id=0)
# perform inference
result = recognizer(clips, info)
# show inference result
for label_id, score in result:
print(label_id, score)
```
Besides the Python API, mmdeploy SDK also provides FFIs (Foreign Function Interfaces) for other languages such as C, C++, C#, Java and so on. You can learn their usage from the [demos](https://github.com/open-mmlab/mmdeploy/tree/dev-1.x/demo).
> MMAction2 only supports the C, C++ and Python APIs for now.
## Supported models
| Model | TorchScript | ONNX Runtime | TensorRT | ncnn | PPLNN | OpenVINO |
| :-------------------------------------------------------------------------------------------- | :---------: | :----------: | :------: | :--: | :---: | :------: |
| [TSN](https://github.com/open-mmlab/mmaction2/tree/dev-1.x/configs/recognition/tsn) | N | Y | Y | N | N | N |
| [SlowFast](https://github.com/open-mmlab/mmaction2/tree/dev-1.x/configs/recognition/slowfast) | N | Y | Y | N | N | N |

View File

@ -1807,6 +1807,76 @@ GPU: ncnn, TensorRT, PPLNN
</table>
</div>
<div style="margin-left: 25px;">
<table class="docutils">
<thead>
<tr>
<th align="center" colspan="4">mmaction2</th>
<th align="center">Pytorch</th>
<th align="center">ONNXRuntime</th>
<th align="center" colspan="2">TensorRT</th>
<th align="center">PPLNN</th>
<th align="center">OpenVINO</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">model</td>
<td align="center">task</td>
<td align="center">dataset</td>
<td align="center">metrics</td>
<td align="center">fp32</td>
<td align="center">fp32</td>
<td align="center">fp32</td>
<td align="center">fp16</td>
<td align="center">fp16</td>
<td align="center">fp32</td>
</tr>
<tr>
<td align="center" rowspan="2"><a href="https://github.com/open-mmlab/mmaction2/blob/dev-1.x/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py">TSN</a></td>
<td align="center" rowspan="2">Recognition</td>
<td align="center" rowspan="2">Kinetics-400</td>
<td align="center">top-1</td>
<td align="center">69.71</td>
<td align="center">-</td>
<td align="center">69.71</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
</tr>
<tr>
<td align="center">top-5</td>
<td align="center">88.75</td>
<td align="center">-</td>
<td align="center">88.75</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
</tr>
<tr>
<td align="center" rowspan="2"><a href="https://github.com/open-mmlab/mmaction2/blob/dev-1.x/configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_kinetics400-rgb.py">SlowFast</a></td>
<td align="center" rowspan="2">Recognition</td>
<td align="center" rowspan="2">Kinetics-400</td>
<td align="center">top-1</td>
<td align="center">74.45</td>
<td align="center">-</td>
<td align="center">75.62</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
</tr>
<tr>
<td align="center">top-5</td>
<td align="center">91.55</td>
<td align="center">-</td>
<td align="center">92.10</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
</tr>
</tbody>
</table>
</div>
## Notes
- Some codebases, such as MMDet, contain datasets with images of various resolutions. For these, the speed benchmark is obtained with static configs in MMDeploy, while the performance benchmark is obtained with dynamic ones.

View File

@ -0,0 +1,193 @@
# MMAction2 Deployment
- [MMAction2 Deployment](#mmaction2-deployment)
- [Installation](#installation)
- [Install mmaction2](#install-mmaction2)
- [Install mmdeploy](#install-mmdeploy)
- [Convert model](#convert-model)
- [Convert video recognition model](#convert-video-recognition-model)
- [Model specification](#model-specification)
- [Model inference](#model-inference)
- [Backend model inference](#backend-model-inference)
- [SDK model inference](#sdk-model-inference)
- [Video recognition SDK model inference](#video-recognition-sdk-model-inference)
- [Supported models](#supported-models)
______________________________________________________________________
[MMAction2](https://github.com/open-mmlab/mmaction2) is an open-source toolbox for video understanding based on PyTorch. It is a part of the [OpenMMLab](https://openmmlab.com) project.
## Installation
### Install mmaction2
Please refer to the [official installation guide](https://github.com/open-mmlab/mmaction2/tree/dev-1.x#installation).
### Install mmdeploy
mmdeploy can be installed in several ways:
**Method I:** Install the precompiled package
You can get the latest precompiled package from [this link](https://github.com/open-mmlab/mmdeploy/releases).
**Method II:** Build using scripts
If your deployment platform is **Ubuntu 18.04 or a later version**, please follow the [script installation instructions](../01-how-to-build/build_from_script.md) to complete the installation.
For example, the following commands install mmdeploy together with the `ONNX Runtime` inference engine.
```shell
git clone --recursive -b dev-1.x https://github.com/open-mmlab/mmdeploy.git
cd mmdeploy
python3 tools/scripts/build_ubuntu_x64_ort.py $(nproc)
export PYTHONPATH=$(pwd)/build/lib:$PYTHONPATH
export LD_LIBRARY_PATH=$(pwd)/../mmdeploy-dep/onnxruntime-linux-x64-1.8.1/lib/:$LD_LIBRARY_PATH
```
**Method III:** Build from source
If neither Method I nor Method II meets your requirements, please follow the [build-from-source instructions](../01-how-to-build/build_from_source.md) to install mmdeploy and the required inference engines.
## Convert model
You can use [tools/deploy.py](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/tools/deploy.py) to convert mmaction2 models to backend models in one step.
Its detailed usage can be found [here](https://github.com/open-mmlab/mmdeploy/blob/master/docs/en/02-how-to-run/convert_model.md#usage).
One key to the conversion is using the correct config file. The project already provides builtin deployment [config files](https://github.com/open-mmlab/mmdeploy/tree/dev-1.x/configs/mmaction) for each backend.
They are named following the pattern:
```
{task}/{task}_{backend}-{precision}_{static | dynamic}_{shape}.py
```
where:
- **{task}:** task in mmaction2
- **{backend}:** name of the inference backend, e.g. onnxruntime, tensorrt, pplnn, ncnn, openvino, coreml, etc.
- **{precision}:** inference precision, e.g. fp16, int8; empty means fp32
- **{static | dynamic}:** static or dynamic shape
- **{shape}:** input shape or shape range of the model
- **{2d/3d}:** model type
In the following, we will demonstrate how to convert the `tsn` model from the video recognition task into an ONNX model.
### Convert video recognition model
```shell
cd mmdeploy
# download tsn model from mmaction2 model zoo
mim download mmaction2 --config tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb --dest .
# convert mmaction2 model to onnxruntime model with static shape
python tools/deploy.py \
configs/mmaction/video-recognition/video-recognition_2d_onnxruntime_static.py \
tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb \
tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth \
tests/data/arm_wrestling.mp4 \
--work-dir mmdeploy_models/mmaction/tsn/ort \
--device cpu \
--show \
--dump-info
```
## Model specification
Before running inference with the converted model, it is necessary to understand the structure of the conversion result. It is stored in the directory specified by `--work-dir`.
In the previous example this is `mmdeploy_models/mmaction/tsn/ort`, structured as follows:
```
mmdeploy_models/mmaction/tsn/ort
├── deploy.json
├── detail.json
├── end2end.onnx
└── pipeline.json
```
The important parts are:
- **end2end.onnx**: the inference engine file, which can be inferred by ONNX Runtime
- \***.json**: the meta information required by mmdeploy SDK inference
The whole folder is defined as the **mmdeploy SDK model**. In other words, the **mmdeploy SDK model** includes both the inference engine and the inference meta information.
## Model inference
### Backend model inference
Taking the converted `end2end.onnx` above as an example, you can run inference with the following code:
```python
from mmdeploy.apis.utils import build_task_processor
from mmdeploy.utils import get_input_shape, load_config
import numpy as np
import torch
deploy_cfg = 'configs/mmaction/video-recognition/video-recognition_2d_onnxruntime_static.py'
model_cfg = 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb'
device = 'cpu'
backend_model = ['./mmdeploy_models/mmaction/tsn/ort/end2end.onnx']
image = 'tests/data/arm_wrestling.mp4'
# read deploy_cfg and model_cfg
deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg)
# build task and backend model
task_processor = build_task_processor(model_cfg, deploy_cfg, device)
model = task_processor.build_backend_model(backend_model)
# process input image
input_shape = get_input_shape(deploy_cfg)
model_inputs, _ = task_processor.create_input(image, input_shape)
# do model inference
with torch.no_grad():
result = model.test_step(model_inputs)
# show top-5 results
pred_scores = result[0].pred_scores.item.tolist()
top_index = np.argsort(pred_scores)[::-1]
for i in range(5):
index = top_index[i]
print(index, pred_scores[index])
```
### SDK model inference
You can also refer to the following code to run inference with the SDK model:
#### Video recognition SDK model inference
```python
from mmdeploy_python import VideoRecognizer
import cv2
# refer to demo/python/video_recognition.py
# def SampleFrames(cap, clip_len, frame_interval, num_clips):
# ...
cap = cv2.VideoCapture('tests/data/arm_wrestling.mp4')
clips, info = SampleFrames(cap, 1, 1, 25)
# create a recognizer
recognizer = VideoRecognizer(model_path='./mmdeploy_models/mmaction/tsn/ort', device_name='cpu', device_id=0)
# perform inference
result = recognizer(clips, info)
# show inference result
for label_id, score in result:
print(label_id, score)
```
Besides the Python API, mmdeploy SDK also provides multi-language interfaces such as C, C++, C# and Java.
You can refer to the [demos](https://github.com/open-mmlab/mmdeploy/tree/dev-1.x/demo) to learn how to use the interfaces of the other languages.
> The C# and Java interfaces for mmaction2 are still to be developed.
## Supported models
| Model | TorchScript | ONNX Runtime | TensorRT | ncnn | PPLNN | OpenVINO |
| :-------------------------------------------------------------------------------------------- | :---------: | :----------: | :------: | :--: | :---: | :------: |
| [TSN](https://github.com/open-mmlab/mmaction2/tree/dev-1.x/configs/recognition/tsn) | N | Y | Y | N | N | N |
| [SlowFast](https://github.com/open-mmlab/mmaction2/tree/dev-1.x/configs/recognition/slowfast) | N | Y | Y | N | N | N |

View File

@ -164,8 +164,9 @@ def get_preprocess(deploy_cfg: mmengine.Config, model_cfg: mmengine.Config,
for transform in transforms:
if transform['type'] == 'Normalize':
transform['to_float'] = False
assert transforms[0]['type'] == 'LoadImageFromFile', 'The first item'\
' type of pipeline should be LoadImageFromFile'
if transforms[0]['type'] != 'Lift':
assert transforms[0]['type'] == 'LoadImageFromFile', \
'The first item type of pipeline should be LoadImageFromFile'
return dict(
type='Task',
module='Transform',
@ -244,7 +245,8 @@ def get_pipeline(deploy_cfg: mmengine.Config, model_cfg: mmengine.Config,
task = get_task_type(deploy_cfg)
input_names = preprocess['input']
output_names = postprocess['output']
if task == Task.CLASSIFICATION or task == Task.SUPER_RESOLUTION:
if task == Task.CLASSIFICATION or task == Task.SUPER_RESOLUTION \
or task == Task.VIDEO_RECOGNITION:
postprocess['input'] = infer_info['output']
else:
postprocess['input'] = preprocess['output'] + infer_info['output']

View File

@ -111,11 +111,16 @@ class VideoRecognition(BaseTask):
nn.Module: An initialized backend model.
"""
from .video_recognition_model import build_video_recognition_model
data_preprocessor = self.model_cfg.model.data_preprocessor
data_preprocessor.setdefault('type', 'mmaction.ActionDataPreprocessor')
model = build_video_recognition_model(
model_files, self.model_cfg, self.deploy_cfg, device=self.device)
model_files,
self.model_cfg,
self.deploy_cfg,
device=self.device,
data_preprocessor=data_preprocessor)
model.to(self.device)
model.eval()
return model
return model.eval()
def create_input(self,
imgs: Union[str, np.ndarray],
@ -242,7 +247,7 @@ class VideoRecognition(BaseTask):
"""
return input_data['inputs']
def get_preprocess(self) -> Dict:
def get_preprocess(self, *args, **kwargs) -> Dict:
"""Get the preprocess information for SDK.
Return:
@ -250,19 +255,70 @@ class VideoRecognition(BaseTask):
"""
input_shape = get_input_shape(self.deploy_cfg)
model_cfg = process_model_config(self.model_cfg, [''], input_shape)
preprocess = model_cfg.test_pipeline
return preprocess
pipeline = model_cfg.test_pipeline
data_preprocessor = self.model_cfg.model.data_preprocessor
def get_postprocess(self) -> Dict:
lift = dict(type='Lift', transforms=[])
lift['transforms'].append(dict(type='LoadImageFromFile'))
transforms2index = {}
for i, trans in enumerate(pipeline):
transforms2index[trans['type']] = i
lift_key = [
'Resize', 'Normalize', 'TenCrop', 'ThreeCrop', 'CenterCrop'
]
for key in lift_key:
if key == 'Normalize':
assert key not in transforms2index
mean = data_preprocessor.get('mean', [0, 0, 0])
std = data_preprocessor.get('std', [1, 1, 1])
trans = dict(type='Normalize', mean=mean, std=std, to_rgb=True)
lift['transforms'].append(trans)
if key in transforms2index:
index = transforms2index[key]
if key == 'Resize' and 'scale' in pipeline[index]:
value = pipeline[index].pop('scale')
if len(value) == 2 and value[0] == -1:
value = value[::-1]
pipeline[index]['size'] = value
lift['transforms'].append(pipeline[index])
meta_keys = [
'valid_ratio', 'flip', 'img_norm_cfg', 'filename', 'ori_shape',
'pad_shape', 'img_shape', 'flip_direction', 'scale_factor',
'ori_filename'
]
other = []
must_key = ['FormatShape', 'PackActionInputs']
for key in must_key:
assert key in transforms2index
index = transforms2index[key]
if key == 'PackActionInputs':
if 'meta_keys' in pipeline[index]:
meta_keys += pipeline[index]['meta_keys']
pipeline[index]['meta_keys'] = list(set(meta_keys))
pipeline[index]['keys'] = ['img']
pipeline[index]['type'] = 'Collect'
other.append(pipeline[index])
reorder = [lift, *other]
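# The reordered preprocess list therefore looks roughly like the following
# (illustrative, for a TSN-style test pipeline with Resize and CenterCrop):
#   [dict(type='Lift', transforms=[LoadImageFromFile, Resize, Normalize, CenterCrop]),
#    dict(type='FormatShape', ...),
#    dict(type='Collect', keys=['img'], meta_keys=[...])]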
return reorder
def get_postprocess(self, *args, **kwargs) -> Dict:
"""Get the postprocess information for SDK.
Return:
dict: Composed of the postprocess information.
"""
postprocess = self.model_cfg.model.cls_head
assert 'cls_head' in self.model_cfg.model
assert 'num_classes' in self.model_cfg.model.cls_head
logger = get_root_logger()
logger.warning('use default top-k value 1')
num_classes = self.model_cfg.model.cls_head.num_classes
params = dict(topk=1, num_classes=num_classes)
postprocess = dict(type='BaseHead', params=params)
return postprocess
def get_model_name(self) -> str:
def get_model_name(self, *args, **kwargs) -> str:
"""Get the model name.
Return:

View File

@ -9,6 +9,7 @@ from mmengine import Config
from mmengine.model import BaseDataPreprocessor
from mmengine.registry import Registry
from mmengine.structures import BaseDataElement, LabelData
from torch import nn
from mmdeploy.codebase.base import BaseBackendModel
from mmdeploy.utils import (Backend, get_backend, get_codebase_config,
@ -37,19 +38,11 @@ class End2EndModel(BaseBackendModel):
backend_files: Sequence[str],
device: str,
deploy_cfg: Union[str, Config] = None,
model_cfg: Union[str, Config] = None,
data_preprocessor: Optional[Union[dict, nn.Module]] = None,
**kwargs):
super(End2EndModel, self).__init__(deploy_cfg=deploy_cfg)
model_cfg, deploy_cfg = load_config(model_cfg, deploy_cfg)
from mmaction.registry import MODELS
preprocessor_cfg = model_cfg.model.get('data_preprocessor', None)
if preprocessor_cfg is not None:
self.data_preprocessor = MODELS.build(
model_cfg.model.data_preprocessor)
else:
self.data_preprocessor = BaseDataPreprocessor()
super(End2EndModel, self).__init__(
deploy_cfg=deploy_cfg, data_preprocessor=data_preprocessor)
self.deploy_cfg = deploy_cfg
self.model_cfg = model_cfg
self._init_wrapper(
backend=backend,
backend_files=backend_files,
@ -114,10 +107,14 @@ class End2EndModel(BaseBackendModel):
return data_samples
def build_video_recognition_model(model_files: Sequence[str],
model_cfg: Union[str, mmengine.Config],
deploy_cfg: Union[str, mmengine.Config],
device: str, **kwargs):
def build_video_recognition_model(
model_files: Sequence[str],
model_cfg: Union[str, mmengine.Config],
deploy_cfg: Union[str, mmengine.Config],
device: str,
data_preprocessor: Optional[Union[Config,
BaseDataPreprocessor]] = None,
**kwargs):
"""Build video recognition model for different backends.
Args:
@ -127,6 +124,8 @@ def build_video_recognition_model(model_files: Sequence[str],
deploy_cfg (str | mmengine.Config): Input deployment config file or
Config object.
device (str): Device to input model.
data_preprocessor (BaseDataPreprocessor | Config): The data
preprocessor of the model.
Returns:
BaseBackendModel: Video recognizer for a configured backend.
@ -144,7 +143,7 @@ def build_video_recognition_model(model_files: Sequence[str],
backend_files=model_files,
device=device,
deploy_cfg=deploy_cfg,
model_cfg=model_cfg,
data_preprocessor=data_preprocessor,
**kwargs))
return backend_video_recognizer

View File

@ -85,7 +85,9 @@ SDK_TASK_MAP = {
Task.POSE_DETECTION:
dict(component='Detector', cls_name='PoseDetector'),
Task.ROTATED_DETECTION:
dict(component='ResizeRBBox', cls_name='RotatedDetector')
dict(component='ResizeRBBox', cls_name='RotatedDetector'),
Task.VIDEO_RECOGNITION:
dict(component='BaseHead', cls_name='VideoRecognizer')
}
TENSORRT_MAX_TOPK = 3840

Binary file not shown.

View File

@ -771,7 +771,7 @@ def get_backend_result(pipeline_info: dict, model_cfg_path: Path,
if sdk_config is not None:
if codebase_name == 'mmcls':
if codebase_name == 'mmcls' or codebase_name == 'mmaction':
replace_top_in_pipeline_json(backend_output_path, logger)
log_path = gen_log_path(