Merge pull request #1276 from lvhan028/dev-1.x

sync master to dev-1.x by cherry-pick
pull/1295/head
lvhan028 2022-11-01 23:23:19 -07:00 committed by GitHub
commit 034ba67556
63 changed files with 2314 additions and 283 deletions

View File

@ -6,11 +6,7 @@ cd mmdeploy
MMDEPLOY_DIR=$(pwd)
mkdir -p build && cd build
cmake .. -DMMDEPLOY_BUILD_SDK=ON -DMMDEPLOY_BUILD_TEST=ON -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \
-DMMDEPLOY_BUILD_SDK_CXX_API=ON -DMMDEPLOY_BUILD_SDK_CSHARP_API=ON \
-DMMDEPLOY_BUILD_EXAMPLES=ON -DMMDEPLOY_BUILD_SDK_CXX_API=ON -DMMDEPLOY_BUILD_SDK_CSHARP_API=ON \
-DMMDEPLOY_TARGET_DEVICES="$1" -DMMDEPLOY_TARGET_BACKENDS="$2" "${ARGS[@]:2}"
make -j$(nproc) && make install
cd install/example
mkdir -p build
cd build
cmake ../cpp -DMMDeploy_DIR="$MMDEPLOY_DIR"/build/install/lib/cmake/MMDeploy "${ARGS[@]:2}" && make -j$(nproc)

View File

@ -1,3 +1,3 @@
Invoke-WebRequest -Uri https://download.openmmlab.com/mmdeploy/library/opencv-4.5.5.zip -OutFile opencv.zip
Invoke-WebRequest -Uri https://github.com/irexyc/mmdeploy-ci-resource/releases/download/opencv/opencv-win-amd64-4.5.5-vc16.zip -OutFile opencv.zip
Expand-Archive opencv.zip .
Move-Item opencv-4.5.5 opencv

View File

@ -192,19 +192,13 @@ jobs:
-DMMDEPLOY_BUILD_SDK=ON `
-DMMDEPLOY_BUILD_TEST=ON `
-DMMDEPLOY_BUILD_SDK_PYTHON_API=ON `
-DMMDEPLOY_BUILD_EXAMPLES=ON `
-DMMDEPLOY_BUILD_SDK_CXX_API=ON `
-DMMDEPLOY_BUILD_SDK_CSHARP_API=ON `
-DMMDEPLOY_TARGET_BACKENDS="ort" `
-DOpenCV_DIR="$env:OPENCV_PACKAGE_DIR"
cmake --build . --config Release -- /m
cmake --install . --config Release
cd install/example
mkdir build -ErrorAction SilentlyContinue
cd build
cmake ../cpp -G "Visual Studio 16 2019" -A x64 -T v142 `
-DMMDeploy_DIR="$env:MMDEPLOY_DIR/build/install/lib/cmake/MMDeploy" `
-DOpenCV_DIR="$env:OPENCV_PACKAGE_DIR"
cmake --build . --config Release -- /m
- install_mmdeploy
- install_model_converter_req
- perform_model_converter_ut
@ -256,7 +250,7 @@ jobs:
- run:
name: Inference model by SDK
command: |
mmdeploy/build/install/example/build/image_classification cpu mmdeploy-models/mmcls/onnxruntime mmdeploy/tests/data/tiger.jpeg
./mmdeploy/build/bin/image_classification cpu mmdeploy-models/mmcls/onnxruntime mmdeploy/demo/resources/cityscapes.png
# See: https://circleci.com/docs/2.0/configuration-reference/#workflows

View File

@ -1,6 +1,7 @@
name: Bug report
description: Create a report to help us improve
name: 🐞 Bug report
description: Create a report to help us reproduce and fix the bug
title: "[Bug] "
labels: ['Bug']
body:
- type: checkboxes
@ -52,5 +53,3 @@ body:
If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated!
Thanks for your bug report. We appreciate it a lot.
labels: ['Bug']

View File

@ -1,11 +1,15 @@
name: Feature request
name: 🚀 Feature request
description: Suggest an idea for this project
title: "[Feature] "
body:
- type: markdown
attributes:
value: >
## Describe the feature
value: |
We strongly appreciate you creating a PR to implement this feature [here](https://github.com/open-mmlab/mmdeploy/pulls)!
If you need our help, please fill in as much of the following form as you're able to.
**The less clear the description, the longer it will take to solve it.**
- type: textarea
attributes:
label: Motivation

View File

@ -0,0 +1,23 @@
name: 📚 Documentation
description: Report an issue related to the documentation.
labels: "kind/doc,status/unconfirmed"
title: "[Docs] "
body:
- type: textarea
attributes:
label: 📚 The doc issue
description: >
A clear and concise description of the issue.
validations:
required: true
- type: textarea
attributes:
label: Suggest a potential alternative/fix
description: >
Tell us how we could improve the documentation in this regard.
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!

View File

@ -1,6 +1,12 @@
blank_issues_enabled: false
contact_links:
- name: Common Issues
- name: 💥 FAQ
url: https://github.com/open-mmlab/mmdeploy/blob/master/docs/en/faq.md
about: Check if your issue already has solutions
- name: 💬 Forum
url: https://github.com/open-mmlab/mmdeploy/discussions
about: Ask general usage questions and discuss with other MMDeploy community members
- name: 🌐 Explore OpenMMLab
url: https://openmmlab.com/
about: Get to know more about OpenMMLab

View File

@ -1,7 +0,0 @@
---
name: General questions
about: Ask general questions to get help
title: ''
labels: ''
assignees: ''
---

View File

@ -89,6 +89,23 @@ jobs:
ls -lah coverage.info
cp coverage.info ../
cross_build_aarch64:
runs-on: ubuntu-20.04
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: 'recursive'
- name: update
run: sudo apt update
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: gcc-multilib
run: |
sh -x tools/scripts/ubuntu_cross_build_aarch64.sh
build_cuda102:
runs-on: ubuntu-18.04
container:

View File

@ -0,0 +1,54 @@
name: build_rknpu2_gcc
on:
push:
paths:
- "csrc/**"
- "demo/csrc/**"
- "CMakeLists.txt"
pull_request:
paths-ignore:
- "csrc/**"
- "demo/csrc/**"
- "CMakeLists.txt"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build_rknpu2_gcc:
runs-on: ubuntu-18.04
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: 'recursive'
- name: rknpu2-gnu-toolchain
run: |
mkdir $GITHUB_WORKSPACE/rknpu2-gnu-toolchain
cd $GITHUB_WORKSPACE/rknpu2-gnu-toolchain
git clone https://github.com/Caesar-github/gcc-buildroot-9.3.0-2020.03-x86_64_aarch64-rockchip-linux-gnu.git
- name: rknpu2
run: |
mkdir $GITHUB_WORKSPACE/rknpu2
cd $GITHUB_WORKSPACE/rknpu2
git clone https://github.com/rockchip-linux/rknpu2.git
- name: build
run: |
export RKNN_TOOL_CHAIN=$GITHUB_WORKSPACE/rknpu2-gnu-toolchain/gcc-buildroot-9.3.0-2020.03-x86_64_aarch64-rockchip-linux-gnu/usr
export LD_LIBRARY_PATH=$RKNN_TOOL_CHAIN/lib64:$LD_LIBRARY_PATH
export RKNPU2_DEVICE_DIR=$GITHUB_WORKSPACE/rknpu2/rknpu2/runtime/RK3588
mkdir build && cd build
cmake .. \
-DCMAKE_TOOLCHAIN_FILE=$(pwd)/../cmake/toolchains/rknpu2-linux-gnu.cmake \
-DMMDEPLOY_BUILD_SDK=ON \
-DMMDEPLOY_SHARED_LIBS=ON \
-DMMDEPLOY_BUILD_EXAMPLES=ON \
-DMMDEPLOY_TARGET_DEVICES="cpu" \
-DMMDEPLOY_TARGET_BACKENDS="rknn" \
-DMMDEPLOY_CODEBASES=all \
-DOpenCV_DIR=$RKNPU2_DEVICE_DIR/../../examples/3rdparty/opencv/opencv-linux-aarch64/share/OpenCV
make -j$(nproc)
make install

View File

@ -5,7 +5,7 @@ endif ()
message(STATUS "CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
cmake_minimum_required(VERSION 3.14)
project(MMDeploy VERSION 0.9.0)
project(MMDeploy VERSION 0.10.0)
set(CMAKE_CXX_STANDARD 17)
@ -128,6 +128,7 @@ if (MMDEPLOY_BUILD_SDK)
mmdeploy_add_deps(pplnn BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS pplnn)
endif ()
mmdeploy_add_deps(snpe BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS snpe)
mmdeploy_add_deps(rknn BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS rknn)
include(CMakePackageConfigHelpers)
# generate the config file that includes the exports

View File

@ -87,6 +87,7 @@ Please read [getting_started](docs/en/get_started.md) for the basic usage of MMD
- [Build for Android](docs/en/01-how-to-build/android.md)
- [Build for Jetson](docs/en/01-how-to-build/jetsons.md)
- [Build for SNPE](docs/en/01-how-to-build/snpe.md)
- [Cross Build for aarch64](docs/en/01-how-to-build/cross_build_ncnn_aarch64.md)
- User Guide
- [How to convert model](docs/en/02-how-to-run/convert_model.md)
- [How to write config](docs/en/02-how-to-run/write_config.md)
@ -148,6 +149,7 @@ This project is released under the [Apache 2.0 license](LICENSE).
- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.

View File

@ -86,6 +86,7 @@ MMDeploy 是 [OpenMMLab](https://openmmlab.com/) 模型部署工具箱,**为
- [Build for Android](docs/zh_cn/01-how-to-build/android.md)
- [Build for Jetson](docs/zh_cn/01-how-to-build/jetsons.md)
- [Build for SNPE](docs/zh_cn/01-how-to-build/snpe.md)
- [Cross Build for aarch64](docs/zh_cn/01-how-to-build/cross_build_ncnn_aarch64.md)
- 使用
- [把模型转换到推理 Backend](docs/zh_cn/02-how-to-run/convert_model.md)
- [配置转换参数](docs/zh_cn/02-how-to-run/write_config.md)
@ -153,6 +154,7 @@ MMDeploy 是 [OpenMMLab](https://openmmlab.com/) 模型部署工具箱,**为
- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台
- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱和基准测试
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包

View File

@ -42,6 +42,7 @@ if (NOT CMAKE_CUDA_ARCHITECTURES)
if (CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "8")
set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
endif ()
if (CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "9")
set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")

View File

@ -0,0 +1,17 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc")
set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++")
set(CMAKE_LINKER "aarch64-linux-gnu-ld")
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_C_FLAGS "-march=armv8-a")
set(CMAKE_CXX_FLAGS "-march=armv8-a")
# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags")

View File

@ -0,0 +1,23 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR rockchip)
if(DEFINED ENV{RKNN_TOOL_CHAIN})
file(TO_CMAKE_PATH $ENV{RKNN_TOOL_CHAIN} RKNN_TOOL_CHAIN)
else()
message(FATAL_ERROR "RKNN_TOOL_CHAIN env must be defined")
endif()
set(CMAKE_C_COMPILER ${RKNN_TOOL_CHAIN}/bin/aarch64-rockchip-linux-gnu-gcc)
set(CMAKE_CXX_COMPILER ${RKNN_TOOL_CHAIN}/bin/aarch64-rockchip-linux-gnu-g++)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
set(CMAKE_C_FLAGS "-Wl,--allow-shlib-undefined")
set(CMAKE_CXX_FLAGS "-Wl,--allow-shlib-undefined")
# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags")

View File

@ -53,6 +53,10 @@ inline Value to_value(T&& val) {
return value;
}
// fast path
inline Value to_value(const Value& v) { return v; }
inline Value to_value(Value&& v) { return std::move(v); }
template <typename T>
void from_value(const Value& value, T&& x);
@ -107,6 +111,9 @@ void from_value(const Value& value, T&& x) {
archive(std::forward<T>(x));
}
// Required to avoid Value::Pointer being unwrapped by Value::get_to()
inline void from_value(const Value& value, Value& x) { x = value; }
template <typename T>
inline T from_value(const Value& value) {
T x{};

View File

@ -156,7 +156,7 @@ bool SubgraphMatcher::SubgraphMatcherImpl::matchAttributes(const Node* n1, Node*
n1is = n1->is(attr_name);
n2is = n2->is(attr_name);
if (n1is.size() != n2is.size()) return false;
for (int i = 0; i < n1is.size(); ++i) {
for (size_t i = 0; i < n1is.size(); ++i) {
if (n1is[i] != n2is[i]) return false;
}
break;
@ -164,7 +164,7 @@ bool SubgraphMatcher::SubgraphMatcherImpl::matchAttributes(const Node* n1, Node*
n1fs = n1->fs(attr_name);
n2fs = n2->fs(attr_name);
if (n1fs.size() != n2fs.size()) return false;
for (int i = 0; i < n1fs.size(); ++i) {
for (size_t i = 0; i < n1fs.size(); ++i) {
if (n1fs[i] != n2fs[i]) return false;
}
break;

View File

@ -6,6 +6,7 @@ set(SRCS
task.cpp
static_router.cpp
inference.cpp
pipeline.cpp)
pipeline.cpp
cond.cpp)
mmdeploy_add_module(${PROJECT_NAME} LIBRARY "${SRCS}")
add_library(mmdeploy::graph ALIAS ${PROJECT_NAME})

View File

@ -0,0 +1,124 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/graph/cond.h"
#include <algorithm>
namespace mmdeploy::graph {
namespace {
std::vector<int> get_predicates(const Value::Array& xs) {
std::vector<int> ps;
ps.reserve(xs.size());
std::transform(std::begin(xs), std::end(xs), std::back_inserter(ps),
[](const Value& x) { return static_cast<int>(x.get<bool>()); });
return ps;
}
// Returns {true, branch} when all predicates agree (branch == 1 iff all are true),
// and {false, _} when they diverge.
std::pair<bool, int> choice(const std::vector<int>& xs) {
auto count = std::count(std::begin(xs), std::end(xs), 1);
if (count == 0 || count == xs.size()) {
return std::make_pair(true, count == xs.size());
}
return std::make_pair(false, false);
}
// Gathers the elements whose predicate is true into a dense batch for the body.
Value get_divergent_input(Value::Array& as, const std::vector<int>& ps) {
Value::Array ts(as.size(), Value::kArray);
for (size_t i = 0; i < ts.size(); ++i) {
auto& t = ts[i].array();
auto& a = as[i].array();
for (size_t j = 0; j < ps.size(); ++j) {
if (ps[j]) {
t.push_back(std::move(a[j]));
}
}
}
return ts;
}
// Scatters the body's outputs back to their original positions, inserting
// nullptr where the predicate was false.
Value get_divergent_output(Value::Array& rs, const std::vector<int>& ps) {
Value::Array ys(rs.size(), Value::kArray);
for (size_t i = 0; i < ys.size(); ++i) {
auto& y = ys[i].array();
auto& r = rs[i].array();
size_t k = 0;
for (const auto& p : ps) {
y.push_back(p ? std::move(r[k++]) : nullptr);
}
}
return ys;
}
} // namespace
Sender<Value> Cond::Process(Sender<Value> input) {
return LetValue(std::move(input), [this](Value& _input) -> Sender<Value> {
assert(_input.is_array());
auto& as = _input.array();
auto ps = get_predicates(as.front().array());
as.erase(as.begin());
auto [coherent, branch] = choice(ps);
if (coherent) {
if (branch) {
return node_->Process(Just(std::move(_input)));
} else {
Value::Array output(n_output_, Value::Array(ps.size(), nullptr));
return Just(Value(std::move(output)));
}
} else {
auto ts = get_divergent_input(as, ps);
return node_->Process(Just(Value(std::move(ts)))) |
Then([ps = std::move(ps)](Value rs) -> Value {
return get_divergent_output(rs.array(), ps);
});
}
});
}
CondBuilder::CondBuilder(Value config) : Builder(std::move(config)) {}
Result<unique_ptr<Node>> CondBuilder::BuildImpl() {
try {
auto cond = std::make_unique<Cond>();
cond->n_output_ = static_cast<int>(config_["output"].size());
auto& body_config = config_["body"];
auto inputs = config_["input"].array();
inputs.erase(inputs.begin());
body_config["input"] = std::move(inputs);
body_config["output"] = config_["output"];
// propagate context
if (!body_config.contains("context")) {
body_config["context"] = Value::Object();
}
if (config_.contains("context")) {
update(body_config["context"].object(), config_["context"].object(), 2);
}
if (auto builder = Builder::CreateFromConfig(body_config).value()) {
if (auto node = builder->Build().value()) {
cond->node_ = std::move(node);
return std::move(cond);
}
}
} catch (const std::exception& e) {
MMDEPLOY_ERROR("error parsing config: {}", config_);
}
return Status(eFail);
}
class CondCreator : public Creator<Builder> {
public:
const char* GetName() const override { return "Cond"; }
unique_ptr<Builder> Create(const Value& config) override {
return std::make_unique<CondBuilder>(config);
}
};
REGISTER_MODULE(Builder, CondCreator);
} // namespace mmdeploy::graph

View File

@ -0,0 +1,31 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CSRC_MMDEPLOY_GRAPH_COND_H_
#define MMDEPLOY_CSRC_MMDEPLOY_GRAPH_COND_H_
#include "mmdeploy/core/graph.h"
namespace mmdeploy::graph {
class Cond : public Node {
friend class CondBuilder;
public:
Sender<Value> Process(Sender<Value> input) override;
private:
std::unique_ptr<Node> node_;
int n_output_{0};
};
class CondBuilder : public Builder {
public:
explicit CondBuilder(Value config);
protected:
Result<unique_ptr<Node>> BuildImpl() override;
};
} // namespace mmdeploy::graph
#endif // MMDEPLOY_CSRC_MMDEPLOY_GRAPH_COND_H_

View File

@ -38,5 +38,9 @@ if ("coreml" IN_LIST MMDEPLOY_TARGET_BACKENDS)
add_subdirectory(coreml)
endif ()
if ("rknn" IN_LIST MMDEPLOY_TARGET_BACKENDS)
add_subdirectory(rknn)
endif ()
mmdeploy_add_module(${PROJECT_NAME} net_module.cpp)
add_library(mmdeploy::net_module ALIAS ${PROJECT_NAME})

View File

@ -4,12 +4,8 @@ project(mmdeploy_ort_net)
include(${CMAKE_SOURCE_DIR}/cmake/modules/FindONNXRUNTIME.cmake)
if ("cpu" IN_LIST MMDEPLOY_TARGET_DEVICES)
mmdeploy_add_module(${PROJECT_NAME} ort_net.cpp)
target_include_directories(${PROJECT_NAME} PRIVATE ${ONNXRUNTIME_DIR}/include)
target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy_onnxruntime_ops_obj)
target_link_libraries(${PROJECT_NAME} PUBLIC onnxruntime)
add_library(mmdeploy::ort_net ALIAS ${PROJECT_NAME})
else ()
message(ERROR "'ort_net' is NOT supported in target devices: ${MMDEPLOY_TARGET_DEVICES}")
endif ()
mmdeploy_add_module(${PROJECT_NAME} ort_net.cpp)
target_include_directories(${PROJECT_NAME} PRIVATE ${ONNXRUNTIME_DIR}/include)
target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy_onnxruntime_ops_obj)
target_link_libraries(${PROJECT_NAME} PUBLIC onnxruntime)
add_library(mmdeploy::ort_net ALIAS ${PROJECT_NAME})

View File

@ -0,0 +1,20 @@
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_rknn_net)
add_library(rknn SHARED IMPORTED)
if(DEFINED ENV{RKNPU2_DEVICE_DIR})
file(TO_CMAKE_PATH $ENV{RKNPU2_DEVICE_DIR} RKNPU2_DEVICE_DIR)
else()
message(FATAL_ERROR "RKNPU2_DEVICE_DIR env must be defined")
endif()
set_target_properties(rknn PROPERTIES
IMPORTED_LOCATION "${RKNPU2_DEVICE_DIR}/Linux/librknn_api/aarch64/librknn_api.so"
INTERFACE_INCLUDE_DIRECTORIES "${RKNPU2_DEVICE_DIR}/Linux/librknn_api/include"
)
mmdeploy_add_module(${PROJECT_NAME} rknn_net.cpp)
target_link_libraries(${PROJECT_NAME} PRIVATE rknn)
add_library(mmdeploy::rknn_net ALIAS ${PROJECT_NAME})

View File

@ -0,0 +1,216 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "rknn_net.h"
#include <stdio.h>
#include <fstream>
#include "mmdeploy/core/logger.h"
#include "mmdeploy/core/model.h"
#include "mmdeploy/core/utils/filesystem.h"
#include "mmdeploy/core/utils/formatter.h"
namespace mmdeploy::framework {
Result<rknn_tensor_type> GetRKNNDataType(DataType data_type) {
switch (data_type) {
case DataType::kFLOAT:
return RKNN_TENSOR_FLOAT32;
case DataType::kHALF:
return RKNN_TENSOR_FLOAT16;
case DataType::kINT8:
return RKNN_TENSOR_INT8;
case DataType::kINT32:
return RKNN_TENSOR_INT32;
case DataType::kINT64:
return RKNN_TENSOR_INT64;
default:
return Status(eNotSupported);
}
}
Result<DataType> GetMMDeployDataType(rknn_tensor_type data_type) {
switch (data_type) {
case RKNN_TENSOR_FLOAT32:
return DataType::kFLOAT;
case RKNN_TENSOR_FLOAT16:
return DataType::kHALF;
case RKNN_TENSOR_INT8:
return DataType::kINT8;
case RKNN_TENSOR_INT32:
return DataType::kINT32;
case RKNN_TENSOR_INT64:
return DataType::kINT64;
default:
return Status(eNotSupported);
}
}
RKNNNet::~RKNNNet() { rknn_destroy(ctx_); }
void RKNNNet::dump_tensor_attr(rknn_tensor_attr* attr) {
MMDEPLOY_INFO(
" index={}, name={}, n_dims={}, dims=[{}, {}, {}, {}], n_elems={}, size={}, fmt={}, "
"type={}, qnt_type={}, "
"zp={}, scale=%f\n",
attr->index, attr->name, attr->n_dims, attr->dims[0], attr->dims[1], attr->dims[2],
attr->dims[3], attr->n_elems, attr->size, get_format_string(attr->fmt),
get_type_string(attr->type), get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale);
}
Result<void> RKNNNet::Init(const Value& args) {
auto& context = args["context"];
device_ = context["device"].get<Device>();
stream_ = context["stream"].get<Stream>();
if (!device_.is_host()) {
return Status(eNotSupported);
}
auto name = args["name"].get<std::string>();
auto model = context["model"].get<Model>();
OUTCOME_TRY(auto config, model.GetModelConfig(name));
std::string content;
OUTCOME_TRY(content, model.ReadFile(config.net));
char* model_ptr = const_cast<char*>(content.data());
int ret = rknn_init(&ctx_, model_ptr, content.size(), 0, NULL);
if (ret != RKNN_SUCC) {
MMDEPLOY_ERROR("Load .rknn failed! ret= {}", ret);
return Status(eInvalidArgument);
}
// Get Model Input Output Info
rknn_input_output_num io_num;
ret = rknn_query(ctx_, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
if (ret != RKNN_SUCC) {
MMDEPLOY_ERROR("rknn_query fail! ret= {}", ret);
return Status(eFail);
}
// log I/O counts only after a successful query
MMDEPLOY_INFO("model input num: {}, output num: {}", io_num.n_input, io_num.n_output);
for (int i = 0; i < io_num.n_input; i++) {
rknn_tensor_attr input_attr;
input_attr.index = i;
ret = rknn_query(ctx_, RKNN_QUERY_INPUT_ATTR, &(input_attr), sizeof(rknn_tensor_attr));
if (ret != RKNN_SUCC) {
MMDEPLOY_ERROR("rknn_query fail! ret= {}", ret);
return Status(eFail);
}
// dump the attribute only after a successful query
dump_tensor_attr(&(input_attr));
input_attrs_.push_back(input_attr);
OUTCOME_TRY(auto data_type, GetMMDeployDataType(input_attr.type));
input_tensors_.emplace_back(TensorDesc{device_, data_type, {}, input_attr.name});
}
for (int i = 0; i < io_num.n_output; i++) {
rknn_tensor_attr output_attr;
output_attr.index = i;
ret = rknn_query(ctx_, RKNN_QUERY_OUTPUT_ATTR, &(output_attr), sizeof(rknn_tensor_attr));
if (ret != RKNN_SUCC) {
MMDEPLOY_ERROR("rknn_query fail! ret= {}", ret);
return Status(eFail);
}
// dump the attribute only after a successful query
dump_tensor_attr(&(output_attr));
output_attrs_.push_back(output_attr);
OUTCOME_TRY(auto data_type, GetMMDeployDataType(output_attr.type));
output_tensors_.emplace_back(TensorDesc{device_, data_type, {}, output_attr.name});
}
return success();
}
Result<void> RKNNNet::ForwardAsync(Event* event) { return Status(eNotSupported); }
Result<void> RKNNNet::Deinit() { return success(); }
Result<Span<Tensor>> RKNNNet::GetInputTensors() { return input_tensors_; }
Result<Span<Tensor>> RKNNNet::GetOutputTensors() { return output_tensors_; }
Result<void> RKNNNet::Reshape(Span<TensorShape> input_shapes) {
for (size_t i = 0; i < input_shapes.size(); ++i) {
input_tensors_[i].Reshape(input_shapes[i]);
}
return success();
}
Result<void> RKNNNet::Forward() {
OUTCOME_TRY(stream_.Wait());
std::vector<rknn_input> inputs;
for (int i = 0; i < input_tensors_.size(); i++) {
rknn_input input;
input.index = i;
input.pass_through = 0;
input.type = input_attrs_[i].type;
input.fmt = input_attrs_[i].fmt;
input.buf = input_tensors_[i].data<float>();
input.size = input_attrs_[i].size;
inputs.push_back(input);
}
// Set input
int ret = rknn_inputs_set(ctx_, input_tensors_.size(), inputs.data());
if (ret < 0) {
MMDEPLOY_ERROR("rknn_input_set fail! ret= {}", ret);
return Status(eFail);
}
// Get output
std::vector<rknn_output> outputs;
for (uint32_t i = 0; i < output_tensors_.size(); ++i) {
rknn_output output;
output.want_float = 1;
output.index = i;
output.is_prealloc = 0;
outputs.push_back(output);
}
ret = rknn_run(ctx_, NULL);
if (ret < 0) {
MMDEPLOY_ERROR("rknn_run fail! ret={}", ret);
return Status(eFail);
}
ret = rknn_outputs_get(ctx_, output_tensors_.size(), outputs.data(), NULL);
if (ret < 0) {
MMDEPLOY_ERROR("rknn_outputs_get fail! ret= {}", ret);
return Status(eFail);
}
for (int i = 0; i < output_tensors_.size(); i++) {
TensorShape tensor_shape;
for (int j = 0; j < output_attrs_[i].n_dims; ++j) {
tensor_shape.push_back(output_attrs_[i].dims[j]);
}
output_tensors_[i].Reshape(tensor_shape);
memcpy(output_tensors_[i].data<float>(), (float*)outputs[i].buf, output_attrs_[i].size);
}
OUTCOME_TRY(stream_.Wait());
return success();
}
class RKNNNetCreator : public Creator<Net> {
public:
const char* GetName() const override { return "rknn"; }
int GetVersion() const override { return 0; }
std::unique_ptr<Net> Create(const Value& args) override {
try {
auto p = std::make_unique<RKNNNet>();
if (auto r = p->Init(args)) {
return p;
} else {
MMDEPLOY_ERROR("error creating RKNNNet: {}", r.error().message().c_str());
return nullptr;
}
} catch (const std::exception& e) {
MMDEPLOY_ERROR("unhandled exception when creating RKNNNet: {}", e.what());
return nullptr;
}
}
};
REGISTER_MODULE(Net, RKNNNetCreator);
} // namespace mmdeploy::framework

View File

@ -0,0 +1,45 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_NET_RKNN_RKNN_NET_H_
#define MMDEPLOY_SRC_NET_RKNN_RKNN_NET_H_
#include "mmdeploy/core/mpl/span.h"
#include "mmdeploy/core/net.h"
#include "rknn_api.h"
namespace mmdeploy::framework {
class RKNNNet : public Net {
public:
~RKNNNet() override;
Result<void> Init(const Value& args) override;
Result<void> Deinit() override;
Result<void> Reshape(Span<TensorShape> input_shapes) override;
Result<Span<Tensor> > GetInputTensors() override;
Result<Span<Tensor> > GetOutputTensors() override;
Result<void> Forward() override;
Result<void> ForwardAsync(Event* event) override;
private:
void dump_tensor_attr(rknn_tensor_attr* attr);
Device device_;
Stream stream_;
rknn_context ctx_;
std::vector<Tensor> input_tensors_;
std::vector<Tensor> output_tensors_;
std::vector<rknn_tensor_attr> input_attrs_;
std::vector<rknn_tensor_attr> output_attrs_;
static constexpr const auto kHost = Device(0);
};
} // namespace mmdeploy::framework
#endif // MMDEPLOY_SRC_NET_RKNN_RKNN_NET_H_

View File

@ -30,7 +30,9 @@ function(add_example dep folder name)
endfunction()
add_example(classifier c image_classification)
add_example(classifier c batch_image_classification)
add_example(detector c object_detection)
add_example(detector c batch_object_detection)
add_example(segmentor c image_segmentation)
add_example(restorer c image_restorer)
add_example(text_detector c ocr)
@ -46,7 +48,8 @@ if (MMDEPLOY_BUILD_SDK_CXX_API)
add_example(segmentor cpp segmentor)
add_example(restorer cpp restorer)
add_example(text_detector cpp text_ocr)
add_example("" cpp text_det_recog)
add_example(text_detector cpp text_det_recog)
add_example(pose_detector cpp pose_detector)
add_example(rotated_detector cpp rotated_detector)
add_example(pose_detector cpp pose_tracker)
endif ()

View File

@ -0,0 +1,100 @@
#include <fstream>
#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <string>
#include "mmdeploy/classifier.h"
static int batch_inference(mmdeploy_classifier_t classifier,
const std::vector<int>& image_ids,
const std::vector<mmdeploy_mat_t>& mats);
int main(int argc, char* argv[]) {
if (argc < 5) {
fprintf(stderr, "usage:\n image_classification device_name dump_model_directory "
"imagelist.txt batch_size\n");
return 1;
}
auto device_name = argv[1];
auto model_path = argv[2];
mmdeploy_classifier_t classifier{};
int status{};
status = mmdeploy_classifier_create_by_path(model_path, device_name, 0, &classifier);
if (status != MMDEPLOY_SUCCESS) {
fprintf(stderr, "failed to create classifier, code: %d\n", (int)status);
return 1;
}
// `file_path` is the path of an image list file
std::string file_path = argv[3];
const int batch = std::stoi(argv[argc-1]);
// read image paths from the file
std::ifstream ifs(file_path);
std::string img_path;
std::vector<std::string> img_paths;
while (ifs >> img_path) {
img_paths.emplace_back(std::move(img_path));
}
// read images and process batch inference
std::vector<cv::Mat> images;
std::vector<int> image_ids;
std::vector<mmdeploy_mat_t> mats;
for (int i = 0; i < (int)img_paths.size(); ++i) {
auto img = cv::imread(img_paths[i]);
if (!img.data) {
fprintf(stderr, "failed to load image: %s\n", img_paths[i].c_str());
continue;
}
images.push_back(img);
image_ids.push_back(i);
mmdeploy_mat_t mat{
img.data, img.rows, img.cols, 3, MMDEPLOY_PIXEL_FORMAT_BGR, MMDEPLOY_DATA_TYPE_UINT8};
mats.push_back(mat);
// process batch inference
if ((int)mats.size() == batch) {
if (batch_inference(classifier, image_ids, mats) != 0) {
continue;
}
// clear buffer for next batch
mats.clear();
image_ids.clear();
images.clear();
}
}
// process batch inference if there are still unhandled images
if (!mats.empty()) {
(void)batch_inference(classifier, image_ids, mats);
}
mmdeploy_classifier_destroy(classifier);
return 0;
}
int batch_inference(mmdeploy_classifier_t classifier, const std::vector<int>& image_ids,
const std::vector<mmdeploy_mat_t>& mats) {
mmdeploy_classification_t* res{};
int* res_count{};
auto status = mmdeploy_classifier_apply(classifier, mats.data(), (int)mats.size(),
&res, &res_count);
if (status != MMDEPLOY_SUCCESS) {
fprintf(stderr, "failed to apply classifier to batch images %d, code: %d\n",
(int)mats.size(), (int)status);
return 1;
}
// print the inference results
auto res_ptr = res;
for (int j = 0; j < (int)mats.size(); ++j) {
fprintf(stderr, "results in the %d-th image:\n", image_ids[j]);
for (int k = 0; k < res_count[j]; ++k, ++res_ptr) {
fprintf(stderr, " label: %d, score: %.4f\n", res_ptr->label_id, res_ptr->score);
}
}
// release results buffer
mmdeploy_classifier_release_result(res, res_count, (int)mats.size());
return 0;
}

View File

@ -0,0 +1,147 @@
#include <fstream>
#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <string>
#include "mmdeploy/detector.h"
static int batch_inference(mmdeploy_detector_t detector, std::vector<cv::Mat>& images,
const std::vector<int>& image_ids,
const std::vector<mmdeploy_mat_t>& mats);
static void visualize_detection(const std::string& output_name, cv::Mat& image,
const mmdeploy_detection_t* bboxes_ptr, int bboxes_num);
int main(int argc, char* argv[]) {
if (argc < 5) {
fprintf(stderr, "usage:\n object_detection device_name sdk_model_path "
"file_path batch_size\n");
return 1;
}
auto device_name = argv[1];
auto model_path = argv[2];
mmdeploy_detector_t detector{};
int status{};
status = mmdeploy_detector_create_by_path(model_path, device_name, 0, &detector);
if (status != MMDEPLOY_SUCCESS) {
fprintf(stderr, "failed to create detector, code: %d\n", (int)status);
return 1;
}
// file_path is the path of an image list file
std::string file_path = argv[3];
const int batch = std::stoi(argv[argc-1]);
// read image paths from the file
std::ifstream ifs(file_path);
std::string img_path;
std::vector<std::string> img_paths;
while (ifs >> img_path) {
img_paths.emplace_back(std::move(img_path));
}
// read images and process batch inference
std::vector<cv::Mat> images;
std::vector<int> image_ids;
std::vector<mmdeploy_mat_t> mats;
for (int i = 0; i < (int)img_paths.size(); ++i) {
auto img = cv::imread(img_paths[i]);
if (!img.data) {
fprintf(stderr, "failed to load image: %s\n", img_paths[i].c_str());
continue;
}
images.push_back(img);
image_ids.push_back(i);
mmdeploy_mat_t mat{
img.data, img.rows, img.cols, 3, MMDEPLOY_PIXEL_FORMAT_BGR, MMDEPLOY_DATA_TYPE_UINT8};
mats.push_back(mat);
// process batch inference
if ((int)mats.size() == batch) {
if (batch_inference(detector, images, image_ids, mats) != 0) {
continue;
}
// clear buffer for next batch
mats.clear();
image_ids.clear();
images.clear();
}
}
// process batch inference if there are still unhandled images
if (!mats.empty()) {
(void)batch_inference(detector, images, image_ids, mats);
}
mmdeploy_detector_destroy(detector);
return 0;
}
int batch_inference(mmdeploy_detector_t detector, std::vector<cv::Mat>& images,
const std::vector<int>& image_ids,
const std::vector<mmdeploy_mat_t>& mats) {
mmdeploy_detection_t* bboxes{};
int* res_count{};
auto status = mmdeploy_detector_apply(detector, mats.data(), mats.size(), &bboxes, &res_count);
if (status != MMDEPLOY_SUCCESS) {
fprintf(stderr, "failed to apply detector, code: %d\n", (int)status);
return 1;
}
mmdeploy_detection_t* bboxes_ptr = bboxes;
for (int i = 0; i < (int)mats.size(); ++i) {
fprintf(stdout, "results in the %d-th image:\n bbox_count=%d\n", image_ids[i], res_count[i]);
const std::string output_name = "output_detection_" + std::to_string(image_ids[i]) + ".png";
visualize_detection(output_name, images[i], bboxes_ptr, res_count[i]);
bboxes_ptr = bboxes_ptr + res_count[i];
}
mmdeploy_detector_release_result(bboxes, res_count, mats.size());
return 0;
}
void visualize_detection(const std::string& output_name, cv::Mat& image,
const mmdeploy_detection_t* bboxes_ptr, int bbox_num) {
for (int i = 0; i < bbox_num; ++i, ++bboxes_ptr) {
const auto& box = bboxes_ptr->bbox;
const auto& mask = bboxes_ptr->mask;
fprintf(stdout,
" box %d, left=%.2f, top=%.2f, right=%.2f, bottom=%.2f, "
"label=%d, score=%.4f\n",
i, box.left, box.top, box.right, box.bottom, bboxes_ptr->label_id, bboxes_ptr->score);
// skip detections with invalid bbox size (bbox height or width < 1)
if ((box.right - box.left) < 1 || (box.bottom - box.top) < 1) {
continue;
}
// skip detections less than specified score threshold
if (bboxes_ptr->score < 0.3) {
continue;
}
// generate mask overlay if model exports masks
if (mask != nullptr) {
fprintf(stdout, "mask %d, height=%d, width=%d\n", i, mask->height, mask->width);
cv::Mat imgMask(mask->height, mask->width, CV_8UC1, &mask->data[0]);
auto x0 = std::max(std::floor(box.left) - 1, 0.f);
auto y0 = std::max(std::floor(box.top) - 1, 0.f);
cv::Rect roi((int)x0, (int)y0, mask->width, mask->height);
// split the RGB channels, overlay mask to a specific color channel
cv::Mat ch[3];
split(image, ch);
int col = 0;
cv::bitwise_or(imgMask, ch[col](roi), ch[col](roi));
merge(ch, 3, image);
}
cv::rectangle(image, cv::Point{(int)box.left, (int)box.top},
cv::Point{(int)box.right, (int)box.bottom}, cv::Scalar{0, 255, 0});
}
cv::imwrite(output_name, image);
}

View File

@ -0,0 +1,427 @@
#include "mmdeploy/archive/json_archive.h"
#include "mmdeploy/archive/value_archive.h"
#include "mmdeploy/common.hpp"
#include "mmdeploy/core/mat.h"
#include "mmdeploy/core/module.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/experimental/module_adapter.h"
#include "mmdeploy/pipeline.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/videoio.hpp"
const auto config_json = R"(
{
"type": "Pipeline",
"input": ["data", "use_det", "state"],
"output": "targets",
"tasks": [
{
"type": "Cond",
"input": ["use_det", "data"],
"output": "dets",
"body": {
"name": "detection",
"type": "Inference",
"params": { "model": "detection" }
}
},
{
"type": "Task",
"module": "ProcessBboxes",
"input": ["dets", "data", "state"],
"output": "rois"
},
{
"input": "*rois",
"output": "*keypoints",
"name": "pose",
"type": "Inference",
"params": { "model": "pose" }
},
{
"type": "Task",
"module": "TrackPose",
"scheduler": "pool",
"input": ["keypoints", "state"],
"output": "targets"
}
]
}
)"_json;
namespace mmdeploy {
#define REGISTER_SIMPLE_MODULE(name, fn) \
class name##_Creator : public ::mmdeploy::Creator<Module> { \
const char* GetName() const override { return #name; } \
std::unique_ptr<Module> Create(const Value&) override { return CreateTask(fn); } \
}; \
REGISTER_MODULE(Module, name##_Creator)
std::optional<std::array<float, 4>> keypoints_to_bbox(const std::vector<cv::Point2f>& keypoints,
const std::vector<float>& scores, float img_h,
float img_w, float scale = 1.5,
float kpt_thr = 0.3) {
auto valid = false;
auto x1 = static_cast<float>(img_w);
auto y1 = static_cast<float>(img_h);
auto x2 = 0.f;
auto y2 = 0.f;
for (size_t i = 0; i < keypoints.size(); ++i) {
auto& kpt = keypoints[i];
if (scores[i] > kpt_thr) {
x1 = std::min(x1, kpt.x);
y1 = std::min(y1, kpt.y);
x2 = std::max(x2, kpt.x);
y2 = std::max(y2, kpt.y);
valid = true;
}
}
if (!valid) {
return std::nullopt;
}
auto xc = .5f * (x1 + x2);
auto yc = .5f * (y1 + y2);
auto w = (x2 - x1) * scale;
auto h = (y2 - y1) * scale;
return std::array<float, 4>{
std::max(0.f, std::min(img_w, xc - .5f * w)),
std::max(0.f, std::min(img_h, yc - .5f * h)),
std::max(0.f, std::min(img_w, xc + .5f * w)),
std::max(0.f, std::min(img_h, yc + .5f * h)),
};
}
struct Track {
std::vector<std::vector<cv::Point2f>> keypoints;
std::vector<std::vector<float>> scores;
std::vector<std::array<float, 4>> bboxes;
int64_t track_id{-1};
};
struct TrackInfo {
std::vector<Track> tracks;
int64_t next_id{0};
};
MMDEPLOY_REGISTER_TYPE_ID(TrackInfo, 0xcfe87980aa895d3a); // randomly generated type id
Value::Array GetObjectsByTracking(Value& state, int img_h, int img_w) {
Value::Array objs;
auto& track_info = state["track_info"].get_ref<TrackInfo&>();
for (auto& track : track_info.tracks) {
auto bbox = keypoints_to_bbox(track.keypoints.back(), track.scores.back(),
static_cast<float>(img_h), static_cast<float>(img_w));
if (bbox) {
objs.push_back({{"bbox", to_value(*bbox)}});
}
}
return objs;
}
Value ProcessBboxes(const Value& detections, const Value& data, Value state) {
assert(state.is_pointer());
Value::Array bboxes;
if (detections.is_array()) { // has detections
auto& dets = detections.array();
for (const auto& det : dets) {
if (det["label_id"].get<int>() == 0 && det["score"].get<float>() >= .3f) {
bboxes.push_back(det);
}
}
MMDEPLOY_INFO("bboxes by detection: {}", bboxes.size());
state["bboxes"] = bboxes;
} else { // no detections, use tracked results
auto img_h = state["img_shape"][0].get<int>();
auto img_w = state["img_shape"][1].get<int>();
bboxes = GetObjectsByTracking(state, img_h, img_w);
MMDEPLOY_INFO("GetObjectsByTracking: {}", bboxes.size());
}
// attach bboxes to image data
for (auto& bbox : bboxes) {
auto img = data["ori_img"].get<framework::Mat>();
auto box = from_value<std::array<float, 4>>(bbox["bbox"]);
cv::Rect rect(cv::Rect2f(cv::Point2f(box[0], box[1]), cv::Point2f(box[2], box[3])));
bbox = Value::Object{
{"ori_img", img}, {"bbox", {rect.x, rect.y, rect.width, rect.height}}, {"rotation", 0.f}};
};
return bboxes;
}
REGISTER_SIMPLE_MODULE(ProcessBboxes, ProcessBboxes);
// xyxy format
float ComputeIoU(const std::array<float, 4>& a, const std::array<float, 4>& b) {
auto x1 = std::max(a[0], b[0]);
auto y1 = std::max(a[1], b[1]);
auto x2 = std::min(a[2], b[2]);
auto y2 = std::min(a[3], b[3]);
auto inter_area = std::max(0.f, x2 - x1) * std::max(0.f, y2 - y1);
auto a_area = (a[2] - a[0]) * (a[3] - a[1]);
auto b_area = (b[2] - b[0]) * (b[3] - b[1]);
auto union_area = a_area + b_area - inter_area;
if (union_area == 0.f) {
return 0;
}
return inter_area / union_area;
}
void UpdateTrack(Track& track, std::vector<cv::Point2f>& keypoints, std::vector<float>& score,
const std::array<float, 4>& bbox, int n_history) {
if (track.scores.size() == n_history) {
std::rotate(track.keypoints.begin(), track.keypoints.begin() + 1, track.keypoints.end());
std::rotate(track.scores.begin(), track.scores.begin() + 1, track.scores.end());
std::rotate(track.bboxes.begin(), track.bboxes.begin() + 1, track.bboxes.end());
track.keypoints.back() = std::move(keypoints);
track.scores.back() = std::move(score);
track.bboxes.back() = bbox;
} else {
track.keypoints.push_back(std::move(keypoints));
track.scores.push_back(std::move(score));
track.bboxes.push_back(bbox);
}
}
std::vector<std::tuple<int, int, float>> GreedyAssignment(const std::vector<float>& scores,
int n_rows, int n_cols, float thr) {
std::vector<int> used_rows(n_rows);
std::vector<int> used_cols(n_cols);
std::vector<std::tuple<int, int, float>> assignment;
assignment.reserve(std::max(n_rows, n_cols));
while (true) {
auto max_score = 0.f;
int max_row = -1;
int max_col = -1;
for (int i = 0; i < n_rows; ++i) {
if (!used_rows[i]) {
for (int j = 0; j < n_cols; ++j) {
if (!used_cols[j]) {
if (scores[i * n_cols + j] > max_score) {
max_score = scores[i * n_cols + j];
max_row = i;
max_col = j;
}
}
}
}
}
if (max_score < thr) {
break;
}
used_rows[max_row] = 1;
used_cols[max_col] = 1;
assignment.emplace_back(max_row, max_col, max_score);
}
return assignment;
}
void TrackStep(std::vector<std::vector<cv::Point2f>>& keypoints,
std::vector<std::vector<float>>& scores, TrackInfo& track_info, int img_h, int img_w,
float iou_thr, int min_keypoints, int n_history) {
auto& tracks = track_info.tracks;
std::vector<Track> new_tracks;
new_tracks.reserve(tracks.size());
std::vector<std::array<float, 4>> bboxes;
bboxes.reserve(keypoints.size());
std::vector<int> indices;
indices.reserve(keypoints.size());
for (size_t i = 0; i < keypoints.size(); ++i) {
if (auto bbox = keypoints_to_bbox(keypoints[i], scores[i], img_h, img_w, 1.f, 0.f)) {
bboxes.push_back(*bbox);
indices.push_back(i);
}
}
const auto n_rows = static_cast<int>(bboxes.size());
const auto n_cols = static_cast<int>(tracks.size());
std::vector<float> similarities(n_rows * n_cols);
for (size_t i = 0; i < n_rows; ++i) {
for (size_t j = 0; j < n_cols; ++j) {
similarities[i * n_cols + j] = ComputeIoU(bboxes[i], tracks[j].bboxes.back());
}
}
const auto assignment = GreedyAssignment(similarities, n_rows, n_cols, iou_thr);
std::vector<int> used(n_rows);
for (auto [i, j, _] : assignment) {
auto k = indices[i];
UpdateTrack(tracks[j], keypoints[k], scores[k], bboxes[i], n_history);
new_tracks.push_back(std::move(tracks[j]));
used[i] = true;
}
for (size_t i = 0; i < used.size(); ++i) {
if (used[i] == 0) {
auto k = indices[i];
auto count = std::count_if(scores[k].begin(), scores[k].end(), [](auto x) { return x > 0; });
if (count >= min_keypoints) {
auto& track = new_tracks.emplace_back();
track.track_id = track_info.next_id++;
UpdateTrack(track, keypoints[k], scores[k], bboxes[i], n_history);
}
}
}
tracks = std::move(new_tracks);
}
Value TrackPose(const Value& result, Value state) {
assert(state.is_pointer());
assert(result.is_array());
std::vector<std::vector<cv::Point2f>> keypoints;
std::vector<std::vector<float>> scores;
for (auto& output : result.array()) {
auto& k = keypoints.emplace_back();
auto& s = scores.emplace_back();
for (auto& kpt : output["key_points"].array()) {
k.push_back(cv::Point2f{kpt["bbox"][0].get<float>(), kpt["bbox"][1].get<float>()});
s.push_back(kpt["score"].get<float>());
}
}
auto& track_info = state["track_info"].get_ref<TrackInfo&>();
auto img_h = state["img_shape"][0].get<int>();
auto img_w = state["img_shape"][1].get<int>();
auto iou_thr = state["iou_thr"].get<float>();
auto min_keypoints = state["min_keypoints"].get<int>();
auto n_history = state["n_history"].get<int>();
TrackStep(keypoints, scores, track_info, img_h, img_w, iou_thr, min_keypoints, n_history);
Value::Array targets;
for (const auto& track : track_info.tracks) {
if (auto bbox = keypoints_to_bbox(track.keypoints.back(), track.scores.back(), img_h, img_w)) {
Value::Array kpts;
kpts.reserve(track.keypoints.back().size());
for (const auto& kpt : track.keypoints.back()) {
kpts.push_back(kpt.x);
kpts.push_back(kpt.y);
}
targets.push_back({{"bbox", to_value(*bbox)}, {"keypoints", std::move(kpts)}});
}
}
return targets;
}
REGISTER_SIMPLE_MODULE(TrackPose, TrackPose);
class PoseTracker {
public:
using State = Value;
public:
PoseTracker(const Model& det_model, const Model& pose_model, Context context)
: pipeline_([&] {
context.Add("detection", det_model);
context.Add("pose", pose_model);
auto config = from_json<Value>(config_json);
return Pipeline{config, context};
}()) {}
State CreateState() { // NOLINT
return make_pointer({{"frame_id", 0},
{"n_history", 10},
{"iou_thr", .3f},
{"min_keypoints", 3},
{"track_info", TrackInfo{}}});
}
Value Track(const Mat& img, State& state, int use_detector = -1) {
assert(state.is_pointer());
framework::Mat mat(img.desc().height, img.desc().width,
static_cast<PixelFormat>(img.desc().format),
static_cast<DataType>(img.desc().type), {img.desc().data, [](void*) {}});
// TODO: get_ref<int&> is not working
auto frame_id = state["frame_id"].get<int>();
if (use_detector < 0) {
use_detector = frame_id % 10 == 0;
if (use_detector) {
MMDEPLOY_WARN("use detector");
}
}
state["frame_id"] = frame_id + 1;
state["img_shape"] = {mat.height(), mat.width()};
Value::Object data{{"ori_img", mat}};
Value input{{data}, {use_detector}, {state}};
return pipeline_.Apply(input)[0][0];
}
private:
Pipeline pipeline_;
};
} // namespace mmdeploy
using namespace mmdeploy;
void Visualize(cv::Mat& frame, const Value& result) {
static std::vector<std::pair<int, int>> skeleton{
{15, 13}, {13, 11}, {16, 14}, {14, 12}, {11, 12}, {5, 11}, {6, 12}, {5, 6}, {5, 7}, {6, 8},
{7, 9}, {8, 10}, {1, 2}, {0, 1}, {0, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 6}};
const auto& targets = result.array();
for (const auto& target : targets) {
auto bbox = from_value<std::array<float, 4>>(target["bbox"]);
auto kpts = from_value<std::vector<float>>(target["keypoints"]);
cv::Point p1(bbox[0], bbox[1]);
cv::Point p2(bbox[2], bbox[3]);
cv::rectangle(frame, p1, p2, cv::Scalar(0, 255, 0));
for (int i = 0; i < kpts.size(); i += 2) {
cv::Point p(kpts[i], kpts[i + 1]);
cv::circle(frame, p, 1, cv::Scalar(0, 255, 255), 2, cv::LINE_AA);
}
for (int i = 0; i < skeleton.size(); ++i) {
auto [u, v] = skeleton[i];
cv::Point p_u(kpts[u * 2], kpts[u * 2 + 1]);
cv::Point p_v(kpts[v * 2], kpts[v * 2 + 1]);
cv::line(frame, p_u, p_v, cv::Scalar(0, 255, 255), 1, cv::LINE_AA);
}
}
cv::imshow("", frame);
cv::waitKey(10);
}
int main(int argc, char* argv[]) {
const auto device_name = argv[1];
const auto det_model_path = argv[2];
const auto pose_model_path = argv[3];
const auto video_path = argv[4];
Device device(device_name);
Context context(device);
auto pool = Scheduler::ThreadPool(4);
auto infer = Scheduler::Thread();
context.Add("pool", pool);
context.Add("infer", infer);
PoseTracker tracker(Model(det_model_path), Model(pose_model_path), context);
auto state = tracker.CreateState();
cv::Mat frame;
std::chrono::duration<double, std::milli> dt{};
int frame_id{};
cv::VideoCapture video(video_path);
while (true) {
video >> frame;
if (!frame.data) {
break;
}
auto t0 = std::chrono::high_resolution_clock::now();
auto result = tracker.Track(frame, state);
auto t1 = std::chrono::high_resolution_clock::now();
dt += t1 - t0;
++frame_id;
Visualize(frame, result);
}
MMDEPLOY_INFO("frames: {}, time {} ms", frame_id, dt.count());
}

View File

@ -2,7 +2,13 @@
Through user investigation, we know that most users are already familiar with python and torch before using mmdeploy. Therefore we provide scripts to simplify mmdeploy installation.
Assuming you have a python ready (whether `conda` or `pyenv`), run this script to install mmdeploy + ncnn backend, `nproc` is not compulsory.
Assuming you already have
- python3 -m pip (`conda` or `pyenv`)
- nvcc (depends on inference backend)
- torch (not compulsory)
run this script to install mmdeploy + ncnn backend; `nproc` is not compulsory.
```bash
$ cd /path/to/mmdeploy

View File

@ -39,3 +39,4 @@ Please visit the following links to find out how to build MMDeploy according to
- [NVIDIA Jetson](jetsons.md)
- [SNPE](snpe.md)
- [RISC-V](riscv.md)
- [Rockchip](rockchip.md)

View File

@ -0,0 +1,108 @@
# Ubuntu Cross Build aarch64
mmdeploy chose ncnn as the inference backend for aarch64 embedded Linux devices. The workflow has two parts:
Host
- model conversion
- cross build SDK and demo for embedded devices
Device
- Run converted model
## 1. Model Conversion on Host
Refer to the docs to install [mmdeploy](../01-how-to-build/) and [mmcls](https://github.com/open-mmlab/mmclassification), then convert resnet18 into a model package:
```bash
export MODEL_CONFIG=/path/to/mmclassification/configs/resnet/resnet18_8xb32_in1k.py
export MODEL_PATH=https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.pth
# Convert resnet18
cd /path/to/mmdeploy
python tools/deploy.py \
configs/mmcls/classification_ncnn_static.py \
$MODEL_CONFIG \
$MODEL_PATH \
tests/data/tiger.jpeg \
--work-dir resnet18 \
--device cpu \
--dump-info
```
## 2. Cross Build on Host
It is recommended to compile directly with the script
```bash
sh -x tools/scripts/ubuntu_cross_build_aarch64.sh
```
The following is the manual process corresponding to the script:
a) Install aarch64 build tools
```bash
sudo apt install -y gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
```
b) Cross build opencv and install to /tmp/ocv-aarch64
```bash
git clone https://github.com/opencv/opencv --depth=1 --branch=4.x --recursive
cd opencv/platforms/linux/
mkdir build && cd build
cmake ../../.. \
-DCMAKE_INSTALL_PREFIX=/tmp/ocv-aarch64 \
-DCMAKE_TOOLCHAIN_FILE=../aarch64-gnu.toolchain.cmake
make -j && make install
ls -alh /tmp/ocv-aarch64
..
```
c) Cross build ncnn and install to /tmp/ncnn-aarch64
```bash
git clone https://github.com/tencent/ncnn --branch 20220729 --depth=1
cd ncnn
mkdir build && cd build
cmake .. \
-DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake \
-DCMAKE_INSTALL_PREFIX=/tmp/ncnn-aarch64
make -j && make install
ls -alh /tmp/ncnn-aarch64
..
```
d) Cross build mmdeploy
```bash
cd /path/to/mmdeploy
git submodule init
git submodule update
mkdir build && cd build
cmake .. \
-DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/aarch64-linux-gnu.cmake \
-DMMDEPLOY_TARGET_DEVICES="cpu" \
-DMMDEPLOY_TARGET_BACKENDS="ncnn" \
-Dncnn_DIR=/tmp/ncnn-aarch64/lib/cmake/ncnn \
-DOpenCV_DIR=/tmp/ocv-aarch64/lib/cmake/opencv4
make install
ls -lah install/bin/*
..
```
## 3. Execute on Device
Make sure that `--dump-info` is used during model conversion, so that the `resnet18` directory contains the files required by the SDK such as `pipeline.json`.
Copy the model folder (resnet18), the executable (image_classification), the test image (tests/data/tiger.jpeg) and the prebuilt OpenCV (/tmp/ocv-aarch64) to the device.
```bash
./image_classification cpu ./resnet18 tiger.jpeg
..
label: 292, score: 0.9261
label: 282, score: 0.0726
label: 290, score: 0.0008
label: 281, score: 0.0002
label: 340, score: 0.0001
```

View File

@ -0,0 +1,147 @@
# Build for RKNN
This tutorial is based on Linux systems like Ubuntu 18.04 and Rockchip NPUs like `rk3588`.
## Installation
It is recommended to create a virtual environment for the project.
1. Get RKNN-Toolkit2:
```
git clone git@github.com:rockchip-linux/rknn-toolkit2.git
```
2. Install the RKNN Python package following the [official doc](https://github.com/rockchip-linux/rknn-toolkit2/tree/master/doc). In our testing, we used rknn-toolkit2 1.2.0 with commit id `834ba0b0a1ab8ee27024443d77b02b5ba48b67fc`. When installing rknn-toolkit2, it is better to append `--no-deps` to the commands to avoid dependency conflicts. For example:
```
pip install packages/rknn_toolkit2-1.2.0_f7bb160f-cp36-cp36m-linux_x86_64.whl --no-deps
```
3. Install onnx==1.8.0 before reinstalling MMDeploy from source following the [instructions](../01-how-to-build/build_from_source.md). Note that there are conflicts between the pip dependencies of MMDeploy and RKNN. Here are the suggested package versions for Python 3.6:
```
protobuf==3.19.4
onnx==1.8.0
onnxruntime==1.8.0
torch==1.8.0
torchvision==0.9.0
```
4. Install torch and torchvision using conda. For example:
```
conda install pytorch==1.8.0 torchvision==0.9.0 cudatoolkit=11.1 -c pytorch -c conda-forge
```
To work with models from [MMClassification](https://mmclassification.readthedocs.io/en/latest/getting_started.html), you may need to install it additionally.
## Usage
Example:
```bash
python tools/deploy.py \
configs/mmcls/classification_rknn_static.py \
/mmclassification_dir/configs/resnet/resnet50_8xb32_in1k.py \
https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth \
/mmclassification_dir/demo/demo.JPEG \
--work-dir ../resnet50 \
--device cpu
```
## Deployment config
With the deployment config, you can modify the `backend_config` for your preference. An example `backend_config` for mmclassification is shown below:
```python
backend_config = dict(
type='rknn',
common_config=dict(
mean_values=None,
std_values=None,
target_platform='rk3588',
optimization_level=3),
quantization_config=dict(do_quantization=False, dataset=None),
input_size_list=[[3, 224, 224]])
```
The contents of `common_config` are for `rknn.config()`. The contents of `quantization_config` are used to control `rknn.build()`.
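As a rough illustration of that mapping, here is a minimal sketch (not MMDeploy's actual converter code; the ONNX and RKNN file names are hypothetical placeholders) of how the two dicts feed the rknn-toolkit2 API:
```python
from rknn.api import RKNN

common_config = dict(
    mean_values=None,
    std_values=None,
    target_platform='rk3588',
    optimization_level=3)
quantization_config = dict(do_quantization=False, dataset=None)

rknn = RKNN()
rknn.config(**common_config)          # common_config -> rknn.config()
rknn.load_onnx(model='end2end.onnx')  # hypothetical intermediate ONNX file
rknn.build(**quantization_config)     # quantization_config -> rknn.build()
rknn.export_rknn('end2end.rknn')      # hypothetical output name
```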
## Build SDK with Rockchip NPU
1. Get rknpu2:
```
git clone git@github.com:rockchip-linux/rknpu2.git
```
2. For Linux, download the GCC cross compiler. The download link in the official `rknpu2` user guide is deprecated; you may use another verified [link](https://github.com/Caesar-github/gcc-buildroot-9.3.0-2020.03-x86_64_aarch64-rockchip-linux-gnu). After downloading and unzipping the compiler, open a terminal and set `RKNN_TOOL_CHAIN` and `RKNPU2_DEVICE_DIR`: `export RKNN_TOOL_CHAIN=/path/to/gcc/usr; export RKNPU2_DEVICE_DIR=/path/to/rknpu2/runtime/RK3588`.
3. After the above preparation, run the following commands:
```shell
cd /path/to/mmdeploy
mkdir -p build && rm -rf build/CM* && cd build
export LD_LIBRARY_PATH=$RKNN_TOOL_CHAIN/lib64:$LD_LIBRARY_PATH
cmake \
-DCMAKE_TOOLCHAIN_FILE=/path/to/mmdeploy/cmake/toolchains/rknpu2-linux-gnu.cmake \
-DMMDEPLOY_BUILD_SDK=ON \
-DCMAKE_BUILD_TYPE=Debug \
-DOpenCV_DIR=${RKNPU2_DEVICE_DIR}/../../examples/3rdparty/opencv/opencv-linux-aarch64/share/OpenCV \
-DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \
-DMMDEPLOY_TARGET_DEVICES="cpu" \
-DMMDEPLOY_TARGET_BACKENDS="rknn" \
-DMMDEPLOY_CODEBASES=all \
-DMMDEPLOY_BUILD_TEST=ON \
-DMMDEPLOY_BUILD_EXAMPLES=ON \
..
make && make install
```
## Run the demo with SDK
First make sure that `--dump-info` is used during model conversion, so that the working directory has the files required by the SDK, such as `pipeline.json`.
Use `adb push` to copy the model directory, the executable file and the `.so` libraries to the device.
```bash
cd /path/to/mmdeploy
adb push resnet50 /data/local/tmp/resnet50
adb push /mmclassification_dir/demo/demo.JPEG /data/local/tmp/resnet50/demo.JPEG
cd build
adb push lib /data/local/tmp/lib
adb push bin/image_classification /data/local/tmp/image_classification
```
Set up the environment variables and execute the sample.
```bash
adb shell
cd /data/local/tmp
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/data/local/tmp/lib
./image_classification cpu ./resnet50 ./resnet50/demo.JPEG
..
label: 65, score: 0.95
```
## Troubleshooting
- Quantization fails.
Empirically, RKNN requires unnormalized inputs if `do_quantization` is set to `True`. Please modify the settings of `Normalize` in the `model_cfg` from
```python
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
```
to
```python
img_norm_cfg = dict(
mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True)
```
Besides, the `mean_values` and `std_values` of `deploy_cfg` should be replaced with the original normalization settings of `model_cfg`, i.e. `mean_values=[123.675, 116.28, 103.53]` and `std_values=[58.395, 57.12, 57.375]`.
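Putting the two changes together, a sketch of a quantization-enabled `backend_config` might look like the following (the calibration `dataset` file is a hypothetical placeholder; see the rknn-toolkit2 docs for its exact format):
```python
backend_config = dict(
    type='rknn',
    common_config=dict(
        # normalization moved here from model_cfg's img_norm_cfg
        mean_values=[123.675, 116.28, 103.53],
        std_values=[58.395, 57.12, 57.375],
        target_platform='rk3588',
        optimization_level=3),
    quantization_config=dict(
        do_quantization=True,
        dataset='/path/to/calibration_dataset.txt'),  # hypothetical path
    input_size_list=[[3, 224, 224]])
```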

View File

@ -21,7 +21,7 @@
______________________________________________________________________
This tutorial takes `mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1.zip` and `mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0.zip` as examples to show how to use the prebuilt packages.
This tutorial takes `mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1.zip` and `mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0.zip` as examples to show how to use the prebuilt packages.
The directory structure of the prebuilt package is as follows, where the `dist` folder is about model converter, and the `sdk` folder is related to model inference.
@ -80,9 +80,9 @@ In order to use `ONNX Runtime` backend, you should also do the following steps.
5. Install `mmdeploy` (Model Converter) and `mmdeploy_python` (SDK Python API).
```bash
# download mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1.zip
pip install .\mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\dist\mmdeploy-0.9.0-py38-none-win_amd64.whl
pip install .\mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\sdk\python\mmdeploy_python-0.9.0-cp38-none-win_amd64.whl
# download mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1.zip
pip install .\mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\dist\mmdeploy-0.10.0-py38-none-win_amd64.whl
pip install .\mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\sdk\python\mmdeploy_python-0.10.0-cp38-none-win_amd64.whl
```
:point_right: If you have installed it before, please uninstall it first.
@ -107,9 +107,9 @@ In order to use `TensorRT` backend, you should also do the following steps.
5. Install `mmdeploy` (Model Converter) and `mmdeploy_python` (SDK Python API).
```bash
# download mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0.zip
pip install .\mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\dist\mmdeploy-0.9.0-py38-none-win_amd64.whl
pip install .\mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\python\mmdeploy_python-0.9.0-cp38-none-win_amd64.whl
# download mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0.zip
pip install .\mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\dist\mmdeploy-0.10.0-py38-none-win_amd64.whl
pip install .\mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\python\mmdeploy_python-0.10.0-cp38-none-win_amd64.whl
```
:point_right: If you have installed it before, please uninstall it first.
@ -138,7 +138,7 @@ After preparation work, the structure of the current working directory should be
```
..
|-- mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1
|-- mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1
|-- mmclassification
|-- mmdeploy
`-- resnet18_8xb32_in1k_20210831-fbbb1da6.pth
@ -186,7 +186,7 @@ After installation of mmdeploy-tensorrt prebuilt package, the structure of the c
```
..
|-- mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0
|-- mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0
|-- mmclassification
|-- mmdeploy
`-- resnet18_8xb32_in1k_20210831-fbbb1da6.pth
@ -249,8 +249,8 @@ The structure of current working directory
```
.
|-- mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0
|-- mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1
|-- mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0
|-- mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1
|-- mmclassification
|-- mmdeploy
|-- resnet18_8xb32_in1k_20210831-fbbb1da6.pth
@ -311,7 +311,7 @@ The following describes how to use the SDK's C API for inference
1. Build examples
Under `mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\sdk\example` directory
Under `mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\sdk\example` directory
```
// Path should be modified according to the actual location
@ -319,7 +319,7 @@ The following describes how to use the SDK's C API for inference
cd build
cmake ..\cpp -A x64 -T v142 `
-DOpenCV_DIR=C:\Deps\opencv\build\x64\vc15\lib `
-DMMDeploy_DIR=C:\workspace\mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\sdk\lib\cmake\MMDeploy `
-DMMDeploy_DIR=C:\workspace\mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\sdk\lib\cmake\MMDeploy `
-DONNXRUNTIME_DIR=C:\Deps\onnxruntime\onnxruntime-win-gpu-x64-1.8.1
cmake --build . --config Release
@ -329,7 +329,7 @@ The following describes how to use the SDK's C API for inference
:point_right: The purpose is to make the exe find the relevant dlls at runtime
If you choose to add environment variables, add the runtime library path of `mmdeploy` (`mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\sdk\bin`) to the `PATH`.
If you choose to add environment variables, add the runtime library path of `mmdeploy` (`mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\sdk\bin`) to the `PATH`.
If you choose to copy the dynamic libraries, copy the dlls in the bin directory to the same directory as the just-built exe (build/Release).
@ -337,7 +337,7 @@ The following describes how to use the SDK's C API for inference
It is recommended to use `CMD` here.
Under `mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\\sdk\\example\\build\\Release` directory
Under `mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\\sdk\\example\\build\\Release` directory
```
.\image_classification.exe cpu C:\workspace\work_dir\onnx\resnet\ C:\workspace\mmclassification\demo\demo.JPEG
@ -347,7 +347,7 @@ The following describes how to use the SDK's C API for inference
1. Build examples
Under `mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\\sdk\\example` directory
Under `mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\\sdk\\example` directory
```
// Path should be modified according to the actual location
@ -355,7 +355,7 @@ The following describes how to use the SDK's C API for inference
cd build
cmake ..\cpp -A x64 -T v142 `
-DOpenCV_DIR=C:\Deps\opencv\build\x64\vc15\lib `
-DMMDeploy_DIR=C:\workspace\mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\lib\cmake\MMDeploy `
-DMMDeploy_DIR=C:\workspace\mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\lib\cmake\MMDeploy `
-DTENSORRT_DIR=C:\Deps\tensorrt\TensorRT-8.2.3.0 `
-DCUDNN_DIR=C:\Deps\cudnn\8.2.1
cmake --build . --config Release
@ -365,7 +365,7 @@ The following describes how to use the SDK's C API for inference
:point_right: The purpose is to make the exe find the relevant dlls at runtime
If you choose to add environment variables, add the runtime library path of `mmdeploy` (`mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\bin`) to the `PATH`.
If you choose to add environment variables, add the runtime library path of `mmdeploy` (`mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\bin`) to the `PATH`.
If you choose to copy the dynamic libraries, copy the dlls in the bin directory to the same directory as the just-built exe (build/Release).
@ -373,7 +373,7 @@ The following describes how to use the SDK's C API for inference
It is recommended to use `CMD` here.
Under `mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\\sdk\\example\\build\\Release` directory
Under `mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\\sdk\\example\\build\\Release` directory
```
.\image_classification.exe cuda C:\workspace\work_dir\trt\resnet C:\workspace\mmclassification\demo\demo.JPEG

View File

@ -1,80 +1,9 @@
# RKNN support
# Supported RKNN feature
This tutorial is based on Linux systems like Ubuntu-18.04 and Rockchip NPU like `rk3588`.
Currently, MMDeploy has only been tested on rk3588 with the Linux platform.
## Installation
The following features cannot be automatically enabled by mmdeploy; you need to modify the MMDeploy configuration manually, as in [this config](https://github.com/open-mmlab/mmdeploy/blob/master/configs/_base_/backends/rknn.py).
It is recommended to create a virtual environment for the project.
1. Get RKNN-Toolkit2:
```
git clone https://github.com/rockchip-linux/rknn-toolkit2
```
2. Install the RKNN python package following the [official doc](https://github.com/rockchip-linux/rknn-toolkit2/tree/master/doc). In our testing, we used rknn-toolkit 1.2.0 with commit id `834ba0b0a1ab8ee27024443d77b02b5ba48b67fc`.
3. Reinstall MMDeploy from source following the [instructions](../01-how-to-build/build_from_source.md). Note that there are conflicts between the pip dependencies of MMDeploy and RKNN. Here are the suggested package versions for Python 3.6:
```
protobuf==3.19.4
onnx==1.8.0
onnxruntime==1.8.0
torch==1.8.0
torchvision==0.9.0
```
To work with models from [MMDetection](https://github.com/open-mmlab/mmdetection/blob/master/docs/get_started.md), you may need to install it additionally.
## Usage
Example:
```bash
python tools/deploy.py \
configs/mmdet/detection/detection_rknn_static.py \
/mmdetection_dir/mmdetection/configs/yolo/yolov3_d53_mstrain-608_273e_coco.py \
/tmp/snapshots/yolov3_d53_mstrain-608_273e_coco_20210518_115020-a2c3acb8.pth \
tests/data/tiger.jpeg \
--work-dir ../deploy_result \
--device cpu
```
## Deployment config
With the deployment config, you can modify the `backend_config` to your preference. An example `backend_config` for mmclassification is shown below:
```python
backend_config = dict(
type='rknn',
common_config=dict(
mean_values=None,
std_values=None,
target_platform='rk3588',
optimization_level=3),
quantization_config=dict(do_quantization=False, dataset=None),
input_size_list=[[3, 224, 224]])
```
The contents of `common_config` are for `rknn.config()`. The contents of `quantization_config` are used to control `rknn.build()`.
## Troubleshooting
- Quantization fails.
Empirically, RKNN requires unnormalized inputs if `do_quantization` is set to `False`. Please modify the settings of `Normalize` in the `model_cfg` from
```python
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
```
to
```python
img_norm_cfg = dict(
mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True)
```
Besides, the `mean_values` and `std_values` of `deploy_cfg` should be set to the original normalization settings of `model_cfg`, i.e. `mean_values=[123.675, 116.28, 103.53]` and `std_values=[58.395, 57.12, 57.375]`.
- target_platform other than `3588`
- quantization settings
- optimization level other than 3

View File

@ -118,11 +118,11 @@ Take the latest precompiled package as example, you can install it as follows:
```shell
# install MMDeploy
wget https://github.com/open-mmlab/mmdeploy/releases/download/v0.9.0/mmdeploy-0.9.0-linux-x86_64-onnxruntime1.8.1.tar.gz
tar -zxvf mmdeploy-0.9.0-linux-x86_64-onnxruntime1.8.1.tar.gz
cd mmdeploy-0.9.0-linux-x86_64-onnxruntime1.8.1
pip install dist/mmdeploy-0.9.0-py3-none-linux_x86_64.whl
pip install sdk/python/mmdeploy_python-0.9.0-cp38-none-linux_x86_64.whl
wget https://github.com/open-mmlab/mmdeploy/releases/download/v0.10.0/mmdeploy-0.10.0-linux-x86_64-onnxruntime1.8.1.tar.gz
tar -zxvf mmdeploy-0.10.0-linux-x86_64-onnxruntime1.8.1.tar.gz
cd mmdeploy-0.10.0-linux-x86_64-onnxruntime1.8.1
pip install dist/mmdeploy-0.10.0-py3-none-linux_x86_64.whl
pip install sdk/python/mmdeploy_python-0.10.0-cp38-none-linux_x86_64.whl
cd ..
# install inference engine: ONNX Runtime
pip install onnxruntime==1.8.1
@ -139,11 +139,11 @@ export LD_LIBRARY_PATH=$ONNXRUNTIME_DIR/lib:$LD_LIBRARY_PATH
```shell
# install MMDeploy
wget https://github.com/open-mmlab/mmdeploy/releases/download/v0.9.0/mmdeploy-0.9.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz
tar -zxvf mmdeploy-0.9.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz
cd mmdeploy-0.9.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0
pip install dist/mmdeploy-0.9.0-py3-none-linux_x86_64.whl
pip install sdk/python/mmdeploy_python-0.9.0-cp38-none-linux_x86_64.whl
wget https://github.com/open-mmlab/mmdeploy/releases/download/v0.10.0/mmdeploy-0.10.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz
tar -zxvf mmdeploy-0.10.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz
cd mmdeploy-0.10.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0
pip install dist/mmdeploy-0.10.0-py3-none-linux_x86_64.whl
pip install sdk/python/mmdeploy_python-0.10.0-cp38-none-linux_x86_64.whl
cd ..
# install inference engine: TensorRT
# !!! Download TensorRT-8.2.3.0 CUDA 11.x tar package from NVIDIA, and extract it to the current directory
@ -232,7 +232,7 @@ result = inference_model(
You can directly run MMDeploy demo programs in the precompiled package to get inference results.
```shell
cd mmdeploy-0.9.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0
cd mmdeploy-0.10.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0
# run python demo
python sdk/example/python/object_detection.py cuda ../mmdeploy_model/faster-rcnn ../mmdetection/demo/demo.jpg
# run C/C++ demo

View File

@ -61,6 +61,7 @@ You can switch between Chinese and English documents in the lower-left corner of
05-supported-backends/snpe.md
05-supported-backends/tensorrt.md
05-supported-backends/torchscript.md
05-supported-backends/rknn.md
.. toctree::
:maxdepth: 1

View File

@ -2,7 +2,13 @@
Through user surveys, we learned that most users are already familiar with python and torch before getting to know mmdeploy. Therefore we provide scripts that simplify the installation of mmdeploy.
Assuming you have a Python 3.6+ pip environment ready (either conda or pyenv), run this script to install mmdeploy + ncnn backend; `nproc` may be omitted.
Assuming you have prepared:
- python3 -m pip (required; conda or pyenv both work)
- nvcc (depending on the inference backend)
- torch (optional; can be installed later)
Run this script to install mmdeploy + ncnn backend; `nproc` may be omitted.
```bash
$ cd /path/to/mmdeploy

View File

@ -42,3 +42,4 @@ git clone -b master git@github.com:open-mmlab/mmdeploy.git --recursive
- [NVIDIA Jetson](jetsons.md)
- [Qcom SNPE](snpe.md)
- [RISC-V](riscv.md)
- [Rockchip](rockchip.md)

View File

@ -0,0 +1,108 @@
# Cross compile for aarch64 on Ubuntu
mmdeploy uses ncnn as the inference backend for aarch64 embedded Linux devices. A complete deployment consists of two parts:
Host
- model conversion
- cross compiling the SDK and binaries required by the embedded device
Device
- running the build artifacts
## 1. Model conversion on Host
Install [mmdeploy](../01-how-to-build/) and [mmcls](https://github.com/open-mmlab/mmclassification) following the documentation, then convert the resnet18 model package
```bash
export MODEL_CONFIG=/path/to/mmclassification/configs/resnet/resnet18_8xb32_in1k.py
export MODEL_PATH=https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.pth
# model conversion
cd /path/to/mmdeploy
python tools/deploy.py \
configs/mmcls/classification_ncnn_static.py \
$MODEL_CONFIG \
$MODEL_PATH \
tests/data/tiger.jpeg \
--work-dir resnet18 \
--device cpu \
--dump-info
```
## 2. Cross compilation on Host
It is recommended to build directly with the script
```bash
sh -x tools/scripts/ubuntu_cross_build_aarch64.sh
```
The following are the manual steps corresponding to the script
a) Install the aarch64 cross-compilation toolchain
```bash
sudo apt install -y gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
```
b) Cross compile opencv and install it to the tmp directory
```bash
git clone https://github.com/opencv/opencv --depth=1 --branch=4.x --recursive
cd opencv/platforms/linux/
mkdir build && cd build
cmake ../../.. \
-DCMAKE_INSTALL_PREFIX=/tmp/ocv-aarch64 \
-DCMAKE_TOOLCHAIN_FILE=../aarch64-gnu.toolchain.cmake
make -j && make install
ls -alh /tmp/ocv-aarch64
..
```
c) Cross compile ncnn and install it to the tmp directory
```bash
git clone https://github.com/tencent/ncnn --branch 20220729 --depth=1
cd ncnn
mkdir build && cd build
cmake .. \
-DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake \
-DCMAKE_INSTALL_PREFIX=/tmp/ncnn-aarch64
make -j && make install
ls -alh /tmp/ncnn-aarch64
..
```
d) Cross compile mmdeploy; the install/bin directory contains the executables
```bash
git submodule init
git submodule update
mkdir build && cd build
cmake .. \
-DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/aarch64-linux-gnu.cmake \
-DMMDEPLOY_TARGET_DEVICES="cpu" \
-DMMDEPLOY_TARGET_BACKENDS="ncnn" \
-Dncnn_DIR=/tmp/ncnn-aarch64/lib/cmake/ncnn \
-DOpenCV_DIR=/tmp/ocv-aarch64/lib/cmake/opencv4
make install
ls -lah install/bin/*
..
```
## 3. Run on Device
Make sure `--dump-info` was used during model conversion, so that the `resnet18` directory contains `pipeline.json` and the other files required by the SDK.
Copy the dumped model directory (resnet18), the executable (image_classification), the test image (tests/data/tiger.jpeg) and the cross-compiled OpenCV (/tmp/ocv-aarch64) to the device, for example as sketched below, then run the demo
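One possible copy step, assuming the board is reachable over SSH (user, address and target directory below are placeholders):
```bash
# push the model package, the demo binary, the test image and the
# cross-compiled OpenCV runtime libraries to the board
scp -r resnet18 tests/data/tiger.jpeg build/install/bin/image_classification root@192.168.1.100:/root/demo/
scp -r /tmp/ocv-aarch64/lib root@192.168.1.100:/root/demo/ocv-lib
```
Then, on the device: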
```bash
./image_classification cpu ./resnet18 tiger.jpeg
..
label: 292, score: 0.9261
label: 282, score: 0.0726
label: 290, score: 0.0008
label: 281, score: 0.0002
label: 340, score: 0.0001
```

View File

@ -0,0 +1,147 @@
# RKNN support
This tutorial is based on Ubuntu-18.04 and the Rockchip `rk3588` NPU.
## Installation
It is recommended to create a virtual environment for the project.
1. Get RKNN-Toolkit2:
```
git clone git@github.com:rockchip-linux/rknn-toolkit2.git
```
2. Install the RKNN python package following the [official doc](https://github.com/rockchip-linux/rknn-toolkit2/tree/master/doc). In our testing, we used rknn-toolkit 1.2.0 with commit id `834ba0b0a1ab8ee27024443d77b02b5ba48b67fc`. When installing rknn-toolkit2, it is better to append `--no-deps` to the install command to avoid dependency conflicts, for example:
```
pip install packages/rknn_toolkit2-1.2.0_f7bb160f-cp36-cp36m-linux_x86_64.whl --no-deps
```
3. Install onnx==1.8.0 first, then install MMDeploy from source following the [instructions](../01-how-to-build/build_from_source.md). Note that there are conflicts between the pip dependencies of MMDeploy and RKNN. Here are the suggested package versions for a Python 3.6 environment:
```
protobuf==3.19.4
onnx==1.8.0
onnxruntime==1.8.0
torch==1.8.0
torchvision==0.9.0
```
4. Install torch and torchvision with conda, for example:
```
conda install pytorch==1.8.0 torchvision==0.9.0 cudatoolkit=11.1 -c pytorch -c conda-forge
```
To use [MMClassification](https://mmclassification.readthedocs.io/en/latest/getting_started.html), you need to install it yourself.
## Usage
Example:
```bash
python tools/deploy.py \
configs/mmcls/classification_rknn_static.py \
/mmclassification_dir/configs/resnet/resnet50_8xb32_in1k.py \
https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth \
/mmclassification_dir/demo/demo.JPEG \
--work-dir ../resnet50 \
--device cpu
```
## Deployment config
In the deployment config, you can modify the `backend_config` field as needed. An example `backend_config` for mmclassification is shown below:
```python
backend_config = dict(
type='rknn',
common_config=dict(
mean_values=None,
std_values=None,
target_platform='rk3588',
optimization_level=3),
quantization_config=dict(do_quantization=False, dataset=None),
input_size_list=[[3, 224, 224]])
```
The contents of `common_config` are passed to `rknn.config()`. The contents of `quantization_config` are passed to `rknn.build()`.
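As a rough illustration of that mapping, here is a sketch based on the public RKNN-Toolkit2 API (this is not MMDeploy code; `end2end.onnx` and `end2end.rknn` are placeholder names):
```python
from rknn.api import RKNN

rknn = RKNN()
# contents of common_config go to rknn.config()
rknn.config(mean_values=None, std_values=None,
            target_platform='rk3588', optimization_level=3)
rknn.load_onnx(model='end2end.onnx')  # placeholder ONNX file
# contents of quantization_config go to rknn.build()
rknn.build(do_quantization=False, dataset=None)
rknn.export_rknn('end2end.rknn')      # placeholder output path
```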
## Install SDK
1. Get rknpu2:
```
git clone git@github.com:rockchip-linux/rknpu2.git
```
2. On Linux, download the gcc cross compiler. The official download link provided by `rknpu2` no longer works, so you can use this alternative [link](https://github.com/Caesar-github/gcc-buildroot-9.3.0-2020.03-x86_64_aarch64-rockchip-linux-gnu). After downloading and extracting the compiler, open a terminal and set `RKNN_TOOL_CHAIN` and `RKNPU2_DEVICE_DIR`: `export RKNN_TOOL_CHAIN=/path/to/gcc/usr;export RKNPU2_DEVICE_DIR=/path/to/rknpu2/runtime/RK3588`.
3. After the above preparations, run the following commands to build and install:
```shell
cd /path/to/mmdeploy
mkdir -p build && rm -rf build/CM* && cd build
export LD_LIBRARY_PATH=$RKNN_TOOL_CHAIN/lib64:$LD_LIBRARY_PATH
cmake \
-DCMAKE_TOOLCHAIN_FILE=/path/to/mmdeploy/cmake/toolchains/rknpu2-linux-gnu.cmake \
-DMMDEPLOY_BUILD_SDK=ON \
-DCMAKE_BUILD_TYPE=Debug \
-DOpenCV_DIR=${RKNPU2_DEVICE_DIR}/../../examples/3rdparty/opencv/opencv-linux-aarch64/share/OpenCV \
-DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \
-DMMDEPLOY_TARGET_DEVICES="cpu" \
-DMMDEPLOY_TARGET_BACKENDS="rknn" \
-DMMDEPLOY_CODEBASES=all \
-DMMDEPLOY_BUILD_TEST=ON \
-DMMDEPLOY_BUILD_EXAMPLES=ON \
..
make && make install
```
## Run the SDK demo
First, make sure `--dump-info` was used when converting the model, so that the working directory contains `pipeline.json` and the other configuration files required by the SDK.
Use `adb push` to transfer the model directory, the executable and the `.so` files to the board.
```bash
cd /path/to/mmdeploy
adb push resnet50 /data/local/tmp/resnet50
adb push /mmclassification_dir/demo/demo.JPEG /data/local/tmp/resnet50/demo.JPEG
cd build
adb push lib /data/local/tmp/lib
adb push bin/image_classification /data/local/tmp/image_classification
```
Set the environment variables and run the example.
```bash
adb shell
cd /data/local/tmp
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/data/local/tmp/lib
./image_classification cpu ./resnet50 ./resnet50/demo.JPEG
..
label: 65, score: 0.95
```
## Troubleshooting
- Quantization fails.
Empirically, RKNN requires unnormalized inputs if `do_quantization` is set to `True`. Please modify the settings of `Normalize` in the `model_cfg` from
```python
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
```
to
```python
img_norm_cfg = dict(
mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True)
```
Besides, the `mean_values` and `std_values` of `deploy_cfg` should be set to the original normalization settings of `model_cfg`, i.e. `mean_values=[123.675, 116.28, 103.53]` and `std_values=[58.395, 57.12, 57.375]`.

View File

@ -23,7 +23,7 @@ ______________________________________________________________________
Currently, `MMDeploy` provides prebuilt packages for `TensorRT` and `ONNX Runtime` on the `Windows` platform, available from [Releases](https://github.com/open-mmlab/mmdeploy/releases).
This tutorial takes `mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1.zip` and `mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0.zip` as examples to show how to use the prebuilt packages.
This tutorial takes `mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1.zip` and `mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0.zip` as examples to show how to use the prebuilt packages.
To help users get started quickly, this tutorial takes a classification model (mmclassification) as an example to show how to use both kinds of prebuilt packages.
@ -88,9 +88,9 @@ ______________________________________________________________________
5. Install the prebuilt packages of `mmdeploy` (model conversion) and `mmdeploy_python` (model inference Python API)
```bash
# download mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1.zip first
pip install .\mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\dist\mmdeploy-0.9.0-py38-none-win_amd64.whl
pip install .\mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\sdk\python\mmdeploy_python-0.9.0-cp38-none-win_amd64.whl
# download mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1.zip first
pip install .\mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\dist\mmdeploy-0.10.0-py38-none-win_amd64.whl
pip install .\mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\sdk\python\mmdeploy_python-0.10.0-cp38-none-win_amd64.whl
```
:point_right: If you have installed it before, uninstall it first and then reinstall.
@ -115,9 +115,9 @@ ______________________________________________________________________
5. Install the prebuilt packages of `mmdeploy` (model conversion) and `mmdeploy_python` (model inference Python API)
```bash
# download mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0.zip first
pip install .\mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\dist\mmdeploy-0.9.0-py38-none-win_amd64.whl
pip install .\mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\python\mmdeploy_python-0.9.0-cp38-none-win_amd64.whl
# download mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0.zip first
pip install .\mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\dist\mmdeploy-0.10.0-py38-none-win_amd64.whl
pip install .\mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\python\mmdeploy_python-0.10.0-cp38-none-win_amd64.whl
```
:point_right: If you have installed it before, uninstall it first and then reinstall
@ -146,7 +146,7 @@ ______________________________________________________________________
```
..
|-- mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1
|-- mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1
|-- mmclassification
|-- mmdeploy
`-- resnet18_8xb32_in1k_20210831-fbbb1da6.pth
@ -194,7 +194,7 @@ export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, device=device)
```
..
|-- mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0
|-- mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0
|-- mmclassification
|-- mmdeploy
`-- resnet18_8xb32_in1k_20210831-fbbb1da6.pth
@ -257,8 +257,8 @@ export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, device=device)
```
.
|-- mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0
|-- mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1
|-- mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0
|-- mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1
|-- mmclassification
|-- mmdeploy
|-- resnet18_8xb32_in1k_20210831-fbbb1da6.pth
@ -327,7 +327,7 @@ python .\mmdeploy\demo\python\image_classification.py cpu .\work_dir\onnx\resnet
1. Build examples
Under the `mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\sdk\example` directory
Under the `mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\sdk\example` directory
```
// modify some paths according to their actual locations
@ -335,7 +335,7 @@ python .\mmdeploy\demo\python\image_classification.py cpu .\work_dir\onnx\resnet
cd build
cmake ..\cpp -A x64 -T v142 `
-DOpenCV_DIR=C:\Deps\opencv\build\x64\vc15\lib `
-DMMDeploy_DIR=C:\workspace\mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\sdk\lib\cmake\MMDeploy `
-DMMDeploy_DIR=C:\workspace\mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\sdk\lib\cmake\MMDeploy `
-DONNXRUNTIME_DIR=C:\Deps\onnxruntime\onnxruntime-win-gpu-x64-1.8.1
cmake --build . --config Release
@ -345,7 +345,7 @@ python .\mmdeploy\demo\python\image_classification.py cpu .\work_dir\onnx\resnet
:point_right: The purpose is to let the exe find the relevant dlls at runtime
If you choose to add environment variables, add the runtime library path of `mmdeploy` (`mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\sdk\bin`) to `PATH`, following the same procedure as for onnxruntime.
If you choose to add environment variables, add the runtime library path of `mmdeploy` (`mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\sdk\bin`) to `PATH`, following the same procedure as for onnxruntime.
If you choose to copy the dynamic libraries, copy the dlls in the bin directory to the same directory as the just-built exe (build/Release).
@ -353,7 +353,7 @@ python .\mmdeploy\demo\python\image_classification.py cpu .\work_dir\onnx\resnet
It is recommended to use `CMD` here, so that a popup will appear if the exe cannot find the relevant dlls at runtime
Under the mmdeploy-0.9.0-windows-amd64-onnxruntime1.8.1\\sdk\\example\\build\\Release directory
Under the mmdeploy-0.10.0-windows-amd64-onnxruntime1.8.1\\sdk\\example\\build\\Release directory
```
.\image_classification.exe cpu C:\workspace\work_dir\onnx\resnet\ C:\workspace\mmclassification\demo\demo.JPEG
@ -363,7 +363,7 @@ python .\mmdeploy\demo\python\image_classification.py cpu .\work_dir\onnx\resnet
1. Build examples
Under the mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\\sdk\\example directory
Under the mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\\sdk\\example directory
```
// modify some paths according to where they are located on your disk
@ -371,7 +371,7 @@ python .\mmdeploy\demo\python\image_classification.py cpu .\work_dir\onnx\resnet
cd build
cmake ..\cpp -A x64 -T v142 `
-DOpenCV_DIR=C:\Deps\opencv\build\x64\vc15\lib `
-DMMDeploy_DIR=C:\workspace\mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\lib\cmake\MMDeploy `
-DMMDeploy_DIR=C:\workspace\mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\lib\cmake\MMDeploy `
-DTENSORRT_DIR=C:\Deps\tensorrt\TensorRT-8.2.3.0 `
-DCUDNN_DIR=C:\Deps\cudnn\8.2.1
cmake --build . --config Release
@ -381,7 +381,7 @@ python .\mmdeploy\demo\python\image_classification.py cpu .\work_dir\onnx\resnet
:point_right: The purpose is to let the exe find the relevant dlls at runtime
If you choose to add environment variables, add the runtime library path of `mmdeploy` (`mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\bin`) to `PATH`, following the same procedure as for onnxruntime.
If you choose to add environment variables, add the runtime library path of `mmdeploy` (`mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\sdk\bin`) to `PATH`, following the same procedure as for onnxruntime.
If you choose to copy the dynamic libraries, copy the dlls in the bin directory to the same directory as the just-built exe (build/Release).
@ -389,7 +389,7 @@ python .\mmdeploy\demo\python\image_classification.py cpu .\work_dir\onnx\resnet
It is recommended to use `CMD` here, so that a popup will appear if the exe cannot find the relevant dlls at runtime
Under the mmdeploy-0.9.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\\sdk\\example\\build\\Release directory
Under the mmdeploy-0.10.0-windows-amd64-cuda11.1-tensorrt8.2.3.0\\sdk\\example\\build\\Release directory
```
.\image_classification.exe cuda C:\workspace\work_dir\trt\resnet C:\workspace\mmclassification\demo\demo.JPEG

View File

@ -0,0 +1,9 @@
# Supported RKNN features
Currently, MMDeploy has only been tested on rk3588 with the Linux platform.
The following features need to be configured manually in MMDeploy, e.g. in [this config](https://github.com/open-mmlab/mmdeploy/blob/master/configs/_base_/backends/rknn.py).
- target_platform = `3588`
- quantization settings
- optimization level = 3

View File

@ -113,11 +113,11 @@ mim install mmcv-full
```shell
# install MMDeploy ONNX Runtime custom ops and the inference SDK
wget https://github.com/open-mmlab/mmdeploy/releases/download/v0.9.0/mmdeploy-0.9.0-linux-x86_64-onnxruntime1.8.1.tar.gz
tar -zxvf mmdeploy-0.9.0-linux-x86_64-onnxruntime1.8.1.tar.gz
cd mmdeploy-0.9.0-linux-x86_64-onnxruntime1.8.1
pip install dist/mmdeploy-0.9.0-py3-none-linux_x86_64.whl
pip install sdk/python/mmdeploy_python-0.9.0-cp38-none-linux_x86_64.whl
wget https://github.com/open-mmlab/mmdeploy/releases/download/v0.10.0/mmdeploy-0.10.0-linux-x86_64-onnxruntime1.8.1.tar.gz
tar -zxvf mmdeploy-0.10.0-linux-x86_64-onnxruntime1.8.1.tar.gz
cd mmdeploy-0.10.0-linux-x86_64-onnxruntime1.8.1
pip install dist/mmdeploy-0.10.0-py3-none-linux_x86_64.whl
pip install sdk/python/mmdeploy_python-0.10.0-cp38-none-linux_x86_64.whl
cd ..
# install the inference engine: ONNX Runtime
pip install onnxruntime==1.8.1
@ -134,11 +134,11 @@ export LD_LIBRARY_PATH=$ONNXRUNTIME_DIR/lib:$LD_LIBRARY_PATH
```shell
# install MMDeploy TensorRT custom ops and the inference SDK
wget https://github.com/open-mmlab/mmdeploy/releases/download/v0.9.0/mmdeploy-0.9.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz
tar -zxvf mmdeploy-0.9.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz
cd mmdeploy-0.9.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0
pip install dist/mmdeploy-0.9.0-py3-none-linux_x86_64.whl
pip install sdk/python/mmdeploy_python-0.9.0-cp38-none-linux_x86_64.whl
wget https://github.com/open-mmlab/mmdeploy/releases/download/v0.10.0/mmdeploy-0.10.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz
tar -zxvf mmdeploy-0.10.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz
cd mmdeploy-0.10.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0
pip install dist/mmdeploy-0.10.0-py3-none-linux_x86_64.whl
pip install sdk/python/mmdeploy_python-0.10.0-cp38-none-linux_x86_64.whl
cd ..
# install the inference engine: TensorRT
# !!! Download the TensorRT-8.2.3.0 CUDA 11.x package from the NVIDIA website and extract it to the current directory
@ -226,7 +226,7 @@ result = inference_model(
You can directly run the demo programs in the prebuilt package, feeding an SDK model and an image, to perform inference and view the results.
```shell
cd mmdeploy-0.9.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0
cd mmdeploy-0.10.0-linux-x86_64-cuda11.1-tensorrt8.2.3.0
# run the python demo
python sdk/example/python/object_detection.py cuda ../mmdeploy_model/faster-rcnn ../mmdetection/demo/demo.jpg
# run the C/C++ demo

View File

@ -58,6 +58,7 @@
05-supported-backends/onnxruntime.md
05-supported-backends/openvino.md
05-supported-backends/pplnn.md
05-supported-backends/rknn.md
05-supported-backends/snpe.md
05-supported-backends/tensorrt.md
05-supported-backends/torchscript.md

View File

@ -169,7 +169,12 @@ def from_onnx(onnx_model: Union[str, onnx.ModelProto],
builder.max_workspace_size = max_workspace_size
config = builder.create_builder_config()
config.max_workspace_size = max_workspace_size
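# set_memory_pool_limit supersedes the deprecated max_workspace_size in
# newer TensorRT releases (8.4+); fall back for older versions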
if hasattr(config, 'set_memory_pool_limit'):
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE,
max_workspace_size)
else:
config.max_workspace_size = max_workspace_size
cuda_version = search_cuda_version()
if cuda_version is not None:
@ -187,14 +192,19 @@ def from_onnx(onnx_model: Union[str, onnx.ModelProto],
opt_shape = param['opt_shape']
max_shape = param['max_shape']
profile.set_shape(input_name, min_shape, opt_shape, max_shape)
config.add_optimization_profile(profile)
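# add_optimization_profile returns the index of the added profile,
# or a negative value if the profile is invalid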
if config.add_optimization_profile(profile) < 0:
logger.warning(f'Invalid optimization profile {profile}.')
if fp16_mode:
if not getattr(builder, 'platform_has_fast_fp16', True):
logger.warning('Platform does not have fast native fp16.')
if version.parse(trt.__version__) < version.parse('8'):
builder.fp16_mode = fp16_mode
config.set_flag(trt.BuilderFlag.FP16)
if int8_mode:
if not getattr(builder, 'platform_has_fast_int8', True):
logger.warning('Platform does not have fast native int8.')
from .calib_utils import HDF5Calibrator
config.set_flag(trt.BuilderFlag.INT8)
assert int8_param is not None

View File

@ -1,5 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .transformer import (MultiHeadAttentionop,
multiheadattention__forward__ncnn)
from . import conv2d_adaptive_padding # noqa: F401,F403
from .transformer import MultiHeadAttentionop
__all__ = ['multiheadattention__forward__ncnn', 'MultiHeadAttentionop']
__all__ = ['conv2d_adaptive_padding', 'MultiHeadAttentionop']

View File

@ -0,0 +1,86 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
import torch
import torch.nn.functional as F
from mmdeploy.core import FUNCTION_REWRITER
from mmdeploy.utils import Backend, is_dynamic_batch, is_dynamic_shape
def compute_padding(input_size, kernel_size, stride, dilation):
"""Compute padding."""
input_h, input_w = input_size
kernel_h, kernel_w = kernel_size
stride_h, stride_w = stride
dilation_h, dilation_w = dilation
output_h = math.ceil(input_h / stride_h)
output_w = math.ceil(input_w / stride_w)
pad_h = max(
(output_h - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - input_h,
0)
pad_w = max(
(output_w - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - input_w,
0)
if pad_w > 0 or pad_h > 0:
padded = [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
]
else:
padded = None
return padded
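# example: a 3x3 kernel with stride 2 and dilation 1 on a 224x224 input
# gives output ceil(224/2) = 112 and needs one extra row/column on the
# bottom/right: compute_padding((224, 224), (3, 3), (2, 2), (1, 1)) == [0, 1, 0, 1]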
class AdaptivePadOp(torch.autograd.Function):
"""Dummy adaptive pad op."""
@staticmethod
def forward(ctx, x, padded):
if padded is not None:
x = F.pad(x, padded)
return x
@staticmethod
def symbolic(g, x, padded):
if padded is None:
return g.op('Identity', x)
padded = g.op(
'Constant', value_t=torch.tensor(padded, dtype=torch.int64))
constant_value = g.op(
'Constant', value_t=torch.tensor(0, dtype=torch.int64))
return g.op(
'Pad', x, padded, constant_value, mode_s='constant', outputs=1)
@FUNCTION_REWRITER.register_rewriter(
func_name='mmcv.cnn.bricks.conv2d_adaptive_padding. \
Conv2dAdaptivePadding.forward',
backend=Backend.TENSORRT.value)
def conv2d_adaptive_padding__forward__tensorrt(ctx, self, x):
"""Rewrite `forward` of Conv2dAdaptivePadding used in EfficientNet for
TensorRT backend. Main changes of this rewritten function is to separate
the computation of padding and encapsulate it into another
`torch.autograd.Function` so that the adaptive padding could be parsed as
`Pad` ops in ONNX with the padding information computed in advance (Only
for static shape configuration).
Args:
x (Tensor): Input tensor of Conv2dAdaptivePadding ops
Returns:
Tensor: forward result of 2D convolution after padding
"""
deploy_cfg = ctx.cfg
is_dynamic_flag = is_dynamic_shape(deploy_cfg)
if (not is_dynamic_flag) or is_dynamic_batch(deploy_cfg):
padded = compute_padding(x.shape[2:], self.weight.shape[2:],
self.stride, self.dilation)
if padded is not None:
padded = [int(_) for _ in padded]
x = AdaptivePadOp.apply(x, padded)
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
self.dilation, self.groups)
else:
x = ctx.origin_func(x)
return x

View File

@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
__version__ = '0.9.0'
__version__ = '0.10.0'
short_version = __version__

Binary file not shown.

Before: 60 KiB → After: 315 KiB

View File

@ -210,4 +210,13 @@ models:
pipelines:
- *pipeline_ts_fp32
- *pipeline_ort_dynamic_fp32
- *pipeline_trt_dynamic_fp32
- *pipeline_trt_static_fp16
- name: EfficientNet
metafile: configs/efficientnet/metafile.yml
model_configs:
- configs/efficientnet/efficientnet-b0_8xb32_in1k.py
pipelines:
- *pipeline_ort_static_fp32
- convert_image: *convert_image
deploy_config: configs/mmcls/classification_tensorrt_dynamic-224x224-224x224.py

View File

@ -9,6 +9,7 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/core CORE_TC)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/preprocess TRANSFORM_TC)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/net NET_TC)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/model MODEL_TC)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/graph GRAPH_TC)
set(DEVICE_TC)
foreach (DEVICE IN LISTS MMDEPLOY_TARGET_DEVICES)
@ -58,7 +59,8 @@ set(TC_SRCS
${MODEL_TC}
${NET_TC}
${DEVICE_TC}
${CAPI_TC})
${CAPI_TC}
${GRAPH_TC})
add_executable(mmdeploy_tests ${TC_SRCS})
target_include_directories(mmdeploy_tests

View File

@ -0,0 +1,65 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "catch.hpp"
#include "mmdeploy/archive/json_archive.h"
#include "mmdeploy/core/graph.h"
#include "mmdeploy/core/registry.h"
#include "mmdeploy/experimental/module_adapter.h"
using namespace mmdeploy;
namespace {
class PlusCreator : public Creator<Module> {
public:
const char* GetName() const override { return "Plus"; }
std::unique_ptr<Module> Create(const Value&) override {
return CreateTask([](int a, int b) { return a + b; });
}
};
REGISTER_MODULE(Module, PlusCreator);
const auto json_config1 = R"(
{
"type": "Cond",
"input": ["pred", "a", "b"],
"output": "c",
"body": {
"type": "Task",
"module": "Plus"
}
}
)"_json;
} // namespace
TEST_CASE("test Cond node", "[graph]") {
auto config = from_json<Value>(json_config1);
auto builder = graph::Builder::CreateFromConfig(config).value();
REQUIRE(builder);
auto node = builder->Build().value();
REQUIRE(node);
{
auto result = SyncWait(node->Process(Just(Value({{false}, {1}, {1}}))));
MMDEPLOY_INFO("{}", result);
}
{
auto result = SyncWait(node->Process(Just(Value({{true}, {1}, {1}}))));
MMDEPLOY_INFO("{}", result);
}
{
auto result = SyncWait(
node->Process(Just(Value({{false, false, false, false}, {1, 2, 3, 4}, {1, 3, 5, 7}}))));
MMDEPLOY_INFO("{}", result);
}
{
auto result = SyncWait(
node->Process(Just(Value({{true, true, true, true}, {1, 2, 3, 4}, {1, 3, 5, 7}}))));
MMDEPLOY_INFO("{}", result);
}
{
auto result = SyncWait(
node->Process(Just(Value({{true, false, false, true}, {1, 2, 3, 4}, {1, 3, 5, 7}}))));
MMDEPLOY_INFO("{}", result);
}
}

View File

@ -30,3 +30,30 @@ def test_multiheadattention_ncnn():
else:
assert torch.allclose(
model_outputs, rewrite_outputs[0], rtol=1e-03, atol=1e-05)
def test_conv2d_adaptive_padding_tensorrt():
check_backend(Backend.TENSORRT)
from mmcv.cnn.bricks.conv2d_adaptive_padding import Conv2dAdaptivePadding
in_channels, out_channels = 3, 64
kernel_sz = 3
model = Conv2dAdaptivePadding(in_channels, out_channels, kernel_sz)
dummy_input = torch.rand(1, 3, 256, 256)
deploy_cfg = Config(
dict(
onnx_config=dict(input_shape=None),
backend_config=dict(type=Backend.TENSORRT.value),
))
model_outputs = model(dummy_input)
rewrite_inputs = dict(x=dummy_input)
rewrite_outputs, is_backend_output = get_rewrite_outputs(
wrapped_model=model,
model_inputs=rewrite_inputs,
deploy_cfg=deploy_cfg,
run_with_backend=True)
if is_backend_output is None:
assert rewrite_outputs is not None
else:
assert torch.allclose(
model_outputs, rewrite_outputs[0], rtol=1e-03, atol=1e-05)

View File

@ -1,2 +1,2 @@
# Copyright (c) OpenMMLab. All rights reserved.
__version__ = '0.9.0'
__version__ = '0.10.0'

View File

@ -9,7 +9,8 @@ g_jobs = 2
def install_protobuf(dep_dir) -> int:
"""build and install protobuf.
"""build and install protobuf. protobuf seems not support repeated install,
so clean build first.
Args:
dep_dir (str): the directory where dependencies are downloaded and built.
@ -29,11 +30,22 @@ def install_protobuf(dep_dir) -> int:
os.chdir(os.path.join(dep_dir, 'protobuf-3.20.0'))
install_dir = os.path.join(dep_dir, 'pbinstall')
if os.path.exists(install_dir):
os.system('rm -rf {}'.format(install_dir))
os.system('make clean')
os.system('./configure --prefix={}'.format(install_dir))
os.system('make -j {} && make install'.format(g_jobs))
protoc = os.path.join(dep_dir, 'pbinstall', 'bin', 'protoc')
protoc = os.path.join(install_dir, 'bin', 'protoc')
print('protoc \t:{}'.format(cmd_result('{} --version'.format(protoc))))
os.system(""" echo 'export PATH={}:$PATH' >> ~/mmdeploy.env """.format(
os.path.join(install_dir, 'bin')))
os.system(
""" echo 'export LD_LIBRARY_PATH={}:$LD_LIBRARY_PATH' >> ~/mmdeploy.env """ # noqa: E501
.format(os.path.join(install_dir, 'lib')))
return 0
@ -60,6 +72,7 @@ def install_pyncnn(dep_dir):
os.system('mkdir build')
os.chdir(os.path.join(ncnn_dir, 'build'))
os.system('rm -rf CMakeCache.txt')
pb_install = os.path.join(dep_dir, 'pbinstall')
pb_bin = os.path.join(pb_install, 'bin', 'protoc')
pb_lib = os.path.join(pb_install, 'lib', 'libprotobuf.so')
@ -101,9 +114,9 @@ def install_mmdeploy(work_dir, dep_dir, ncnn_cmake_dir):
pb_lib = os.path.join(pb_install, 'lib', 'libprotobuf.so')
pb_include = os.path.join(pb_install, 'include')
os.system('rm -rf build/CMakeCache.txt')
cmd = 'cd build && cmake ..'
cmd += ' -DCMAKE_C_COMPILER=gcc-7 '
cmd += ' -DCMAKE_CXX_COMPILER=g++-7 '
cmd += ' -DMMDEPLOY_BUILD_SDK=ON '
cmd += ' -DMMDEPLOY_BUILD_EXAMPLES=ON '
cmd += ' -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON '
@ -117,7 +130,14 @@ def install_mmdeploy(work_dir, dep_dir, ncnn_cmake_dir):
os.system('cd build && make -j {} && make install'.format(g_jobs))
os.system('python3 -m pip install -v -e .')
os.system('python3 tools/check_env.py')
os.system(""" echo 'export PATH={}:$PATH' >> ~/mmdeploy.env """.format(
os.path.join(work_dir, 'mmdeploy', 'backend', 'ncnn')))
try:
import mmcv
print(mmcv.__version__)
os.system('python3 tools/check_env.py')
except Exception:
print('Please install torch & mmcv later.. ╮(╯▽╰)╭')
return 0
@ -143,7 +163,7 @@ def main():
return -1
os.mkdir(dep_dir)
success, envs = ensure_base_env(work_dir, dep_dir)
success = ensure_base_env(work_dir, dep_dir)
if success != 0:
return -1
@ -155,12 +175,9 @@ def main():
if install_mmdeploy(work_dir, dep_dir, ncnn_cmake_dir) != 0:
return -1
if len(envs) > 0:
print(
'We recommend that you set the following environment variables:\n')
for env in envs:
print(env)
print('\n')
if os.path.exists(os.path.expanduser('~/mmdeploy.env')):
print('Please source ~/mmdeploy.env to setup your env !')
os.system('cat ~/mmdeploy.env')
if __name__ == '__main__':

View File

@ -20,7 +20,7 @@ def install_ort(dep_dir):
# git clone
if not os.path.exists('onnxruntime-linux-x64-1.8.1'):
os.system(
'wget https://github.com/microsoft/onnxruntime/releases/download/v1.8.1/onnxruntime-linux-x64-1.8.1.tgz' # noqa: E501
'wget -q --show-progress https://github.com/microsoft/onnxruntime/releases/download/v1.8.1/onnxruntime-linux-x64-1.8.1.tgz' # noqa: E501
)
os.system('tar xvf onnxruntime-linux-x64-1.8.1.tgz')
@ -41,9 +41,9 @@ def install_mmdeploy(work_dir, ort_dir):
if not os.path.exists('build'):
os.system('mkdir build')
os.system('rm -rf build/CMakeCache.txt')
cmd = 'cd build && cmake ..'
cmd += ' -DCMAKE_C_COMPILER=gcc-7 '
cmd += ' -DCMAKE_CXX_COMPILER=g++-7 '
cmd += ' -DMMDEPLOY_BUILD_SDK=ON '
cmd += ' -DMMDEPLOY_BUILD_EXAMPLES=ON '
cmd += ' -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON '
@ -54,7 +54,12 @@ def install_mmdeploy(work_dir, ort_dir):
os.system('cd build && make -j {} && make install'.format(g_jobs))
os.system('python3 -m pip install -e .')
os.system('python3 tools/check_env.py')
try:
import mmcv
print(mmcv.__version__)
os.system('python3 tools/check_env.py')
except Exception:
print('Please install torch & mmcv later.. ⊙▽⊙')
return 0
@ -80,7 +85,7 @@ def main():
return -1
os.mkdir(dep_dir)
success, envs = ensure_base_env(work_dir, dep_dir)
success = ensure_base_env(work_dir, dep_dir)
if success != 0:
return -1
@ -89,12 +94,9 @@ def main():
if install_mmdeploy(work_dir, ort_dir) != 0:
return -1
if len(envs) > 0:
print(
'We recommend that you set the following environment variables:\n')
for env in envs:
print(env)
print('\n')
if os.path.exists(os.path.expanduser('~/mmdeploy.env')):
print('Please source ~/mmdeploy.env to setup your env !')
os.system('cat ~/mmdeploy.env')
if __name__ == '__main__':

View File

@ -85,9 +85,9 @@ def install_mmdeploy(work_dir, pplnn_cmake_dir, pplcv_cmake_dir, build_cuda):
if not os.path.exists('build'):
os.system('mkdir build')
os.system('rm -rf build/CMakeCache.txt')
cmd = 'cd build && cmake ..'
cmd += ' -DCMAKE_C_COMPILER=gcc-7 '
cmd += ' -DCMAKE_CXX_COMPILER=g++-7 '
cmd += ' -DMMDEPLOY_BUILD_SDK=ON '
cmd += ' -DMMDEPLOY_BUILD_EXAMPLES=ON '
cmd += ' -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON '
@ -104,7 +104,12 @@ def install_mmdeploy(work_dir, pplnn_cmake_dir, pplcv_cmake_dir, build_cuda):
os.system('cd build && make -j {} && make install'.format(g_jobs))
os.system('python3 -m pip install -e .')
os.system('python3 tools/check_env.py')
try:
import mmcv
print(mmcv.__version__)
os.system('python3 tools/check_env.py')
except Exception:
print('Please install torch & mmcv later.. ∩▽∩')
return 0
@ -130,23 +135,10 @@ def main():
return -1
os.mkdir(dep_dir)
success, envs = ensure_base_env(work_dir, dep_dir)
success = ensure_base_env(work_dir, dep_dir)
if success != 0:
return -1
# enable g++ and gcc
gplus = cmd_result('which g++')
if gplus is None or len(gplus) < 1:
sudo = 'sudo'
if 'root' in cmd_result('whoami'):
sudo = ''
os.system(
'{} update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 200' # noqa: E501
.format(sudo))
os.system(
'{} update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-7 200' # noqa: E501
.format(sudo))
# install pplcv and pplnn
nvcc = cmd_result('which nvcc')
build_cuda = False
@ -158,12 +150,9 @@ def main():
build_cuda) != 0:
return -1
if len(envs) > 0:
print(
'We recommend that you set the following environment variables:\n')
for env in envs:
print(env)
print('\n')
if os.path.exists(os.path.expanduser('~/mmdeploy.env')):
print('Please source ~/mmdeploy.env to setup your env !')
os.system('cat ~/mmdeploy.env')
if __name__ == '__main__':

View File

@ -3,7 +3,8 @@ import os
import sys
import time
from ubuntu_utils import cmd_result, cu_version_name, ensure_base_env, get_job
from ubuntu_utils import (cmd_result, cu_version_name, ensure_base_env,
get_job, pytorch_version)
g_jobs = 2
@ -17,15 +18,9 @@ def install_libtorch(dep_dir):
if os.path.exists(unzipped_name):
return os.path.join(dep_dir, unzipped_name)
torch_version = None
try:
import torch
torch_version = torch.__version__
except Exception:
pass
torch_version = pytorch_version()
if torch_version is None:
print('torch version is None, use 1.11.0')
print('torch version is None, try 1.11.0')
torch_version = '1.11.0'
version_name = None
@ -46,7 +41,7 @@ def install_libtorch(dep_dir):
torch_version, version_name)
url = 'https://download.pytorch.org/libtorch/{}/{}'.format(
version_name, filename)
os.system('wget {} -O libtorch.zip'.format(url))
os.system('wget -q --show-progress {} -O libtorch.zip'.format(url))
os.system('unzip libtorch.zip')
if not os.path.exists(unzipped_name):
print(
@ -67,9 +62,9 @@ def install_mmdeploy(work_dir, libtorch_dir):
if not os.path.exists('build'):
os.system('mkdir build')
os.system('rm -rf build/CMakeCache.txt')
cmd = 'cd build && Torch_DIR={} cmake ..'.format(libtorch_dir)
cmd += ' -DCMAKE_C_COMPILER=gcc-7 '
cmd += ' -DCMAKE_CXX_COMPILER=g++-7 '
cmd += ' -DMMDEPLOY_BUILD_SDK=ON '
cmd += ' -DMMDEPLOY_BUILD_EXAMPLES=ON '
cmd += ' -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON '
@ -80,7 +75,12 @@ def install_mmdeploy(work_dir, libtorch_dir):
os.system('cd build && make -j {} && make install'.format(g_jobs))
os.system('python3 -m pip install -e .')
os.system('python3 tools/check_env.py')
try:
import mmcv
print(mmcv.__version__)
os.system('python3 tools/check_env.py')
except Exception:
print('Please install torch & mmcv later.. ≥▽≤')
return 0
@ -106,7 +106,7 @@ def main():
return -1
os.mkdir(dep_dir)
success, envs = ensure_base_env(work_dir, dep_dir)
success = ensure_base_env(work_dir, dep_dir)
if success != 0:
return -1
@ -118,12 +118,9 @@ def main():
if install_mmdeploy(work_dir, libtorch_dir) != 0:
return -1
if len(envs) > 0:
print(
'We recommend that you set the following environment variables:\n')
for env in envs:
print(env)
print('\n')
if os.path.exists(os.path.expanduser('~/mmdeploy.env')):
print('Please source ~/mmdeploy.env to setup your env !')
os.system('cat ~/mmdeploy.env')
if __name__ == '__main__':

View File

@ -0,0 +1,105 @@
#!/bin/bash
# set -ex
# get appropriate proc number: max(1, nproc-3)
good_nproc() {
num=`nproc`
num=`expr $num - 3`
if [ $num -lt 1 ];then
return 1
fi
return ${num}
}
install_tools() {
sudo apt install -y gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
aarch64-linux-gnu-g++ --version
aarch64-linux-gnu-gcc --version
aarch64-linux-gnu-ld --version
sudo apt install wget git git-lfs
python3 -m pip install cmake==3.22.0
echo 'export PATH=~/.local/bin:${PATH}' >> ~/mmdeploy.env
export PATH=~/.local/bin:${PATH}
}
build_ocv() {
if [ ! -e "opencv" ];then
git clone https://github.com/opencv/opencv --depth=1 --branch=4.6.0 --recursive
fi
if [ ! -e "opencv/platforms/linux/cross_build_aarch64" ];then
mkdir opencv/platforms/linux/cross_build_aarch64
fi
cd opencv/platforms/linux/cross_build_aarch64
rm -rf CMakeCache.txt
cmake ../../.. -DCMAKE_INSTALL_PREFIX=/tmp/ocv-aarch64 -DCMAKE_TOOLCHAIN_FILE=../aarch64-gnu.toolchain.cmake
good_nproc
jobs=$?
make -j${jobs}
make install
cd -
}
build_ncnn() {
if [ ! -e "ncnn" ];then
git clone https://github.com/tencent/ncnn --branch 20220729 --depth=1
fi
if [ ! -e "ncnn/build_aarch64" ];then
mkdir -p ncnn/build_aarch64
fi
cd ncnn/build_aarch64
rm -rf CMakeCache.txt
cmake .. \
-DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake \
-DCMAKE_INSTALL_PREFIX=/tmp/ncnn-aarch64
good_nproc
jobs=$?
make -j${jobs}
make install
cd -
}
build_mmdeploy() {
git submodule init
git submodule update
if [ ! -e "build_aarch64" ];then
mkdir build_aarch64
fi
cd build_aarch64
rm -rf CMakeCache.txt
cmake .. \
-DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/aarch64-linux-gnu.cmake \
-DMMDEPLOY_TARGET_DEVICES="cpu" \
-DMMDEPLOY_TARGET_BACKENDS="ncnn" \
-Dncnn_DIR=/tmp/ncnn-aarch64/lib/cmake/ncnn \
-DOpenCV_DIR=/tmp/ocv-aarch64/lib/cmake/opencv4
good_nproc
jobs=$?
make -j${jobs}
make install
ls -lah install/bin/*
}
print_success() {
echo "----------------------------------------------------------------------"
echo "Cross build finished, PLS copy bin/model/test_data to the device.. QVQ"
echo "----------------------------------------------------------------------"
}
if [ ! -e "../mmdeploy-dep" ];then
mkdir ../mmdeploy-dep
fi
cd ../mmdeploy-dep
install_tools
build_ocv
build_ncnn
cd ../mmdeploy
build_mmdeploy
print_success

View File

@ -4,21 +4,33 @@ import re
import time
def pytorch_version():
version = None
try:
import torch
raw = torch.__version__
pattern = re.compile(r'[0-9]+\.[0-9]+\.[0-9]+')
version = pattern.findall(raw)[0]
except Exception:
pass
return version
def cmd_result(txt: str):
cmd = os.popen(txt)
return cmd.read().rstrip().lstrip()
def get_job(argv) -> int:
# get nprocs, if user not specified, use max(2, nproc-1)
# get nprocs, if user not specified, use max(1, nproc-2)
job = 2
if len(argv) <= 1:
print('you can use `python3 {} N` to set make -j [N]'.format(argv[0]))
nproc = cmd_result('nproc')
if nproc is not None and len(nproc) > 0:
job = max(int(nproc) - 1, 2)
job = max(int(nproc) - 2, 1)
else:
job = 2
job = 1
else:
job = int(argv[1])
return job
@ -55,7 +67,7 @@ def ensure_base_env(work_dir, dep_dir):
check python, root, pytorch version, auto install these binary:
* make
* g++-7
* g++
* git
* wget
* unzip
@ -63,7 +75,6 @@ def ensure_base_env(work_dir, dep_dir):
* mmcv (not compulsory)
"""
envs = []
print('-' * 10 + 'ensure base env' + '-' * 10)
print(description)
@ -83,18 +94,18 @@ def ensure_base_env(work_dir, dep_dir):
cmake = cmd_result('which cmake')
if cmake is None or len(cmake) < 1:
print('cmake not found, try install cmake ..', end='')
os.system('python3 -m pip install cmake>=3.14.0')
os.system('python3 -m pip install cmake')
cmake = cmd_result('which cmake')
if cmake is None or len(cmake) < 1:
env = 'export PATH=${PATH}:~/.local/bin'
os.system(env)
envs.append(env)
os.system(""" echo '{}' >> ~/mmdeploy.env """.format(env))
cmake = cmd_result('which cmake')
if cmake is None or len(cmake) < 1:
print('Check cmake failed.')
return -1, envs
return -1
print('success')
# check make
@ -109,14 +120,14 @@ def ensure_base_env(work_dir, dep_dir):
make = cmd_result('which make')
if make is None or len(make) < 1:
print('Check make failed.')
return -1, envs
return -1
print('success')
# check g++ version
gplus = cmd_result('which g++-7')
gplus = cmd_result('which g++')
if gplus is None or len(gplus) < 1:
# install g++
print('g++-7 not found, try install g++-7 ..', end='')
print('g++ not found, try install g++ ..', end='')
os.system(
'{} DEBIAN_FRONTEND="noninteractive" apt install software-properties-common -y' # noqa: E501
.format(sudo)) # noqa: E501
@ -125,18 +136,12 @@ def ensure_base_env(work_dir, dep_dir):
os.system(
'{} add-apt-repository ppa:ubuntu-toolchain-r/test -y'.format(
sudo))
os.system('{} apt install gcc-7 g++-7 -y'.format(sudo))
os.system('{} apt install gcc g++ -y'.format(sudo))
gplus = cmd_result('which g++-7')
gplus = cmd_result('which g++')
if gplus is None or len(gplus) < 1:
print('Check g++-7 failed.')
return -1, envs
os.system(
'{} update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 200' # noqa: E501
.format(sudo))
os.system(
'{} update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-7 200' # noqa: E501
.format(sudo))
print('Check g++ failed.')
return -1
print('success')
# wget
@ -197,7 +202,7 @@ def ensure_base_env(work_dir, dep_dir):
ocv = cmd_result('which opencv_version')
if ocv is None or len(ocv) < 1:
print('Check ocv failed.')
return -1, envs
return -1
print('success')
# print all
@ -217,11 +222,11 @@ def ensure_base_env(work_dir, dep_dir):
cmd_result(" make --version | head -n 1 | awk '{print $3}' ")))
print('wget bin\t:{}'.format(wget))
print('g++-7 bin\t:{}'.format(gplus))
print('g++ bin\t:{}'.format(gplus))
print('mmcv version\t:{}'.format(mmcv_version))
if mmcv_version is None:
print('\t please install an mm serials algorithm later.')
print('\t please install mmcv later.')
time.sleep(2)
print('torch version\t:{}'.format(torch_version))
@ -241,4 +246,4 @@ def ensure_base_env(work_dir, dep_dir):
print('dep dir \t:{}'.format(dep_dir))
print('\n')
return 0, envs
return 0