Merge master:ea7706cb into sync_master

pull/1091/head
RunningLeon 2022-09-22 19:49:50 +08:00
commit 40adb329a3
263 changed files with 8748 additions and 807 deletions

View File

@ -1,48 +0,0 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
Thanks for your bug report. We appreciate it a lot.
**Checklist**
1. I have searched related issues but cannot get the expected help.
2. I have read the [FAQ documentation](https://github.com/open-mmlab/mmdeploy/blob/master/docs/en/faq.md) but cannot get the expected help.
3. The bug has not been fixed in the latest version.
**Describe the bug**
A clear and concise description of what the bug is.
**Reproduction**
1. What command or script did you run?
```none
A placeholder for the command.
```
2. Did you make any modifications to the code or config? Did you understand what you have modified?
**Environment**
1. Please run `python tools/check_env.py` to collect necessary environment information and paste it here.
2. You may add additional information that may be helpful for locating the problem, such as
- How you installed PyTorch \[e.g., pip, conda, source\]
- Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
**Error traceback**
If applicable, paste the error traceback here.
```none
A placeholder for the traceback.
```
**Bug fix**
If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated!

View File

@ -0,0 +1,56 @@
name: Bug report
description: Create a report to help us improve
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: I have searched related issues but cannot get the expected help.
- label: I have read the [FAQ documentation](https://github.com/open-mmlab/mmdeploy/blob/master/docs/en/faq.md) but cannot get the expected help.
- label: The bug has not been fixed in the latest version.
- type: textarea
attributes:
label: Describe the bug
description: A clear and concise description of what the bug is.
validations:
required: true
- type: textarea
attributes:
label: Reproduction
description: |
1. What command or script did you run?
2. Did you make any modifications to the code or config? Did you understand what you have modified?
placeholder: |
A placeholder for the command.
validations:
required: true
- type: textarea
attributes:
label: Environment
description: |
1. Please run `python tools/check_env.py` to collect necessary environment information and paste it here.
2. You may add additional information that may be helpful for locating the problem, such as
- How you installed PyTorch \[e.g., pip, conda, source\]
- Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
placeholder: Environment here.
render: Shell
validations:
required: true
- type: textarea
attributes:
label: Error traceback
description: |
If applicable, paste the error traceback here.
placeholder: Logs and traceback here.
render: Shell
- type: markdown
attributes:
value: >
If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated!
Thanks for your bug report. We appreciate it a lot.
labels: ['Bug']

View File

@ -1,23 +0,0 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---
**Describe the feature**
**Motivation**
A clear and concise description of the motivation of the feature.
Ex1. It is inconvenient when \[....\].
**Related resources**
If there is an official code release or third-party implementation, please also provide the information here, which would be very helpful.
**Additional context**
Add any other context or screenshots about the feature request here.
If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated.

View File

@ -0,0 +1,27 @@
name: Feature request
description: Suggest an idea for this project
body:
- type: markdown
attributes:
value: >
## Describe the feature
- type: textarea
attributes:
label: Motivation
description: |
A clear and concise description of the motivation of the feature.
Ex1. It is inconvenient when \[....\].
validations:
required: true
- type: textarea
attributes:
label: Related resources
description: |
If there is an official code release or third-party implementation, please also provide the information here, which would be very helpful.
- type: textarea
attributes:
label: Additional context
description: |
Add any other context or screenshots about the feature request here.
If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated.

View File

@ -24,21 +24,29 @@ pattern = re.compile(r'\[.*?\]\(.*?\)')
def analyze_doc(home, path):
print('analyze {}'.format(path))
problem_list = []
code_block = False
code_block = 0
with open(path) as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if line.startswith('```'):
code_block = not code_block
continue
code_block = 1 - code_block
if code_block is True:
if code_block > 0:
continue
if '[' in line and ']' in line and '(' in line and ')' in line:
all = pattern.findall(line)
for item in all:
# skip ![]()
if item.find('[') == item.find(']') - 1:
continue
# process the case [text()]()
offset = item.find('](')
if offset == -1:
continue
item = item[offset:]
start = item.find('(')
end = item.find(')')
ref = item[start + 1:end]
@ -62,7 +70,7 @@ def analyze_doc(home, path):
def traverse(target):
if os.path.isfile(target):
analyze_doc('./', target)
analyze_doc(os.path.dirname(target), target)
return
for home, dirs, files in os.walk(target):
for filename in files:
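
For context, the hunk above changes two things: links are now resolved against the markdown file's own directory (`analyze_doc(os.path.dirname(target), target)`), and link targets are extracted by cutting at the `](` separator so that link text containing parentheses, e.g. `[check_env()](faq.md)`, no longer confuses the parser. A minimal, self-contained sketch of that extraction logic follows; the helper name and sample line are illustrative, not part of the patch.

```python
import os
import re

# same pattern as the checker script uses
pattern = re.compile(r'\[.*?\]\(.*?\)')


def extract_refs(home, line):
    """Illustrative helper mirroring the patched extraction logic."""
    refs = []
    for item in pattern.findall(line):
        # skip empty-text links such as []() or ![]()
        if item.find('[') == item.find(']') - 1:
            continue
        # handle [text()](ref): cut at the '](' separator before looking for parentheses
        offset = item.find('](')
        if offset == -1:
            continue
        item = item[offset:]
        ref = item[item.find('(') + 1:item.find(')')]
        # relative refs are joined with the file's directory, matching the traverse() change
        refs.append(ref if ref.startswith('http') else os.path.join(home, ref))
    return refs


print(extract_refs('docs/en', 'run [check_env()](faq.md), see also ![](logo.png)'))
# -> ['docs/en/faq.md']
```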

View File

@ -0,0 +1,70 @@
#!/bin/sh
set -e
# print env
python3 tools/check_env.py
deploy_cfg=configs/mmcls/classification_onnxruntime_dynamic.py
device=cpu
model_cfg=../mmclassification/configs/resnet/resnet18_8xb32_in1k.py
checkpoint=https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.pth
sdk_cfg=configs/mmcls/classification_sdk_dynamic.py
input_img=../mmclassification/demo/demo.JPEG
work_dir=work_dir
echo "------------------------------------------------------------------------------------------------------------"
echo "deploy_cfg=$deploy_cfg"
echo "model_cfg=$model_cfg"
echo "checkpoint=$checkpoint"
echo "device=$device"
echo "------------------------------------------------------------------------------------------------------------"
mkdir -p $work_dir
python3 tools/deploy.py \
$deploy_cfg \
$model_cfg \
$checkpoint \
$input_img \
--device $device \
--work-dir $work_dir \
--dump-info
# prepare dataset
wget -P data/ https://github.com/open-mmlab/mmdeploy/files/9401216/imagenet-val100.zip
unzip data/imagenet-val100.zip -d data/
echo "Running test with ort"
python3 tools/test.py \
$deploy_cfg \
$model_cfg \
--model $work_dir/end2end.onnx \
--device $device \
--out $work_dir/ort_out.pkl \
--metrics accuracy \
--device $device \
--log2file $work_dir/test_ort.log \
--speed-test \
--log-interval 50 \
--warmup 20 \
--batch-size 32
echo "Running test with sdk"
# change topk for test
sed -i 's/"topk": 5/"topk": 1000/g' work_dir/pipeline.json
python3 tools/test.py \
$sdk_cfg \
$model_cfg \
--model $work_dir \
--device $device \
--out $work_dir/sdk_out.pkl \
--metrics accuracy \
--device $device \
--log2file $work_dir/test_sdk.log \
--speed-test \
--log-interval 50 \
--warmup 20 \
--batch-size 1

View File

@ -0,0 +1,54 @@
name: backend-ascend
on:
push:
paths-ignore:
- "demo/**"
- "tools/**"
pull_request:
paths-ignore:
- "demo/**"
- "tools/**"
- "docs/**"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build_sdk_demo:
runs-on: ubuntu-18.04
strategy:
matrix:
python-version: [3.7]
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: 'recursive'
- name: update
run: sudo apt update
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev libc++1-9 libc++abi1-9
sudo add-apt-repository ppa:ignaciovizzo/opencv3-nonfree
sudo apt install libopencv-dev
pkg-config --libs opencv
- name: Install Ascend Toolkit
run: |
mkdir -p $GITHUB_WORKSPACE/Ascend
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%205.1.RC2/Ascend-cann-toolkit_5.1.RC2_linux-x86_64.run
sh Ascend-cann-toolkit_5.1.RC2_linux-x86_64.run --install --install-path=$GITHUB_WORKSPACE/Ascend --quiet --chip=Ascend310 --blacklist=devtools
- name: Build SDK Demo with Ascend backend
run: |
mkdir -p build && pushd build
source $GITHUB_WORKSPACE/Ascend/ascend-toolkit/set_env.sh
export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/Ascend/ascend-toolkit/latest/runtime/lib64/stub:$LD_LIBRARY_PATH
cmake .. -DCMAKE_CXX_COMPILER=g++-7 -DMMDEPLOY_SHARED_LIBS=ON -DMMDEPLOY_BUILD_SDK=ON -DMMDEPLOY_BUILD_SDK_PYTHON_API=OFF -DMMDEPLOY_TARGET_DEVICES=cpu -DMMDEPLOY_BUILD_EXAMPLES=ON -DMMDEPLOY_TARGET_BACKENDS=acl -DMMDEPLOY_CODEBASES=all
make install -j4

View File

@ -0,0 +1,71 @@
name: backend-coreml
on:
push:
paths:
- "csrc/**"
- "demo/csrc/**"
- "CMakeLists.txt"
pull_request:
paths:
- "csrc/**"
- "demo/csrc/**"
- "CMakeLists.txt"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
permissions:
contents: read
jobs:
build_macos_arm64:
runs-on: macos-12
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: 'recursive'
- name: install opencv
run: |
wget https://github.com/irexyc/mmdeploy-ci-resource/releases/download/opencv/opencv-osx-arm64-4.6.0.tar.gz
mkdir $GITHUB_WORKSPACE/opencv-install
tar xf opencv-osx-arm64-4.6.0.tar.gz -C $GITHUB_WORKSPACE/opencv-install
- name: install libtorch
run: |
wget https://github.com/irexyc/mmdeploy-ci-resource/releases/download/libtorch/libtorch-osx-arm64-1.8.0.tar.gz
mkdir $GITHUB_WORKSPACE/libtorch-install
tar xf libtorch-osx-arm64-1.8.0.tar.gz -C $GITHUB_WORKSPACE/libtorch-install
- name: build
run: |
mkdir build && cd build
cmake .. -DCMAKE_OSX_ARCHITECTURES="arm64" \
-DCMAKE_SYSTEM_PROCESSOR="arm64" \
-DMMDEPLOY_BUILD_SDK=ON \
-DMMDEPLOY_TARGET_DEVICES="cpu" \
-DMMDEPLOY_CODEBASES=all \
-DOpenCV_DIR=$GITHUB_WORKSPACE/opencv-install/lib/cmake/opencv4 \
-DTorch_DIR=$GITHUB_WORKSPACE/libtorch-install/share/cmake/Torch \
-DMMDEPLOY_TARGET_BACKENDS="coreml" \
-DMMDEPLOY_BUILD_EXAMPLES=ON \
-DMMDEPLOY_SHARED_LIBS=OFF
cmake --build . -j 3
cmake --build . --target install
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake .. -DCMAKE_OSX_ARCHITECTURES="arm64" \
-DCMAKE_SYSTEM_PROCESSOR="arm64" \
-DMMDEPLOY_BUILD_SDK=ON \
-DMMDEPLOY_TARGET_DEVICES="cpu" \
-DMMDEPLOY_CODEBASES=all \
-DOpenCV_DIR=$GITHUB_WORKSPACE/opencv-install/lib/cmake/opencv4 \
-DTorch_DIR=$GITHUB_WORKSPACE/libtorch-install/share/cmake/Torch \
-DMMDEPLOY_TARGET_BACKENDS="coreml" \
-DMMDEPLOY_BUILD_EXAMPLES=ON \
-DMMDEPLOY_SHARED_LIBS=ON
cmake --build . -j 3
cmake --build . --target install

View File

@ -22,11 +22,6 @@ jobs:
strategy:
matrix:
python-version: [3.7]
torch: [1.9.0]
include:
- torch: 1.9.0
torch_version: torch1.9
torchvision: 0.10.0
steps:
- name: Checkout repository
uses: actions/checkout@v3
@ -65,3 +60,23 @@ jobs:
echo $(pwd)
ln -s build/bin/mmdeploy_onnx2ncnn ./
python3 .github/scripts/test_onnx2ncnn.py --run 1
script_install:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [3.7]
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: 'recursive'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install mmdeploy
run: |
python3 tools/scripts/build_ubuntu_x64_ncnn.py
python3 -m pip install torch==1.8.2 torchvision==0.9.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cpu
python3 -m pip install mmcv-full==1.5.1 -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
python3 -c 'import mmdeploy.apis.ncnn as ncnn_api; assert ncnn_api.is_available() and ncnn_api.is_custom_ops_available()'

View File

@ -0,0 +1,48 @@
name: backend-ort
on:
push:
paths-ignore:
- "demo/**"
- "tools/**"
pull_request:
paths-ignore:
- "demo/**"
- "tools/**"
- "docs/**"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
script_install:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [3.7]
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: 'recursive'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install mmdeploy
run: |
python3 tools/scripts/build_ubuntu_x64_ort.py
python3 -m pip install torch==1.8.2 torchvision==0.9.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cpu
python3 -m pip install mmcv-full==1.5.1 -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
python3 -c 'import mmdeploy.apis.onnxruntime as ort_api; assert ort_api.is_available() and ort_api.is_custom_ops_available()'
- name: test mmcls full pipeline
run: |
pip install openmim
mim install mmcls
git clone --depth 1 --single-branch --branch master https://github.com/open-mmlab/mmclassification.git ../mmclassification
export MMDEPLOY_DIR=$(pwd)
export ONNXRUNTIME_DIR=$MMDEPLOY_DIR/../mmdeploy-dep/onnxruntime-linux-x64-1.8.1
export LD_LIBRARY_PATH=$ONNXRUNTIME_DIR/lib:$MMDEPLOY_DIR/build/install/lib:$LD_LIBRARY_PATH
bash .github/scripts/test_mmcls_full_pipeline.sh

View File

@ -0,0 +1,39 @@
name: backend-pplnn
on:
push:
paths-ignore:
- "demo/**"
- "tools/**"
pull_request:
paths-ignore:
- "demo/**"
- "tools/**"
- "docs/**"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
script_install:
runs-on: ubuntu-18.04
strategy:
matrix:
python-version: [3.7]
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: 'recursive'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install mmdeploy
run: |
python3 tools/scripts/build_ubuntu_x64_pplnn.py
python3 -m pip install torch==1.8.2 torchvision==0.9.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cpu
python3 -m pip install mmcv-full==1.5.1 -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
python3 -c 'import mmdeploy.apis.pplnn as pplnn_api; assert pplnn_api.is_available()'

View File

@ -0,0 +1,36 @@
name: backend-ort
on:
push:
paths-ignore:
- "demo/**"
- "tools/**"
pull_request:
paths-ignore:
- "demo/**"
- "tools/**"
- "docs/**"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
script_install:
runs-on: ubuntu-18.04
strategy:
matrix:
python-version: [3.7]
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: 'recursive'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install mmdeploy
run: |
python3 tools/scripts/build_ubuntu_x64_torchscript.py

View File

@ -121,7 +121,7 @@ jobs:
run: |
python -V
python -m pip install mmcv-full==${{matrix.mmcv}} -f https://download.openmmlab.com/mmcv/dist/cu102/${{matrix.torch_version}}/index.html
python -m pip install -r requirements.txt
CFLAGS=`python -c 'import sysconfig;print("-I"+sysconfig.get_paths()["include"])'` python -m pip install -r requirements.txt
pip install -U pycuda
python -m pip install -U numpy
- name: Build and install
@ -167,7 +167,7 @@ jobs:
run: |
python -V
python -m pip install mmcv-full==${{matrix.mmcv}} -f https://download.openmmlab.com/mmcv/dist/cu111/${{matrix.torch_version}}/index.html
python -m pip install -r requirements.txt
CFLAGS=`python -c 'import sysconfig;print("-I"+sysconfig.get_paths()["include"])'` python -m pip install -r requirements.txt
pip install -U pycuda
python -m pip install -U numpy
- name: Build and install

View File

@ -0,0 +1,56 @@
name: build_riscv64_gcc
on:
push:
paths:
- "csrc/**"
- "demo/csrc/**"
- "CMakeLists.txt"
pull_request:
paths-ignore:
- "csrc/**"
- "demo/csrc/**"
- "CMakeLists.txt"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build_riscv64_gcc:
runs-on: ubuntu-20.04
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: 'recursive'
- name: riscv64-gnu-toolchain
run: |
sudo apt-get update
sudo apt-get install g++-riscv64-linux-gnu
- name: install opencv
run: |
mkdir $GITHUB_WORKSPACE/opencv-install
wget https://github.com/irexyc/mmdeploy-ci-resource/raw/opencv/opencv_4.6.0_linux_riscv64.tar.gz
tar xf opencv_4.6.0_linux_riscv64.tar.gz -C $GITHUB_WORKSPACE/opencv-install
- name: install ncnn
run: |
mkdir $GITHUB_WORKSPACE/ncnn-install
wget https://github.com/irexyc/mmdeploy-ci-resource/raw/ncnn/ncnn_20220729_linux_riscv64.tar.gz
tar xf ncnn_20220729_linux_riscv64.tar.gz -C $GITHUB_WORKSPACE/ncnn-install
- name: build
run: |
mkdir build && cd build
cmake .. \
-DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/riscv64-linux-gnu.cmake \
-DMMDEPLOY_BUILD_SDK=ON \
-DMMDEPLOY_SHARED_LIBS=ON \
-DMMDEPLOY_BUILD_EXAMPLES=ON \
-DMMDEPLOY_TARGET_DEVICES="cpu" \
-DMMDEPLOY_TARGET_BACKENDS="ncnn" \
-Dncnn_DIR=$GITHUB_WORKSPACE/ncnn-install/lib/cmake/ncnn/ \
-DMMDEPLOY_CODEBASES=all \
-DOpenCV_DIR=$GITHUB_WORKSPACE/opencv-install/lib/cmake/opencv4
make -j$(nproc)
make install

View File

@ -50,7 +50,7 @@ jobs:
run: |
python -V
python -m pip install mmcv-full==${{matrix.mmcv}} -f https://download.openmmlab.com/mmcv/dist/cu111/${{matrix.torch_version}}/index.html
python -m pip install -r requirements.txt
CFLAGS=`python -c 'import sysconfig;print("-I"+sysconfig.get_paths()["include"])'` python -m pip install -r requirements.txt
python -m pip install -U numpy
- name: Install mmcls

.gitignore
View File

@ -155,6 +155,14 @@ mmdeploy/backend/ncnn/onnx2ncnn
# OCR dicts
dicts
# ascend
fusion_result.json
# snpe
grpc-cpp-plugin
service/snpe/grpc_cpp_plugin
# elena-code
csrc/mmdeploy/preprocess/elena/json
csrc/mmdeploy/preprocess/elena/cpu_kernel/*
csrc/mmdeploy/preprocess/elena/cuda_kernel/*

View File

@ -5,7 +5,7 @@ endif ()
message(STATUS "CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
cmake_minimum_required(VERSION 3.14)
project(MMDeploy VERSION 0.7.0)
project(MMDeploy VERSION 0.8.0)
set(CMAKE_CXX_STANDARD 17)
@ -34,6 +34,7 @@ option(MMDEPLOY_BUILD_EXAMPLES "build examples" OFF)
option(MMDEPLOY_SPDLOG_EXTERNAL "use external spdlog" OFF)
option(MMDEPLOY_ZIP_MODEL "support SDK model in zip format" OFF)
option(MMDEPLOY_COVERAGE "build SDK for coverage" OFF)
option(MMDEPLOY_ELENA_FUSION "use elena to fuse preprocess" OFF)
set(MMDEPLOY_TARGET_DEVICES "cpu" CACHE STRING "target devices to support")
set(MMDEPLOY_TARGET_BACKENDS "" CACHE STRING "target inference engines to support")
@ -77,6 +78,10 @@ if (MSVC)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/wd4251>)
endif ()
if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fobjc-arc")
endif()
add_library(MMDeployStaticModules INTERFACE)
add_library(MMDeployDynamicModules INTERFACE)
add_library(MMDeployLibs INTERFACE)

View File

@ -53,9 +53,9 @@ MMDeploy 是 [OpenMMLab](https://openmmlab.com/) 模型部署工具箱,**为
### 支持多种推理后端
| ONNX Runtime | TensorRT | ppl.nn | ncnn | OpenVINO | LibTorch | snpe | Ascend | Core ML | RKNN | more |
| ------------ | -------- | ------ | ---- | -------- | -------- | ---- | ------ | ------- | ---- | ------------------------------------------------- |
| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | [benchmark](docs/zh_cn/03-benchmark/benchmark.md) |
| ONNX Runtime | TensorRT | ppl.nn | ncnn | OpenVINO | LibTorch | snpe | Ascend | Core ML | RKNN | more |
| ------------ | -------- | ------ | ---- | -------- | -------- | ---- | ------ | ------- | ---- | ---------------------------------------------- |
| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | [benchmark](docs/en/03-benchmark/benchmark.md) |
### SDK 可高度定制化

View File

@ -16,19 +16,19 @@ find_package(CUDA REQUIRED)
if (MSVC)
set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc.exe)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler=/wd4819,/wd4828")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/wd4819,/wd4828")
if (HAVE_CXX_FLAG_UTF_8)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler=/utf-8")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/utf-8")
endif ()
else ()
set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
# Explicitly set the cuda host compiler. Because the default host compiler #
# selected by cmake maybe wrong.
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
set(CUDA_NVCC_FLAGS
"${CUDA_NVCC_FLAGS} -Xcompiler=-fPIC,-Wall,-fvisibility=hidden")
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -Xcompiler=-fPIC,-Wall,-fvisibility=hidden")
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler=-fno-gnu-unique")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-fno-gnu-unique")
endif ()
endif ()
@ -62,10 +62,12 @@ if (NOT CMAKE_CUDA_ARCHITECTURES)
endif ()
endif ()
set(CUDA_NVCC_FLAGS_DEBUG "-g -O0")
set(CUDA_NVCC_FLAGS_RELEASE "-O3")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
set(CMAKE_CUDA_FLAGS_DEBUG "-g -O0")
set(CMAKE_CUDA_FLAGS_RELEASE "-O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMMDEPLOY_USE_CUDA=1")
if (NOT MSVC)
set(CMAKE_CUDA_STANDARD 14)
endif ()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_NVCC_FLAGS} ${_NVCC_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${_NVCC_FLAGS}")

View File

@ -0,0 +1,17 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR riscv)
set(CMAKE_C_COMPILER "riscv64-linux-gnu-gcc")
set(CMAKE_CXX_COMPILER "riscv64-linux-gnu-g++")
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
set(CMAKE_C_FLAGS "-march=rv64gc")
set(CMAKE_CXX_FLAGS "-march=rv64gc")
# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags")

View File

@ -0,0 +1,26 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR riscv)
if(DEFINED ENV{RISCV_ROOT_PATH})
file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
else()
message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
endif()
set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot" CACHE PATH "riscv sysroot")
set(CMAKE_FIND_ROOT_PATH ${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
set(CMAKE_C_FLAGS "-march=rv64gc")
set(CMAKE_CXX_FLAGS "-march=rv64gc")
# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags")

View File

@ -0,0 +1 @@
backend_config = dict(type='ascend')

View File

@ -0,0 +1 @@
backend_config = dict(type='coreml', convert_to='mlprogram')

View File

@ -0,0 +1,8 @@
backend_config = dict(
type='rknn',
common_config=dict(
mean_values=None,
std_values=None,
target_platform='rk3588',
optimization_level=3),
quantization_config=dict(do_quantization=False, dataset=None))

View File

@ -0,0 +1,9 @@
_base_ = ['./classification_dynamic.py', '../_base_/backends/ascend.py']
onnx_config = dict(input_shape=[224, 224])
backend_config = dict(model_inputs=[
dict(
dynamic_batch_size=[1, 2, 4, 8],
input_shapes=dict(input=[-1, 3, 224, 224]))
])

View File

@ -0,0 +1,5 @@
_base_ = ['./classification_static.py', '../_base_/backends/ascend.py']
onnx_config = dict(input_shape=[224, 224])
backend_config = dict(
model_inputs=[dict(input_shapes=dict(input=[1, 3, 224, 224]))])

View File

@ -0,0 +1,12 @@
_base_ = ['../_base_/torchscript_config.py', '../_base_/backends/coreml.py']
codebase_config = dict(type='mmcls', task='Classification')
backend_config = dict(model_inputs=[
dict(
input_shapes=dict(
input=dict(
min_shape=[1, 3, 224, 224],
max_shape=[8, 3, 224, 224],
default_shape=[1, 3, 224, 224])))
])

View File

@ -0,0 +1,5 @@
_base_ = ['./classification_static.py', '../_base_/backends/rknn.py']
onnx_config = dict(input_shape=[224, 224])
codebase_config = dict(model_type='rknn')
backend_config = dict(input_size_list=[[3, 224, 224]])

View File

@ -9,5 +9,5 @@ backend_config = dict(
input=dict(
min_shape=[1, 3, 224, 224],
opt_shape=[4, 3, 224, 224],
max_shape=[64, 3, 224, 224])))
max_shape=[8, 3, 224, 224])))
])

View File

@ -9,5 +9,5 @@ backend_config = dict(
input=dict(
min_shape=[1, 3, 224, 224],
opt_shape=[4, 3, 224, 224],
max_shape=[64, 3, 224, 224])))
max_shape=[8, 3, 224, 224])))
])

View File

@ -9,5 +9,5 @@ backend_config = dict(
input=dict(
min_shape=[1, 3, 224, 224],
opt_shape=[4, 3, 224, 224],
max_shape=[64, 3, 224, 224])))
max_shape=[8, 3, 224, 224])))
])

View File

@ -0,0 +1,11 @@
_base_ = ['./base_torchscript.py', '../../_base_/backends/coreml.py']
ir_config = dict(input_shape=(1344, 800))
backend_config = dict(model_inputs=[
dict(
input_shapes=dict(
input=dict(
min_shape=[1, 3, 800, 1344],
max_shape=[1, 3, 800, 1344],
default_shape=[1, 3, 800, 1344])))
])

View File

@ -0,0 +1,8 @@
_base_ = ['../_base_/base_dynamic.py', '../../_base_/backends/ascend.py']
onnx_config = dict(input_shape=[1344, 800])
backend_config = dict(model_inputs=[
dict(
dynamic_image_size=[(800, 1344), (1344, 800)],
input_shapes=dict(input=[1, 3, -1, -1]))
])

View File

@ -0,0 +1,5 @@
_base_ = ['../_base_/base_static.py', '../../_base_/backends/ascend.py']
onnx_config = dict(input_shape=[640, 640])
backend_config = dict(
model_inputs=[dict(input_shapes=dict(input=[1, 3, 640, 640]))])

View File

@ -0,0 +1,5 @@
_base_ = ['../_base_/base_static.py', '../../_base_/backends/ascend.py']
onnx_config = dict(input_shape=[1344, 800])
backend_config = dict(
model_inputs=[dict(input_shapes=dict(input=[1, 3, 800, 1344]))])

View File

@ -0,0 +1 @@
_base_ = ['../_base_/base_coreml_static-800x1344.py']

View File

@ -0,0 +1,17 @@
_base_ = ['../_base_/base_static.py', '../../_base_/backends/rknn.py']
onnx_config = dict(input_shape=[640, 640])
codebase_config = dict(model_type='rknn')
backend_config = dict(input_size_list=[[3, 640, 640]])
partition_config = dict(
type='rknn', # the partition policy name
apply_marks=True, # should always be set to True
partition_cfg=[
dict(
save_file='model.onnx', # name to save the partitioned onnx model
start=['detector_forward:input'], # [mark_name:input/output, ...]
end=['yolo_head:input']) # [mark_name:input/output, ...]
])

View File

@ -0,0 +1,9 @@
_base_ = ['./voxel-detection_dynamic.py', '../../_base_/backends/openvino.py']
onnx_config = dict(input_shape=None)
backend_config = dict(model_inputs=[
dict(
opt_shapes=dict(
voxels=[20000, 64, 4], num_points=[20000], coors=[20000, 4]))
])

View File

@ -1,6 +1,6 @@
_base_ = ['./voxel-detection_dynamic.py', '../../_base_/backends/tensorrt.py']
backend_config = dict(
common_config=dict(max_workspace_size=1 << 30),
common_config=dict(max_workspace_size=1 << 32),
model_inputs=[
dict(
input_shapes=dict(

View File

@ -0,0 +1,18 @@
_base_ = ['./voxel-detection_dynamic.py', '../../_base_/backends/tensorrt.py']
backend_config = dict(
common_config=dict(max_workspace_size=1 << 32),
model_inputs=[
dict(
input_shapes=dict(
voxels=dict(
min_shape=[5000, 64, 4],
opt_shape=[20000, 64, 4],
max_shape=[30000, 64, 4]),
num_points=dict(
min_shape=[5000], opt_shape=[20000], max_shape=[30000]),
coors=dict(
min_shape=[5000, 4],
opt_shape=[20000, 4],
max_shape=[30000, 4]),
))
])

View File

@ -0,0 +1,8 @@
_base_ = ['./text-detection_dynamic.py', '../../_base_/backends/ascend.py']
onnx_config = dict(input_shape=None)
backend_config = dict(model_inputs=[
dict(
input_shapes=dict(input=[-1, 3, -1, -1]),
dynamic_dims=[(1, 640, 640), (4, 640, 640), (1, 1280, 1280)])
])

View File

@ -0,0 +1,5 @@
_base_ = ['./text-detection_static.py', '../../_base_/backends/ascend.py']
onnx_config = dict(input_shape=[640, 640])
backend_config = dict(
model_inputs=[dict(input_shapes=dict(input=[1, 3, 640, 640]))])

View File

@ -0,0 +1,3 @@
_base_ = ['./pose-detection_static.py', '../_base_/backends/ncnn-int8.py']
onnx_config = dict(input_shape=[256, 256])

View File

@ -0,0 +1,5 @@
_base_ = ['./segmentation_static.py', '../_base_/backends/ascend.py']
onnx_config = dict(input_shape=[2048, 1024])
backend_config = dict(
model_inputs=[dict(input_shapes=dict(input=[1, 3, 1024, 2048]))])

View File

@ -0,0 +1,5 @@
_base_ = ['./segmentation_static.py', '../_base_/backends/ascend.py']
onnx_config = dict(input_shape=[1024, 512])
backend_config = dict(
model_inputs=[dict(input_shapes=dict(input=[1, 3, 512, 1024]))])

View File

@ -0,0 +1,14 @@
_base_ = [
'../_base_/torchscript_config.py', '../_base_/backends/coreml.py',
'./segmentation_static.py'
]
ir_config = dict(input_shape=[1024, 512])
backend_config = dict(model_inputs=[
dict(
input_shapes=dict(
input=dict(
min_shape=[1, 3, 512, 1024],
max_shape=[1, 3, 512, 1024],
default_shape=[1, 3, 512, 1024])))
])

View File

@ -0,0 +1,7 @@
_base_ = ['./segmentation_static.py', '../_base_/backends/rknn.py']
onnx_config = dict(input_shape=[512, 512])
codebase_config = dict(model_type='rknn')
backend_config = dict(input_size_list=[[3, 512, 512]])

View File

@ -0,0 +1,3 @@
_base_ = ['./text-detection_static.py', '../../_base_/backends/ncnn-int8.py']
onnx_config = dict(input_shape=None)

View File

@ -32,7 +32,8 @@ if ("ncnn" IN_LIST MMDEPLOY_TARGET_BACKENDS)
endif ()
# build TorchScript ops
if ("torchscript" IN_LIST MMDEPLOY_TARGET_BACKENDS)
message(STATUS "Build torchsciprt custom ops")
if ("torchscript" IN_LIST MMDEPLOY_TARGET_BACKENDS
OR "coreml" IN_LIST MMDEPLOY_TARGET_BACKENDS)
message(STATUS "Build torchscript custom ops")
add_subdirectory(torchscript)
endif ()

View File

@ -10,7 +10,7 @@ else ()
endif ()
if (NOT ANDROID AND NOT IOS)
if (NOT ANDROID AND NOT IOS AND NOT CMAKE_CROSSCOMPILING)
add_subdirectory(ops)
add_subdirectory(onnx2ncnn)
add_subdirectory(pyncnn_ext)

View File

@ -0,0 +1,150 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "gather_topk.hpp"
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include "NvInferVersion.h"
#include "gather_topk_kernel.hpp"
#include "trt_serialize.hpp"
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"GatherTopk"};
} // namespace
GatherTopk::GatherTopk(const std::string &name) : TRTPluginBase(name) {}
GatherTopk::GatherTopk(const std::string name, const void *data, size_t length)
: TRTPluginBase(name) {}
nvinfer1::IPluginV2DynamicExt *GatherTopk::clone() const TRT_NOEXCEPT {
GatherTopk *plugin = new GatherTopk(mLayerName);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs GatherTopk::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
assert(inputs[0].nbDims >= inputs[1].nbDims);
nvinfer1::DimsExprs ret;
ret.nbDims = inputs[0].nbDims;
for (int i = 0; i < inputs[1].nbDims; ++i) {
ret.d[i] = inputs[1].d[i];
}
for (int i = inputs[1].nbDims; i < inputs[0].nbDims; ++i) {
ret.d[i] = inputs[0].d[i];
}
return ret;
}
bool GatherTopk::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
switch (pos) {
case 0:
// data
return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) ||
(ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
case 1:
// indices
return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
case 2:
// output
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
default:
return true;
}
return true;
}
void GatherTopk::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs,
int nbOutputs) TRT_NOEXCEPT {}
size_t GatherTopk::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT {
return 0;
}
int GatherTopk::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workSpace, cudaStream_t stream) TRT_NOEXCEPT {
const int *dims = &(inputDesc[0].dims.d[0]);
const int *indices_dims = &(inputDesc[1].dims.d[0]);
int nbDims = inputDesc[0].dims.nbDims;
int indice_nbDims = inputDesc[1].dims.nbDims;
const void *data = inputs[0];
const void *indices = inputs[1];
void *output = outputs[0];
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
gather_topk_impl<float>((float *)data, (int *)indices, dims, nbDims, indices_dims,
indice_nbDims, (float *)output, stream);
break;
case nvinfer1::DataType::kINT32:
gather_topk_impl<int>((int *)data, (int *)indices, dims, nbDims, indices_dims, indice_nbDims,
(int *)output, stream);
break;
default:
break;
}
return 0;
}
nvinfer1::DataType GatherTopk::getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *GatherTopk::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *GatherTopk::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
int GatherTopk::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t GatherTopk::getSerializationSize() const TRT_NOEXCEPT { return 0; }
void GatherTopk::serialize(void *buffer) const TRT_NOEXCEPT {}
GatherTopkCreator::GatherTopkCreator() {
mPluginAttributes.clear();
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *GatherTopkCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *GatherTopkCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
nvinfer1::IPluginV2 *GatherTopkCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
auto *plugin = new GatherTopk(name);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *GatherTopkCreator::deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT {
auto plugin = new GatherTopk(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(GatherTopkCreator);
} // namespace mmdeploy

View File

@ -0,0 +1,64 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_SCATTERND_HPP
#define TRT_SCATTERND_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class GatherTopk : public TRTPluginBase {
public:
GatherTopk(const std::string &name);
GatherTopk(const std::string name, const void *data, size_t length);
GatherTopk() = delete;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
};
class GatherTopkCreator : public TRTPluginCreatorBase {
public:
GatherTopkCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_SCATTERND_HPP

View File

@ -0,0 +1,46 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include <functional>
#include <numeric>
#include <vector>
#include "common_cuda_helper.hpp"
#include "gather_topk_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
__global__ void gather_topk_kernel(const scalar_t* input, const int* indices, scalar_t* output,
int batch, int num_input, int num_indices, int channel) {
CUDA_1D_KERNEL_LOOP(index, batch * num_indices * channel) {
const int b_id = index / (num_indices * channel);
const int n_id = (index / channel) % num_indices;
const int c_id = index % channel;
const int input_n_id = indices[b_id * num_indices + n_id];
const scalar_t value = input[b_id * num_input * channel + input_n_id * channel + c_id];
output[b_id * num_indices * channel + n_id * channel + c_id] = value;
}
}
template <typename scalar_t>
void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims,
const int* indices_dims, int indice_nbDims, scalar_t* output,
cudaStream_t stream) {
int batch = 1;
for (int i = 0; i < indice_nbDims - 1; ++i) batch *= dims[i];
int num_input = dims[indice_nbDims - 1];
int num_indices = indices_dims[indice_nbDims - 1];
int channel = 1;
for (int i = indice_nbDims; i < nbDims; ++i) channel *= dims[i];
const int col_block = DIVUP(batch * num_indices * channel, THREADS_PER_BLOCK);
gather_topk_kernel<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(input, indices, output, batch,
num_input, num_indices, channel);
}
template void gather_topk_impl<float>(const float* input, const int* indices, const int* dims,
int nbDims, const int* indices_dims, int indice_nbDims,
float* output, cudaStream_t stream);
template void gather_topk_impl<int32_t>(const int32_t* input, const int* indices, const int* dims,
int nbDims, const int* indices_dims, int indice_nbDims,
int32_t* output, cudaStream_t stream);
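
To make the indexing in this kernel easier to follow, here is a short NumPy reference of the same gather-top-k semantics (a sketch for illustration, not the plugin's code path): the leading dimensions of `indices` act as batch dimensions, the last `indices` dimension selects entries along the corresponding axis of `data`, and any trailing `data` dimensions are carried through as channels, which also matches `getOutputDimensions` copying the indices dims first and the remaining data dims after.

```python
import numpy as np


def gather_topk_ref(data, indices):
    """NumPy sketch of the GatherTopk semantics.

    data:    (*batch_dims, num_input, *channel_dims)
    indices: (*batch_dims, num_indices)  -> output (*batch_dims, num_indices, *channel_dims)
    """
    batch_dims = indices.shape[:-1]
    num_indices = indices.shape[-1]
    channel_dims = data.shape[indices.ndim:]
    batch = int(np.prod(batch_dims, dtype=np.int64))
    d = data.reshape(batch, data.shape[indices.ndim - 1], -1)   # (batch, num_input, channel)
    i = indices.reshape(batch, num_indices)
    out = np.stack([d[b][i[b]] for b in range(batch)])          # gather rows per batch element
    return out.reshape(*batch_dims, num_indices, *channel_dims)


# e.g. keep the top-100 of 1000 candidate boxes for a batch of 2
data = np.random.rand(2, 1000, 4).astype(np.float32)
topk_idx = np.argsort(-np.random.rand(2, 1000), axis=-1)[:, :100].astype(np.int32)
print(gather_topk_ref(data, topk_idx).shape)  # (2, 100, 4)
```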

View File

@ -0,0 +1,10 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_GRID_SAMPLER_KERNEL_HPP
#define TRT_GRID_SAMPLER_KERNEL_HPP
#include <cuda_runtime.h>
template <typename scalar_t>
void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims,
const int* indices_dims, int indice_nbDims, scalar_t* output,
cudaStream_t stream);
#endif // TRT_GRID_SAMPLER_KERNEL_HPP

View File

@ -0,0 +1,183 @@
// Copyright (c) OpenMMLab. All rights reserved
#include "scaled_dot_product_attention.hpp"
#include <assert.h>
#include <chrono>
#include "scaled_dot_product_attention_kernel.hpp"
#include "trt_serialize.hpp"
using namespace nvinfer1;
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"ScaledDotProductAttentionTRT"};
} // namespace
ScaledDotProductAttentionTRT::ScaledDotProductAttentionTRT(const std::string &name)
: TRTPluginBase(name), mask_dim(0) {}
ScaledDotProductAttentionTRT::ScaledDotProductAttentionTRT(const std::string name, const void *data,
size_t length)
: TRTPluginBase(name), mask_dim(0) {}
ScaledDotProductAttentionTRT::~ScaledDotProductAttentionTRT() {}
nvinfer1::IPluginV2DynamicExt *ScaledDotProductAttentionTRT::clone() const TRT_NOEXCEPT {
ScaledDotProductAttentionTRT *plugin = new ScaledDotProductAttentionTRT(mLayerName);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs ScaledDotProductAttentionTRT::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
if (outputIndex == 0) return inputs[0];
nvinfer1::DimsExprs ret;
ret.nbDims = 3;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[0].d[1];
ret.d[2] = inputs[1].d[1];
return ret;
}
bool ScaledDotProductAttentionTRT::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 0) {
return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
}
}
// Attach the plugin object to an execution context and grant the plugin the
// access to some context resource.
void ScaledDotProductAttentionTRT::attachToContext(cudnnContext *cudnnContext,
cublasContext *cublasContext,
IGpuAllocator *gpuAllocator) TRT_NOEXCEPT {
_cublas_handle = cublasContext;
_cudnn_handle = cudnnContext;
cudnnCreateTensorDescriptor(&_x_desc);
cudnnCreateTensorDescriptor(&_y_desc);
cudnnCreateTensorDescriptor(&_mask_desc);
}
// Detach the plugin object from its execution context.
void ScaledDotProductAttentionTRT::detachFromContext() TRT_NOEXCEPT {
cudnnDestroyTensorDescriptor(_y_desc);
cudnnDestroyTensorDescriptor(_x_desc);
cudnnDestroyTensorDescriptor(_mask_desc);
}
void ScaledDotProductAttentionTRT::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT {
if (nbInputs != 4) {
mask_dim = 0;
} else {
mask_dim = in[3].desc.dims.nbDims;
}
}
int ScaledDotProductAttentionTRT::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs,
void *workSpace, cudaStream_t stream) TRT_NOEXCEPT {
if (CUDNN_STATUS_SUCCESS != cudnnSetStream(_cudnn_handle, stream)) return 1;
if (CUBLAS_STATUS_SUCCESS != cublasSetStream(_cublas_handle, stream)) return 1;
int B = inputDesc[0].dims.d[0]; // batch * heads
int Nt = inputDesc[0].dims.d[1];
int Ns = inputDesc[1].dims.d[1];
int E = inputDesc[0].dims.d[2];  // embedding size
const void *query = inputs[0];
const void *key = inputs[1];
const void *value = inputs[2];
const void *mask = nullptr;
int mask_dims[3];
mask_dims[0] = 0;
if (mask_dim > 0) {
mask = inputs[3];
// check if mask need broadcast
if (mask_dim == 2) {
mask_dims[0] = 1;
mask_dims[1] = inputDesc[3].dims.d[0];
mask_dims[2] = inputDesc[3].dims.d[1];
} else {
mask_dims[0] = inputDesc[3].dims.d[0];
mask_dims[1] = inputDesc[3].dims.d[1];
mask_dims[2] = inputDesc[3].dims.d[2];
}
}
void *output = outputs[0];
void *attn = outputs[1];
auto data_type = inputDesc[0].type;
cudnnDataType_t cudnn_dtype{};
convert_trt2cudnn_dtype(data_type, &cudnn_dtype);
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
dot_product_attention_impl<float>((float *)query, (float *)key, (float *)value, (float *)mask,
(float *)attn, (float *)output, B, Nt, Ns, E, &mask_dims[0],
_x_desc, _y_desc, _mask_desc, cudnn_dtype, stream,
_cublas_handle, _cudnn_handle);
break;
default:
return 1;
}
return 0;
}
nvinfer1::DataType ScaledDotProductAttentionTRT::getOutputDataType(
int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *ScaledDotProductAttentionTRT::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *ScaledDotProductAttentionTRT::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
int ScaledDotProductAttentionTRT::getNbOutputs() const TRT_NOEXCEPT { return 2; }
size_t ScaledDotProductAttentionTRT::getSerializationSize() const TRT_NOEXCEPT { return 0; }
void ScaledDotProductAttentionTRT::serialize(void *buffer) const TRT_NOEXCEPT {}
////////////////////// creator /////////////////////////////
ScaledDotProductAttentionTRTCreator::ScaledDotProductAttentionTRTCreator() {}
const char *ScaledDotProductAttentionTRTCreator::getPluginName() const TRT_NOEXCEPT {
return PLUGIN_NAME;
}
const char *ScaledDotProductAttentionTRTCreator::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
nvinfer1::IPluginV2 *ScaledDotProductAttentionTRTCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
ScaledDotProductAttentionTRT *plugin = new ScaledDotProductAttentionTRT(name);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::IPluginV2 *ScaledDotProductAttentionTRTCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
auto plugin = new ScaledDotProductAttentionTRT(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(ScaledDotProductAttentionTRTCreator);
} // namespace mmdeploy

View File

@ -0,0 +1,73 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef TRT_SCALED_DOT_PRODUCT_ATTENTION_HPP
#define TRT_SCALED_DOT_PRODUCT_ATTENTION_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class ScaledDotProductAttentionTRT : public TRTPluginBase {
public:
ScaledDotProductAttentionTRT(const std::string &name);
ScaledDotProductAttentionTRT(const std::string name, const void *data, size_t length);
ScaledDotProductAttentionTRT() = delete;
~ScaledDotProductAttentionTRT() TRT_NOEXCEPT override;
virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
void attachToContext(cudnnContext *cudnn, cublasContext *cublas,
nvinfer1::IGpuAllocator *allocator) TRT_NOEXCEPT override;
void detachFromContext() TRT_NOEXCEPT override;
private:
int mask_dim;
cublasHandle_t _cublas_handle{};
cudnnHandle_t _cudnn_handle{};
cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _mask_desc{};
};
class ScaledDotProductAttentionTRTCreator : public TRTPluginCreatorBase {
public:
ScaledDotProductAttentionTRTCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_SCALED_DOT_PRODUCT_ATTENTION_HPP

View File

@ -0,0 +1,103 @@
// Copyright (c) OpenMMLab. All rights reserved
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/transform.h>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "scaled_dot_product_attention_kernel.hpp"
#include "trt_plugin_helper.hpp"
template <typename scalar_t>
cublasStatus_t cublasgemmStridedBatchedWrap(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const scalar_t* alpha, const scalar_t* A, int lda,
long long int strideA, const scalar_t* B, int ldb,
long long int strideB, const scalar_t* beta,
scalar_t* C, int ldc, long long int strideC,
int batchCount);
template <>
cublasStatus_t cublasgemmStridedBatchedWrap<float>(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float* alpha, const float* A, int lda,
long long int strideA, const float* B, int ldb,
long long int strideB, const float* beta,
float* C, int ldc, long long int strideC,
int batchCount) {
return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb,
strideB, beta, C, ldc, strideC, batchCount);
}
template <>
cublasStatus_t cublasgemmStridedBatchedWrap<__half>(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const __half* alpha, const __half* A, int lda,
long long int strideA, const __half* B, int ldb,
long long int strideB, const __half* beta,
__half* C, int ldc, long long int strideC,
int batchCount) {
return cublasHgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb,
strideB, beta, C, ldc, strideC, batchCount);
}
template <typename scalar_t>
void dot_product_attention_impl(const scalar_t* query, const scalar_t* key, const scalar_t* value,
const scalar_t* mask, scalar_t* attn, scalar_t* output, int B,
int Nt, int Ns, int E, const int* mask_dims,
cudnnTensorDescriptor_t& x_desc, cudnnTensorDescriptor_t& y_desc,
cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype,
cudaStream_t stream, cublasHandle_t cublas_handle,
cudnnHandle_t cudnn_handle) {
{
// Q @ K
const int m = Ns;
const int n = Nt;
const int k = E;
const auto alpha = scalar_t(1.0f / sqrt(float(E)));
const auto beta = scalar_t(0);
cublasgemmStridedBatchedWrap(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, key, k,
Ns * E, query, k, Nt * E, &beta, attn, m, Nt * Ns, B);
}
if (mask_dims != nullptr && mask_dims[0] != 0) {
const auto alpha = scalar_t(1);
const auto beta = scalar_t(1);
cudnnSetTensor4dDescriptor(mask_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, mask_dims[0],
mask_dims[1], mask_dims[2]);
cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, B, Nt, Ns);
cudnnAddTensor(cudnn_handle, &alpha, mask_desc, mask, &beta, x_desc, attn);
}
{
// softmax attention
const auto alpha = scalar_t(1);
const auto beta = scalar_t(0);
cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, B * Nt, Ns, 1, 1);
cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, B * Nt, Ns, 1, 1);
cudnnSoftmaxForward(cudnn_handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, &alpha,
x_desc, attn, &beta, y_desc, attn);
}
{
// attn @ v
const int m = E;
const int n = Nt;
const int k = Ns;
const auto alpha = scalar_t(1);
const auto beta = scalar_t(0);
cublasgemmStridedBatchedWrap(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, value, m,
Ns * E, (const scalar_t*)(attn), k, Ns * Nt, &beta, output, m,
Nt * E, B);
}
}
template void dot_product_attention_impl<float>(
const float* query, const float* key, const float* value, const float* mask, float* attn,
float* output, int B, int Nt, int Ns, int E, const int* mask_dims,
cudnnTensorDescriptor_t& x_desc, cudnnTensorDescriptor_t& y_desc,
cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype, cudaStream_t stream,
cublasHandle_t cublas_handle, cudnnHandle_t cudnn_handle);
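
As a cross-check on the cuBLAS/cuDNN call sequence above (Q @ K^T scaled by 1/sqrt(E), an optional additive mask broadcast over the batch, a softmax over the Ns axis, then the product with V), the following NumPy sketch reproduces the same math. It is a reference for the computation only, with arbitrary example sizes, not the plugin's implementation.

```python
import numpy as np


def scaled_dot_product_attention_ref(q, k, v, mask=None):
    """q: (B, Nt, E), k/v: (B, Ns, E), mask broadcastable to (B, Nt, Ns); B folds batch*heads."""
    E = q.shape[-1]
    attn = q @ k.transpose(0, 2, 1) / np.sqrt(E)        # Q @ K^T, scaled by 1/sqrt(E)
    if mask is not None:
        attn = attn + mask                               # additive mask, as with cudnnAddTensor
    attn = np.exp(attn - attn.max(-1, keepdims=True))    # numerically stable softmax over Ns
    attn /= attn.sum(-1, keepdims=True)
    return attn @ v, attn                                # same two outputs as the plugin: (output, attn)


B, Nt, Ns, E = 2 * 8, 16, 24, 32                         # e.g. batch=2, heads=8
q, k, v = (np.random.rand(B, n, E).astype(np.float32) for n in (Nt, Ns, Ns))
out, attn = scaled_dot_product_attention_ref(q, k, v)
print(out.shape, attn.shape)                             # (16, 16, 32) (16, 16, 24)
```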

View File

@ -0,0 +1,17 @@
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_SCALED_DOT_PRODUCT_ATTENTION_KERNEL_HPP
#define TRT_SCALED_DOT_PRODUCT_ATTENTION_KERNEL_HPP
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cudnn.h>
template <typename scalar_t>
void dot_product_attention_impl(const scalar_t* query, const scalar_t* key, const scalar_t* value,
const scalar_t* mask, scalar_t* attn, scalar_t* output, int B,
int Nt, int Ns, int E, const int* mask_dims,
cudnnTensorDescriptor_t& x_desc, cudnnTensorDescriptor_t& y_desc,
cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype,
cudaStream_t stream, cublasHandle_t cublas_handle,
cudnnHandle_t cudnn_handle);
#endif

View File

@ -1,10 +0,0 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "torch/script.h"
TORCH_LIBRARY(mmdeploy, m) {
m.def(
"modulated_deform_conv(Tensor input, Tensor weight, Tensor bias, Tensor offset, Tensor "
"mask, "
"int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int "
"dilation_h,int dilation_w, int groups, int deform_groups, bool with_bias) -> Tensor");
}

View File

@ -0,0 +1,13 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "torch/script.h"
TORCH_LIBRARY(mmdeploy, m) {
m.def(
"modulated_deform_conv(Tensor input, Tensor weight, Tensor bias, Tensor offset, Tensor "
"mask, "
"int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int "
"dilation_h,int dilation_w, int groups, int deform_groups, bool with_bias) -> Tensor")
.def(
"coreml_nms(Tensor boxes, Tensor scores, float iou_threshold, "
"float score_threshold, int max_boxes) -> Tensor[]");
}

View File

@ -0,0 +1,31 @@
#include <assert.h>
#include <vector>
#include "torch/script.h"
namespace mmdeploy {
using at::Tensor;
std::vector<Tensor> coreml_nms_cpu(Tensor boxes, Tensor scores, double iou_threshold,
double score_threshold, int64_t max_boxes) {
assert(boxes.dim() == 3); // bboxes with shape (batch_size, num_bboxes, 4)
assert(boxes.size(2) == 4);
assert(boxes.size(0) == scores.size(0)); // check batch size
assert(boxes.size(1) == scores.size(1)); // check num boxes
auto batch_size = boxes.size(0);
auto num_boxes = boxes.size(1);
auto num_classes = scores.size(2);
Tensor ret_boxes = at::zeros({batch_size, max_boxes, 4});
Tensor ret_scores = at::zeros({batch_size, max_boxes, num_classes});
Tensor indices = at::zeros({batch_size, max_boxes}, at::kInt);
Tensor num_outputs = at::zeros({batch_size}, at::kInt);
return std::vector<Tensor>({ret_boxes, ret_scores, indices, num_outputs});
}
TORCH_LIBRARY_IMPL(mmdeploy, CPU, m) { m.impl("coreml_nms", coreml_nms_cpu); }
} // namespace mmdeploy
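
For reference, once the ops library containing the registration above is built, the op is reachable through `torch.ops` under the `mmdeploy` namespace. The snippet below is a usage sketch only: the library filename/path is an assumption about your build tree, and the CPU implementation in this diff returns zero-filled placeholder tensors (it exists so the op can be traced and exported, not to run NMS on CPU).

```python
import torch

# the library path is an assumption; point it at the TorchScript ops library you built
torch.ops.load_library('build/lib/libmmdeploy_torchscript_ops.so')

boxes = torch.rand(1, 200, 4)    # (batch, num_boxes, 4)
scores = torch.rand(1, 200, 80)  # (batch, num_boxes, num_classes)

# matches the registered schema:
# coreml_nms(Tensor boxes, Tensor scores, float iou_threshold,
#            float score_threshold, int max_boxes) -> Tensor[]
ret_boxes, ret_scores, indices, num_outputs = torch.ops.mmdeploy.coreml_nms(
    boxes, scores, 0.5, 0.05, 100)
print(ret_boxes.shape, indices.shape)  # torch.Size([1, 100, 4]) torch.Size([1, 100])
```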

View File

@ -0,0 +1,55 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include <algorithm>
#include <numeric>
#include "mmdeploy/codebase/mmcls/mmcls.h"
#include "mmdeploy/core/tensor.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/experimental/module_adapter.h"
using std::vector;
namespace mmdeploy::mmcls {
class MultiLabelLinearClsHead : public MMClassification {
public:
explicit MultiLabelLinearClsHead(const Value& cfg) : MMClassification(cfg) {}
Result<Value> operator()(const Value& infer_res) {
MMDEPLOY_DEBUG("infer_res: {}", infer_res);
auto output = infer_res["output"].get<Tensor>();
if (!(output.shape().size() >= 2 && output.data_type() == DataType::kFLOAT)) {
MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", output.shape(),
(int)output.data_type());
return Status(eNotSupported);
}
auto class_num = (int)output.shape(1);
OUTCOME_TRY(auto _scores, MakeAvailableOnDevice(output, kHost, stream()));
OUTCOME_TRY(stream().Wait());
return GetLabels(_scores, class_num);
}
private:
Value GetLabels(const Tensor& scores, int class_num) const {
auto scores_data = scores.data<float>();
ClassifyOutput output;
for (int i = 0; i < class_num; ++i) {
auto label = ClassifyOutput::Label{i, scores_data[i]};
MMDEPLOY_DEBUG("label_id: {}, score: {}", label.label_id, label.score);
output.labels.push_back(label);
}
return to_value(std::move(output));
}
private:
static constexpr const auto kHost = Device{0};
};
REGISTER_CODEBASE_COMPONENT(MMClassification, MultiLabelLinearClsHead);
} // namespace mmdeploy::mmcls

View File

@ -4,6 +4,7 @@
#include "mmdeploy/core/registry.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/experimental/module_adapter.h"
using namespace std;

View File

@ -68,7 +68,7 @@ class Device {
constexpr explicit Device(int platform_id, int device_id = 0)
: platform_id_(platform_id), device_id_(device_id) {}
MMDEPLOY_API explicit Device(const char *platform_name, int device_id = 0);
MMDEPLOY_API explicit Device(const char* platform_name, int device_id = 0);
constexpr int device_id() const noexcept { return device_id_; }
@ -78,11 +78,11 @@ class Device {
constexpr bool is_device() const noexcept { return platform_id() > 0; }
constexpr bool operator==(const Device &other) const noexcept {
constexpr bool operator==(const Device& other) const noexcept {
return platform_id_ == other.platform_id_ && device_id_ == other.device_id_;
}
constexpr bool operator!=(const Device &other) const noexcept { return !(*this == other); }
constexpr bool operator!=(const Device& other) const noexcept { return !(*this == other); }
constexpr explicit operator bool() const noexcept { return platform_id_ >= 0 && device_id_ >= 0; }
@ -104,7 +104,7 @@ enum class MemcpyKind : int { HtoD, DtoH, DtoD };
class MMDEPLOY_API Platform {
public:
// throws if not found
explicit Platform(const char *platform_name);
explicit Platform(const char* platform_name);
// throws if not found
explicit Platform(int platform_id);
@ -113,11 +113,11 @@ class MMDEPLOY_API Platform {
int GetPlatformId() const;
// "" if invalid
const char *GetPlatformName() const;
const char* GetPlatformName() const;
bool operator==(const Platform &other) { return impl_ == other.impl_; }
bool operator==(const Platform& other) { return impl_ == other.impl_; }
bool operator!=(const Platform &other) { return !(*this == other); }
bool operator!=(const Platform& other) { return !(*this == other); }
explicit operator bool() const noexcept { return static_cast<bool>(impl_); }
@ -132,7 +132,7 @@ class MMDEPLOY_API Platform {
Platform GetPlatform(int platform_id);
Platform GetPlatform(const char *platform_name);
Platform GetPlatform(const char* platform_name);
class MMDEPLOY_API Stream {
public:
@ -140,7 +140,7 @@ class MMDEPLOY_API Stream {
explicit Stream(Device device, uint64_t flags = 0);
explicit Stream(Device device, void *native, uint64_t flags = 0);
explicit Stream(Device device, void* native, uint64_t flags = 0);
explicit Stream(Device device, std::shared_ptr<void> native, uint64_t flags = 0);
@ -150,25 +150,25 @@ class MMDEPLOY_API Stream {
Result<void> Wait();
Result<void> DependsOn(Event &event);
Result<void> DependsOn(Event& event);
Result<void> Submit(Kernel &kernel);
Result<void> Submit(Kernel& kernel);
void *GetNative(ErrorCode *ec = nullptr);
void* GetNative(ErrorCode* ec = nullptr);
Result<void> Copy(const Buffer &src, Buffer &dst, size_t size = -1, size_t src_offset = 0,
Result<void> Copy(const Buffer& src, Buffer& dst, size_t size = -1, size_t src_offset = 0,
size_t dst_offset = 0);
Result<void> Copy(const void *host_ptr, Buffer &dst, size_t size = -1, size_t dst_offset = 0);
Result<void> Copy(const void* host_ptr, Buffer& dst, size_t size = -1, size_t dst_offset = 0);
Result<void> Copy(const Buffer &src, void *host_ptr, size_t size = -1, size_t src_offset = 0);
Result<void> Copy(const Buffer& src, void* host_ptr, size_t size = -1, size_t src_offset = 0);
Result<void> Fill(const Buffer &dst, void *pattern, size_t pattern_size, size_t size = -1,
Result<void> Fill(const Buffer& dst, void* pattern, size_t pattern_size, size_t size = -1,
size_t offset = 0);
bool operator==(const Stream &other) const { return impl_ == other.impl_; }
bool operator==(const Stream& other) const { return impl_ == other.impl_; }
bool operator!=(const Stream &other) const { return !(*this == other); }
bool operator!=(const Stream& other) const { return !(*this == other); }
explicit operator bool() const noexcept { return static_cast<bool>(impl_); }
@ -184,7 +184,7 @@ class MMDEPLOY_API Stream {
};
template <typename T>
T GetNative(Stream &stream, ErrorCode *ec = nullptr) {
T GetNative(Stream& stream, ErrorCode* ec = nullptr) {
return reinterpret_cast<T>(stream.GetNative(ec));
}
@ -194,7 +194,7 @@ class MMDEPLOY_API Event {
explicit Event(Device device, uint64_t flags = 0);
explicit Event(Device device, void *native, uint64_t flags = 0);
explicit Event(Device device, void* native, uint64_t flags = 0);
explicit Event(Device device, std::shared_ptr<void> native, uint64_t flags = 0);
@ -204,13 +204,13 @@ class MMDEPLOY_API Event {
Result<void> Wait();
Result<void> Record(Stream &stream);
Result<void> Record(Stream& stream);
void *GetNative(ErrorCode *ec = nullptr);
void* GetNative(ErrorCode* ec = nullptr);
bool operator==(const Event &other) const { return impl_ == other.impl_; }
bool operator==(const Event& other) const { return impl_ == other.impl_; }
bool operator!=(const Event &other) const { return !(*this == other); }
bool operator!=(const Event& other) const { return !(*this == other); }
explicit operator bool() const noexcept { return static_cast<bool>(impl_); }
@ -223,7 +223,7 @@ class MMDEPLOY_API Event {
};
template <typename T>
T GetNative(Event &event, ErrorCode *ec = nullptr) {
T GetNative(Event& event, ErrorCode* ec = nullptr) {
return reinterpret_cast<T>(event.GetNative(ec));
}
@ -234,7 +234,7 @@ class MMDEPLOY_API Kernel {
Device GetDevice() const;
void *GetNative(ErrorCode *ec = nullptr);
void* GetNative(ErrorCode* ec = nullptr);
explicit operator bool() const noexcept { return static_cast<bool>(impl_); }
@ -243,7 +243,7 @@ class MMDEPLOY_API Kernel {
};
template <typename T>
T GetNative(Kernel &kernel, ErrorCode *ec = nullptr) {
T GetNative(Kernel& kernel, ErrorCode* ec = nullptr) {
return reinterpret_cast<T>(kernel.GetNative(ec));
}
@ -269,25 +269,25 @@ class MMDEPLOY_API Buffer {
Buffer(Device device, size_t size, Allocator allocator, size_t alignment = 1, uint64_t flags = 0);
Buffer(Device device, size_t size, void *native, uint64_t flags = 0);
Buffer(Device device, size_t size, void* native, uint64_t flags = 0);
Buffer(Device device, size_t size, std::shared_ptr<void> native, uint64_t flags = 0);
// create sub-buffer
Buffer(Buffer &buffer, size_t offset, size_t size, uint64_t flags = 0);
Buffer(Buffer& buffer, size_t offset, size_t size, uint64_t flags = 0);
size_t GetSize(ErrorCode *ec = nullptr) const;
size_t GetSize(ErrorCode* ec = nullptr) const;
// bool IsSubBuffer(ErrorCode *ec = nullptr);
// bool IsSubBuffer(ErrorCode* ec = nullptr);
void *GetNative(ErrorCode *ec = nullptr) const;
void* GetNative(ErrorCode* ec = nullptr) const;
Device GetDevice() const;
Allocator GetAllocator() const;
bool operator==(const Buffer &other) const { return impl_ == other.impl_; }
bool operator==(const Buffer& other) const { return impl_ == other.impl_; }
bool operator!=(const Buffer &other) const { return !(*this == other); }
bool operator!=(const Buffer& other) const { return !(*this == other); }
explicit operator bool() const noexcept { return static_cast<bool>(impl_); }
@ -300,12 +300,12 @@ class MMDEPLOY_API Buffer {
};
template <typename T>
T GetNative(Buffer &buffer, ErrorCode *ec = nullptr) {
T GetNative(Buffer& buffer, ErrorCode* ec = nullptr) {
return reinterpret_cast<T>(buffer.GetNative(ec));
}
template <typename T>
T GetNative(const Buffer &buffer, ErrorCode *ec = nullptr) {
T GetNative(const Buffer& buffer, ErrorCode* ec = nullptr) {
return reinterpret_cast<T>(buffer.GetNative(ec));
}
@ -315,13 +315,15 @@ class MMDEPLOY_API PlatformRegistry {
int Register(Creator creator);
int GetPlatform(const char *name, Platform *platform);
int AddAlias(const char* name, const char* target);
int GetPlatform(int id, Platform *platform);
int GetPlatform(const char* name, Platform* platform);
int GetPlatformId(const char *name);
int GetPlatform(int id, Platform* platform);
PlatformImpl *GetPlatformImpl(PlatformId id);
int GetPlatformId(const char* name);
PlatformImpl* GetPlatformImpl(PlatformId id);
private:
int GetNextId();
@ -335,8 +337,9 @@ class MMDEPLOY_API PlatformRegistry {
Platform platform;
};
std::vector<Entry> entries_;
std::vector<std::pair<std::string, std::string>> aliases_;
};
MMDEPLOY_API PlatformRegistry &gPlatformRegistry();
MMDEPLOY_API PlatformRegistry& gPlatformRegistry();
} // namespace mmdeploy
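
The new alias table lets one platform name resolve to another registered platform during lookup. A minimal sketch of the path added above, using the same "npu" -> "cpu" alias that the ACL device module registers later in this change (the include path follows that module; treat it as an assumption):

```cpp
// Minimal sketch of the alias mechanism declared above.
#include "mmdeploy/core/device_impl.h"  // assumed to expose gPlatformRegistry()

void AliasExample() {
  using namespace mmdeploy;
  gPlatformRegistry().AddAlias("npu", "cpu");  // same call the ACL module makes
  Platform platform;
  if (gPlatformRegistry().GetPlatform("npu", &platform) == 0) {
    // "npu" resolved through the alias table to the host ("cpu") platform
  }
}
```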

View File

@ -321,6 +321,11 @@ int PlatformRegistry::Register(Creator creator) {
return 0;
}
int PlatformRegistry::AddAlias(const char* name, const char* target) {
aliases_.emplace_back(name, target);
return 0;
}
int PlatformRegistry::GetNextId() {
for (int i = 1;; ++i) {
if (IsAvailable(i)) {
@ -339,6 +344,12 @@ bool PlatformRegistry::IsAvailable(int id) {
}
int PlatformRegistry::GetPlatform(const char* name, Platform* platform) {
for (const auto& alias : aliases_) {
if (name == alias.first) {
name = alias.second.c_str();
break;
}
}
for (const auto& entry : entries_) {
if (entry.name == name) {
*platform = entry.platform;
@ -357,7 +368,14 @@ int PlatformRegistry::GetPlatform(int id, Platform* platform) {
}
return -1;
}
int PlatformRegistry::GetPlatformId(const char* name) {
for (const auto& alias : aliases_) {
if (name == alias.first) {
name = alias.second.c_str();
break;
}
}
for (const auto& entry : entries_) {
if (entry.name == name) {
return entry.id;

View File

@ -20,6 +20,7 @@ Model::Model(const std::string& model_path) {
Model::Model(const void* buffer, size_t size) { Init(buffer, size).value(); }
Result<void> Model::Init(const std::string& model_path) {
model_path_ = model_path;
if (!fs::exists(model_path)) {
MMDEPLOY_ERROR("'{}' doesn't exist", model_path);
return Status(eFileNotExist);
@ -45,6 +46,8 @@ Result<void> Model::Init(const std::string& model_path) {
return Status(eNotSupported);
}
const std::string& Model::GetModelPath() const { return model_path_; }
Result<void> Model::Init(const void* buffer, size_t size) {
auto registry = ModelRegistry::Get();
auto entries = registry.ListEntries();

View File

@ -94,7 +94,14 @@ class MMDEPLOY_API Model {
*/
explicit operator bool() const { return impl_ != nullptr; }
/**
* @brief Get the model path used when initializing with a DirectoryModel
* @return the file path of the SDK model
*/
const std::string& GetModelPath() const;
private:
std::string model_path_;
std::shared_ptr<ModelImpl> impl_;
deploy_meta_info_t meta_;
};

View File

@ -94,17 +94,23 @@ class Span {
constexpr Span& operator=(const Span& other) noexcept = default;
friend bool operator==(const Span& a, const Span& b) {
if (a.size() != b.size()) return false;
template <typename U>
friend bool operator!=(const Span& a, const Span<U>& b) {
if (a.size() != b.size()) {
return true;
}
for (size_type i = 0; i < a.size(); ++i) {
if (a[i] != b[i]) {
return false;
return true;
}
}
return true;
return false;
}
friend bool operator!=(const Span& a, const Span& b) { return !(a == b); }
template <typename U>
friend bool operator==(const Span& a, const Span<U>& b) {
return !(a != b);
}
private:
T* data_;
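
The comparison operators are now templated on the element type of the right-hand span, so spans over `T` and `const T` (or otherwise comparable element types) can be compared element-wise. A small sketch, relying on the pointer+size constructor and class template argument deduction that other code in this change already uses (e.g. AclNet::ReshapeStatic):

```cpp
// Sketch of the mixed-element-type comparison enabled above.
// (Span header as in the file above; exact include path omitted.)
#include <cstdint>

void SpanCompareExample() {
  int64_t a[] = {1, 3, 224, 224};
  const int64_t b[] = {1, 3, 224, 224};
  mmdeploy::Span lhs(a, 4);  // Span<int64_t>
  mmdeploy::Span rhs(b, 4);  // Span<const int64_t>
  bool same = (lhs == rhs);  // true: equal sizes and element-wise equal
  (void)same;
}
```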

View File

@ -115,9 +115,9 @@ Result<void> Tensor::CopyFrom(const Tensor& tensor, Stream stream) {
if (!stream) {
auto device = desc_.device.is_device() ? desc_.device : tensor.desc().device;
auto default_stream = Stream::GetDefault(device);
OUTCOME_TRY(default_stream.Copy(tensor.buffer(), buffer_));
OUTCOME_TRY(default_stream.Copy(tensor.buffer(), buffer_, tensor.byte_size()));
} else {
OUTCOME_TRY(stream.Copy(tensor.buffer(), buffer_));
OUTCOME_TRY(stream.Copy(tensor.buffer(), buffer_, tensor.byte_size()));
}
return success();
}
@ -141,9 +141,9 @@ Result<void> Tensor::CopyTo(Tensor& tensor, Stream stream) const {
if (!stream) {
Device device = desc_.device.is_device() ? desc_.device : tensor.desc().device;
Stream default_stream = Stream::GetDefault(device);
return default_stream.Copy(buffer_, tensor.buffer());
return default_stream.Copy(buffer_, tensor.buffer(), byte_size());
} else {
return stream.Copy(buffer_, tensor.buffer());
return stream.Copy(buffer_, tensor.buffer(), byte_size());
}
}
@ -158,9 +158,9 @@ Result<void> Tensor::CopyFrom(void* host_ptr, Stream stream) {
Allocate();
if (!stream) {
auto default_stream = Stream::GetDefault(desc_.device);
return default_stream.Copy(host_ptr, buffer_, buffer_.GetSize());
return default_stream.Copy(host_ptr, buffer_, byte_size());
} else {
return stream.Copy(host_ptr, buffer_, buffer_.GetSize());
return stream.Copy(host_ptr, buffer_, byte_size());
}
}
@ -174,9 +174,9 @@ Result<void> Tensor::CopyTo(void* host_ptr, Stream stream) const {
}
if (!stream) {
auto default_stream = Stream::GetDefault(desc_.device);
return default_stream.Copy(buffer_, host_ptr, buffer_.GetSize());
return default_stream.Copy(buffer_, host_ptr, byte_size());
} else {
return stream.Copy(buffer_, host_ptr, buffer_.GetSize());
return stream.Copy(buffer_, host_ptr, byte_size());
}
}
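
The copies above are now bounded by the tensor's logical byte_size() instead of the capacity of the underlying buffer, so a tensor whose shape describes fewer elements than its allocation no longer over-copies. A hedged sketch of the host-to-tensor case (the Tensor(desc) constructor and transitive includes are assumptions; the other calls appear elsewhere in this change):

```cpp
// Hedged sketch: only byte_size() bytes are copied from the host pointer,
// even if the backing buffer is larger.
#include <vector>
#include "mmdeploy/core/tensor.h"

mmdeploy::Result<void> CopyExample() {
  using namespace mmdeploy;
  TensorDesc desc{Device(0), DataType::kFLOAT, {1, 3, 8, 8}, "input"};
  Tensor tensor(desc);  // assumption: allocates a buffer of at least byte_size() bytes
  std::vector<float> host(1 * 3 * 8 * 8, 1.f);
  // Copies tensor.byte_size() == host.size() * sizeof(float) bytes.
  OUTCOME_TRY(tensor.CopyFrom(host.data(), Stream::GetDefault(Device(0))));
  return success();
}
```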

View File

@ -5,3 +5,7 @@ add_subdirectory(cpu)
if ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES)
add_subdirectory(cuda)
endif ()
if ("acl" IN_LIST MMDEPLOY_TARGET_BACKENDS)
add_subdirectory(acl)
endif ()

View File

@ -0,0 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_acl_device)
file(GLOB_RECURSE SRCS "*.cpp")
mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")

View File

@ -0,0 +1,14 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/core/device_impl.h"
namespace mmdeploy {
class AclPlatformRegisterer {
public:
AclPlatformRegisterer() { gPlatformRegistry().AddAlias("npu", "cpu"); }
};
AclPlatformRegisterer g_acl_platform_registerer;
} // namespace mmdeploy

View File

@ -105,7 +105,7 @@ Result<void> CpuPlatformImpl::CopyImpl(const void* src, void* dst, size_t src_si
task();
return success();
}
if (st.GetDevice() != Device(0, 0)) {
if (st.GetDevice().platform_id() != 0) {
return Status(eInvalidArgument);
}
auto cpu_stream = static_cast<CpuStreamImpl*>(st.GetNative());
@ -126,6 +126,7 @@ Result<void> CpuPlatformImpl::Copy(const void* host_ptr, Buffer dst, size_t size
}
return CopyImpl(host_ptr, dst_ptr, size, dst.GetSize(), 0, dst_offset, size, stream);
}
Result<void> CpuPlatformImpl::Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset,
Stream stream) {
auto src_ptr = src.GetNative();
@ -145,7 +146,7 @@ Result<void> CpuPlatformImpl::Copy(Buffer src, Buffer dst, size_t size, size_t s
return Status(eInvalidArgument);
}
auto device = src.GetDevice();
if (device.platform_id() != 0 || device != dst.GetDevice()) {
if (device.platform_id() != 0 || device.platform_id() != dst.GetDevice().platform_id()) {
return Status(eInvalidArgument);
}
return CopyImpl(src_ptr, dst_ptr, src.GetSize(), dst.GetSize(), src_offset, dst_offset, size,

View File

@ -26,5 +26,17 @@ if ("snpe" IN_LIST MMDEPLOY_TARGET_BACKENDS)
add_subdirectory(snpe)
endif ()
if ("acl" IN_LIST MMDEPLOY_TARGET_BACKENDS)
add_subdirectory(acl)
endif ()
if ("torchscript" IN_LIST MMDEPLOY_TARGET_BACKENDS)
add_subdirectory(torchscript)
endif ()
if ("coreml" IN_LIST MMDEPLOY_TARGET_BACKENDS)
add_subdirectory(coreml)
endif ()
mmdeploy_add_module(${PROJECT_NAME} net_module.cpp)
add_library(mmdeploy::net_module ALIAS ${PROJECT_NAME})

View File

@ -0,0 +1,14 @@
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_acl_net)
if ("acl" IN_LIST MMDEPLOY_TARGET_BACKENDS)
if (NOT DEFINED ASCEND_TOOLKIT_HOME)
set(ASCEND_TOOLKIT_HOME $ENV{ASCEND_TOOLKIT_HOME})
endif ()
mmdeploy_add_module(${PROJECT_NAME} acl_net.cpp)
target_include_directories(${PROJECT_NAME} PRIVATE
$<BUILD_INTERFACE:${ASCEND_TOOLKIT_HOME}/runtime/include>)
target_link_libraries(${PROJECT_NAME} PRIVATE
$<BUILD_INTERFACE:${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub/libascendcl.so>)
endif ()

View File

@ -0,0 +1,659 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/net/acl/acl_net.h"
#include "mmdeploy/core/logger.h"
#include "mmdeploy/core/model.h"
#include "mmdeploy/core/utils/formatter.h"
std::ostream& operator<<(std::ostream& os, const aclmdlIODims& dims) {
os << dims.name << " [";
for (int i = 0; i < dims.dimCount; ++i) {
os << (i ? ", " : "") << dims.dims[i];
}
os << "]";
return os;
}
std::ostream& operator<<(std::ostream& os, const aclmdlBatch& batch) {
os << "batch [";
for (int i = 0; i < batch.batchCount; ++i) {
os << (i ? ", " : "") << batch.batch[i];
}
os << "]";
return os;
}
std::ostream& operator<<(std::ostream& os, const aclmdlHW& hw) {
os << "HW [";
for (int i = 0; i < hw.hwCount; ++i) {
os << (i ? ", " : "") << "(" << hw.hw[i][0] << ", " << hw.hw[i][1] << ")";
}
os << "]";
return os;
}
namespace mmdeploy {
namespace {
inline Result<void> _m(aclError ec, SourceLocation loc = SourceLocation::current()) {
if (ec == ACL_SUCCESS) {
return success();
} else {
return Status(eFail, loc);
}
}
template <typename T>
inline Result<T*> _p(T* ptr, SourceLocation loc = SourceLocation::current()) {
if (ptr) {
return ptr;
} else {
return Status(eFail, loc);
}
}
struct Context {
Context() {
std::lock_guard lock{mutex_};
if (ref_count_++ != 0) {
return;
}
auto ret = aclInit(nullptr);
if (ret == ACL_SUCCESS) {
MMDEPLOY_INFO("ACL initialized.");
owned_acl_ = true;
} else if (ret == ACL_ERROR_REPEAT_INITIALIZE) {
MMDEPLOY_INFO("ACL has already been initialized.");
} else {
MMDEPLOY_ERROR("aclInit() failed: {}", ret);
assert(ret == 0);
}
}
~Context() {
std::lock_guard lock{mutex_};
if (--ref_count_ != 0) {
return;
}
// skip aclFinalize if aclInit is not successfully called by us.
if (owned_acl_) {
auto ret = aclFinalize();
if (ret == ACL_SUCCESS) {
MMDEPLOY_INFO("ACL finalized.");
owned_acl_ = false;
} else if (ret == ACL_ERROR_REPEAT_FINALIZE) {
MMDEPLOY_INFO("ACL has already been finalized.");
} else {
MMDEPLOY_ERROR("aclFinalize() failed: {}", ret);
}
}
}
static bool owned_acl_;
static int ref_count_;
static std::mutex mutex_;
};
bool Context::owned_acl_ = false;
int Context::ref_count_ = 0;
std::mutex Context::mutex_{};
} // namespace
AclNet::~AclNet() {
auto dtor = [&]() -> Result<void> {
auto n_inputs = aclmdlGetDatasetNumBuffers(input_dataset_);
for (int i = 0; i < n_inputs; ++i) {
auto buffer = aclmdlGetDatasetBuffer(input_dataset_, i);
auto data = aclGetDataBufferAddr(buffer);
OUTCOME_TRY(_m(aclrtFree(data)));
}
input_tensor_.clear();
OUTCOME_TRY(_m(aclmdlDestroyDataset(input_dataset_)));
auto n_outputs = aclmdlGetDatasetNumBuffers(output_dataset_);
for (int i = 0; i < n_outputs; ++i) {
auto buffer = aclmdlGetDatasetBuffer(output_dataset_, i);
auto data = aclGetDataBufferAddr(buffer);
OUTCOME_TRY(_m(aclrtFree(data)));
}
output_tensor_.clear();
OUTCOME_TRY(_m(aclmdlDestroyDataset(output_dataset_)));
OUTCOME_TRY(_m(aclmdlDestroyDesc(model_desc_)));
OUTCOME_TRY(_m(aclmdlUnload(model_id_)));
return success();
};
if (auto r = dtor(); !r) {
MMDEPLOY_ERROR("uninit failed: {}", r.error().message().c_str());
}
}
namespace {
Result<DataType> FromAclDataType(aclDataType data_type) {
switch (data_type) {
case ACL_FLOAT:
return DataType::kFLOAT;
case ACL_FLOAT16:
return DataType::kHALF;
case ACL_INT8:
return DataType::kINT8;
case ACL_INT32:
return DataType::kINT32;
case ACL_INT64:
return DataType::kINT64;
default:
return Status(eNotSupported);
}
}
Result<aclDataType> ToAclDataType(DataType data_type) {
switch (data_type) {
case DataType::kFLOAT:
return ACL_FLOAT;
case DataType::kHALF:
return ACL_FLOAT16;
case DataType::kINT8:
return ACL_INT8;
case DataType::kINT32:
return ACL_INT32;
case DataType::kINT64:
return ACL_INT64;
default:
return Status(eNotSupported);
}
}
Result<TensorDesc> ToTensorDesc(const aclmdlIODims& dims, aclDataType data_type) {
auto extract_name = [](const std::string& name) {
if (auto pos = name.find_last_of(':'); pos != std::string::npos) {
return name.substr(pos + 1);
} else {
return name;
}
};
OUTCOME_TRY(auto _data_type, FromAclDataType(data_type));
return TensorDesc{Device(0), _data_type,
TensorShape(&dims.dims[0], &dims.dims[0] + dims.dimCount),
extract_name(dims.name)};
}
Result<size_t> GetByteSize(const aclmdlIODims& dims, aclDataType data_type) {
size_t byte_size = aclDataTypeSize(data_type);
for (int i = 0; i < dims.dimCount; ++i) {
if (dims.dims[i] < 0) {
return Status(eInvalidArgument);
}
byte_size *= dims.dims[i];
}
return byte_size;
}
} // namespace
// all dims must be fixed
auto AclNet::CreateBuffers(const aclmdlIODims& dims, aclDataType data_type) -> Result<Buffers> {
OUTCOME_TRY(auto byte_size, GetByteSize(dims, data_type));
Buffers pair{};
void* dev_ptr{};
OUTCOME_TRY(_m(aclrtMalloc(&dev_ptr, byte_size, ACL_MEM_MALLOC_HUGE_FIRST)));
OUTCOME_TRY(_m(aclrtMemset(dev_ptr, byte_size, 0, byte_size)));
OUTCOME_TRY(pair.device_buffer, _p(aclCreateDataBuffer(dev_ptr, byte_size)));
OUTCOME_TRY(auto desc, ToTensorDesc(dims, data_type));
void* host_ptr{};
OUTCOME_TRY(_m(aclrtMallocHost(&host_ptr, byte_size)));
memset(host_ptr, 0, byte_size);
pair.host_tensor =
Tensor(desc, std::shared_ptr<void>(host_ptr, [](void* p) { aclrtFreeHost(p); }));
return pair;
}
auto AclNet::CreateBuffersDynamicBatchSize(aclmdlIODims dims, aclDataType data_type)
-> Result<Buffers> {
for (int i = 0; i < dims.dimCount; ++i) {
if (dims.dims[i] == -1) {
dims.dims[i] = dynamic_batch_size_.back();
}
}
return CreateBuffers(dims, data_type);
}
auto AclNet::CreateBuffersDynamicImageSize(int index, aclmdlIODims dims, aclDataType data_type)
-> Result<Buffers> {
aclmdlHW hw_desc{};
OUTCOME_TRY(_m(aclmdlGetDynamicHW(model_desc_, index, &hw_desc)));
if (hw_desc.hwCount > 0) {
auto& val = *std::max_element(hw_desc.hw, hw_desc.hw + hw_desc.hwCount,
[](auto u, auto v) { return u[0] * u[1] < v[0] * v[1]; });
int ptr = 0;
for (int i = 0; i < dims.dimCount; ++i) {
if (dims.dims[i] == -1) {
if (ptr == 2) {
return Status(eInvalidArgument);
}
dims.dims[i] = val[ptr++];
}
}
if (ptr != 2) {
return Status(eInvalidArgument);
}
}
return CreateBuffers(dims, data_type);
}
auto AclNet::CreateBuffersDynamicDims(int index, int dim_count, const aclmdlIODims& dims,
aclDataType data_type) -> Result<Buffers> {
int max_index = -1;
size_t max_value = 0;
aclmdlIODims max_shape{};
for (int j = 0; j < dynamic_input_dims_.size(); ++j) {
aclmdlIODims shape{};
strncpy(shape.name, dims.name, sizeof(shape.name));
shape.dimCount = dims.dimCount;
std::copy(dynamic_input_dims_[j].dims + dim_count,
dynamic_input_dims_[j].dims + dim_count + dims.dimCount, shape.dims);
OUTCOME_TRY(auto byte_size, GetByteSize(shape, data_type));
if (byte_size > max_value) {
max_index = j;
max_value = byte_size;
max_shape = shape;
}
}
if (max_index < 0) {
return Status(eInvalidArgument);
}
MMDEPLOY_INFO("max shape for input {}: {}", index, max_shape);
return CreateBuffers(max_shape, data_type);
}
Result<void> AclNet::ConfigDynamicShapes() {
aclError status = ACL_SUCCESS;
{
size_t dynamic_tensor_index{};
status = aclmdlGetInputIndexByName(model_desc_, ACL_DYNAMIC_TENSOR_NAME, &dynamic_tensor_index);
if (status == ACL_SUCCESS) {
dynamic_tensor_index_ = static_cast<int>(dynamic_tensor_index);
MMDEPLOY_INFO("dynamic tensor index: {}", dynamic_tensor_index);
}
}
if (dynamic_tensor_index_ >= 0) {
aclmdlBatch batch_desc{};
status = aclmdlGetDynamicBatch(model_desc_, &batch_desc);
if (status == ACL_SUCCESS && batch_desc.batchCount > 0) {
MMDEPLOY_INFO("{}, status = {}", batch_desc, status);
input_shape_type_ = kDynamicBatchSize;
dynamic_batch_size_.insert(dynamic_batch_size_.end(), batch_desc.batch,
batch_desc.batch + batch_desc.batchCount);
std::sort(dynamic_batch_size_.begin(), dynamic_batch_size_.end());
}
size_t dynamic_gear_count{0};
if (input_shape_type_ == kStatic) {
status = aclmdlGetInputDynamicGearCount(model_desc_, -1, &dynamic_gear_count);
dynamic_input_dims_.resize(dynamic_gear_count);
if (status == ACL_SUCCESS && dynamic_gear_count > 0) {
status = aclmdlGetInputDynamicDims(model_desc_, -1, dynamic_input_dims_.data(),
dynamic_gear_count);
for (const auto& dims : dynamic_input_dims_) {
MMDEPLOY_INFO("dynamic input dims: {}", dims);
}
input_shape_type_ = kDynamicDims;
} else {
input_shape_type_ = kDynamicImageSize;
}
}
}
return success();
}
Result<void> AclNet::CreateInputBuffers() {
input_dataset_ = aclmdlCreateDataset();
auto n_inputs = aclmdlGetNumInputs(model_desc_);
MMDEPLOY_INFO("n_inputs = {}, dynamic_tensor_index_ = {}", n_inputs, dynamic_tensor_index_);
int dim_count = 0;
for (int i = 0; i < n_inputs; ++i) {
if (i == dynamic_tensor_index_) {
void* data{};
auto input_len = aclmdlGetInputSizeByIndex(model_desc_, i);
OUTCOME_TRY(_m(aclrtMalloc(&data, input_len, ACL_MEM_MALLOC_HUGE_FIRST)));
OUTCOME_TRY(auto buffer, _p(aclCreateDataBuffer(data, input_len)));
OUTCOME_TRY(_m(aclmdlAddDatasetBuffer(input_dataset_, buffer)));
} else {
Buffers buffers{};
aclmdlIODims dims{};
OUTCOME_TRY(_m(aclmdlGetInputDims(model_desc_, i, &dims)));
input_dims_.push_back(dims);
auto data_type = aclmdlGetInputDataType(model_desc_, i);
input_data_type_.push_back(data_type);
MMDEPLOY_INFO("{}", dims);
switch (input_shape_type_) {
case kStatic: {
OUTCOME_TRY(buffers, CreateBuffers(dims, data_type));
break;
}
case kDynamicBatchSize: {
OUTCOME_TRY(buffers, CreateBuffersDynamicBatchSize(dims, data_type));
break;
}
case kDynamicImageSize: {
OUTCOME_TRY(buffers, CreateBuffersDynamicImageSize(i, dims, data_type));
break;
}
case kDynamicDims: {
OUTCOME_TRY(buffers, CreateBuffersDynamicDims(i, dim_count, dims, data_type));
break;
}
default:
return Status(eInvalidArgument);
}
OUTCOME_TRY(_m(aclmdlAddDatasetBuffer(input_dataset_, buffers.device_buffer)));
input_tensor_.push_back(std::move(buffers.host_tensor));
dim_count += dims.dimCount;
}
}
return success();
}
Result<void> AclNet::CreateOutputBuffers() {
output_dataset_ = aclmdlCreateDataset();
auto n_outputs = aclmdlGetNumOutputs(model_desc_);
std::vector<aclmdlIODims> output_dims;
for (int i = 0; i < n_outputs; ++i) {
aclmdlIODims dims{};
OUTCOME_TRY(_m(aclmdlGetOutputDims(model_desc_, i, &dims))); // return max dims
output_dims_.push_back(dims);
MMDEPLOY_INFO("{}", dims);
auto data_type = aclmdlGetOutputDataType(model_desc_, i);
output_data_type_.push_back(data_type);
OUTCOME_TRY(auto buffers, CreateBuffers(dims, data_type));
OUTCOME_TRY(_m(aclmdlAddDatasetBuffer(output_dataset_, buffers.device_buffer)));
output_tensor_.push_back(std::move(buffers.host_tensor));
}
return success();
}
Result<void> AclNet::Init(const Value& args) {
auto& context = args["context"];
cpu_stream_ = context["stream"].get<Stream>();
auto name = args["name"].get<std::string>();
auto model = context["model"].get<Model>();
device_id_ = context["device"].get<Device>().device_id();
acl_context_ = std::make_shared<Context>();
OUTCOME_TRY(auto config, model.GetModelConfig(name));
OUTCOME_TRY(auto binary, model.ReadFile(config.net));
OUTCOME_TRY(_m(aclrtSetDevice(device_id_)));
OUTCOME_TRY(_m(aclmdlLoadFromMem(binary.data(), binary.size(), &model_id_)));
model_desc_ = aclmdlCreateDesc();
OUTCOME_TRY(_m(aclmdlGetDesc(model_desc_, model_id_)));
// dynamic_tensor_index_
// input_shape_type_
// dynamic_batch_size_
// dynamic_input_dims_
if (auto r = ConfigDynamicShapes(); !r) {
MMDEPLOY_ERROR("Failed to config dynamic shapes");
return r.as_failure();
}
// input_dataset_
// input_data_type_
// input_dims_
// input_tensor_
if (auto r = CreateInputBuffers(); !r) {
MMDEPLOY_ERROR("Failed to create input buffers");
return r.as_failure();
}
// output_dataset_
// output_data_type_
// output_dims_
// output_tensor_
if (auto r = CreateOutputBuffers(); !r) {
MMDEPLOY_ERROR("Failed to create output buffers");
return r.as_failure();
}
return success();
}
Result<void> AclNet::Deinit() { return success(); }
Result<Span<Tensor>> AclNet::GetInputTensors() { return input_tensor_; }
Result<Span<Tensor>> AclNet::GetOutputTensors() { return output_tensor_; }
Result<void> AclNet::Reshape(Span<TensorShape> input_shapes) {
OUTCOME_TRY(_m(aclrtSetDevice(device_id_)));
// Sanity checks
if (input_shapes.size() != input_dims_.size()) {
MMDEPLOY_ERROR("inconsistent num inputs");
return Status(eInvalidArgument);
}
for (int i = 0; i < input_dims_.size(); ++i) {
if (input_shapes[i].size() != input_dims_[i].dimCount) {
MMDEPLOY_ERROR("inconsistent num of dims");
return Status(eInvalidArgument);
}
}
switch (input_shape_type_) {
case kStatic: {
OUTCOME_TRY(ReshapeStatic(input_shapes));
break;
}
case kDynamicBatchSize: {
OUTCOME_TRY(ReshapeDynamicBatchSize(input_shapes));
break;
}
case kDynamicImageSize: {
OUTCOME_TRY(ReshapeDynamicImageSize(input_shapes));
break;
}
case kDynamicDims: {
OUTCOME_TRY(ReshapeDynamicDims(input_shapes));
break;
}
default:
return Status(eInvalidArgument);
}
for (int i = 0; i < input_shapes.size(); ++i) {
auto buffer = input_tensor_[i].buffer();
auto desc = input_tensor_[i].desc();
desc.shape = input_shapes[i];
input_tensor_[i] = Tensor(std::move(desc), std::move(buffer));
}
for (int i = 0; i < output_dims_.size(); ++i) {
aclmdlIODims dims{};
OUTCOME_TRY(_m(aclmdlGetCurOutputDims(model_desc_, i, &dims)));
auto buffer = output_tensor_[i].buffer();
auto desc = output_tensor_[i].desc();
desc.shape = TensorShape(&dims.dims[0], &dims.dims[0] + dims.dimCount);
output_tensor_[i] = Tensor(std::move(desc), std::move(buffer));
}
return success();
}
Result<void> AclNet::ReshapeStatic(Span<TensorShape> input_shapes) {
for (int i = 0; i < input_dims_.size(); ++i) {
Span src(input_shapes[i]);
Span ref(input_dims_[i].dims, input_dims_[i].dimCount);
if (src != ref) {
MMDEPLOY_ERROR("Shape mismatch {} vs {}", src, ref);
return Status(eInvalidArgument);
}
}
return success();
}
Result<void> AclNet::ReshapeDynamicBatchSize(Span<TensorShape> input_shapes) {
int batch_size = -1;
for (int i = 0; i < input_dims_.size(); ++i) {
for (int j = 0; j < input_dims_[i].dimCount; ++j) {
if (input_dims_[i].dims[j] == -1) {
if (batch_size != -1 && batch_size != input_shapes[i][j]) {
// inconsistent batch size
return Status(eInvalidArgument);
}
batch_size = input_shapes[i][j];
}
}
}
if (batch_size < 0) {
MMDEPLOY_ERROR("unable to determine batch size");
return Status(eFail);
}
MMDEPLOY_INFO("batch size {} {}", batch_size, dynamic_tensor_index_);
auto index =
std::lower_bound(dynamic_batch_size_.begin(), dynamic_batch_size_.end(), batch_size) -
dynamic_batch_size_.begin();
if (index == dynamic_batch_size_.size()) {
MMDEPLOY_ERROR("Unsupported batch size: {}", batch_size);
return Status(eInvalidArgument);
}
// TODO: memset padding memory to avoid potential extra computation
OUTCOME_TRY(_m(aclmdlSetDynamicBatchSize(model_id_, input_dataset_, dynamic_tensor_index_,
dynamic_batch_size_[index])));
return success();
}
Result<void> AclNet::ReshapeDynamicImageSize(Span<TensorShape> input_shapes) {
uint64_t hw[2];
bool found = false;
for (int i = 0; i < input_dims_.size(); ++i) {
uint64_t tmp[2];
int ptr = 0;
for (int j = 0; j < input_dims_[i].dimCount; ++j) {
if (input_dims_[i].dims[j] == -1) {
if (ptr == 2) {
MMDEPLOY_ERROR("dynamic HW size out of bounds: {}", input_dims_[i]);
return Status(eInvalidArgument);
} else {
tmp[ptr++] = input_shapes[i][j];
}
}
}
if (ptr && ptr != 2) {
MMDEPLOY_ERROR("Partially determined dynamic HW size: {}", input_dims_[i]);
return Status(eInvalidArgument);
}
if (ptr == 2) {
if (found) {
if (hw[0] != tmp[0] || hw[1] != tmp[1]) {
MMDEPLOY_ERROR("Inconsistent dynamic HW size: ({}, {}) vs ({}, {})", hw[0], hw[1], tmp[0],
tmp[1]);
return Status(eInvalidArgument);
}
} else {
found = true;
hw[0] = tmp[0];
hw[1] = tmp[1];
}
}
}
if (!found) {
MMDEPLOY_ERROR("Unable to determine image size");
return Status(eInvalidArgument);
}
MMDEPLOY_INFO("dynamic HW size ({}, {})", hw[0], hw[1]);
OUTCOME_TRY(
_m(aclmdlSetDynamicHWSize(model_id_, input_dataset_, dynamic_tensor_index_, hw[0], hw[1])));
return success();
}
Result<void> AclNet::ReshapeDynamicDims(Span<TensorShape> input_shapes) {
std::vector<int> match(dynamic_input_dims_.size(), 1);
aclmdlIODims dims{};
for (int i = 0; i < input_shapes.size(); ++i) {
const auto& shape = input_shapes[i];
for (int j = 0; j < shape.size(); ++j) {
if (input_dims_[i].dims[j] == -1) {
for (int k = 0; k < dynamic_input_dims_.size(); ++k) {
// disable profile when dims mismatch, except for the first dim (batch size)
if (j == 0 && shape[j] < dynamic_input_dims_[k].dims[dims.dimCount]) {
// pass
} else if (shape[j] != dynamic_input_dims_[k].dims[dims.dimCount]) {
match[k] = 0;
}
}
} else {
if (input_dims_[i].dims[j] != shape[j]) {
return Status(eNotSupported);
}
}
dims.dims[dims.dimCount++] = shape[j];
}
}
int dims_index = std::find(match.begin(), match.end(), 1) - match.begin();
if (dims_index == match.size()) {
MMDEPLOY_ERROR("Shape not supported: {}", dims);
return Status(eNotSupported);
}
// TODO: memset padding memory to avoid potential extra computation
OUTCOME_TRY(_m(aclmdlSetInputDynamicDims(model_id_, input_dataset_, dynamic_tensor_index_,
&dynamic_input_dims_[dims_index])));
return success();
}
Result<void> AclNet::Forward() {
OUTCOME_TRY(cpu_stream_.Wait());
OUTCOME_TRY(_m(aclrtSetDevice(device_id_)));
for (int i = 0; i < input_tensor_.size(); ++i) {
auto buffer = aclmdlGetDatasetBuffer(input_dataset_, i);
auto buffer_size = aclGetDataBufferSizeV2(buffer);
auto buffer_data = aclGetDataBufferAddr(buffer);
auto host_ptr = input_tensor_[i].data();
OUTCOME_TRY(_m(aclrtMemcpy(buffer_data, buffer_size, host_ptr, input_tensor_[i].byte_size(),
ACL_MEMCPY_HOST_TO_DEVICE)));
}
OUTCOME_TRY(_m(aclmdlExecute(model_id_, input_dataset_, output_dataset_)));
for (int i = 0; i < output_tensor_.size(); ++i) {
auto buffer = aclmdlGetDatasetBuffer(output_dataset_, i);
auto buffer_data = aclGetDataBufferAddr(buffer);
auto host_ptr = output_tensor_[i].data();
OUTCOME_TRY(_m(aclrtMemcpy(host_ptr, output_tensor_[i].byte_size(), buffer_data,
output_tensor_[i].byte_size(), ACL_MEMCPY_DEVICE_TO_HOST)));
}
return success();
}
Result<void> AclNet::ForwardAsync(Event* event) { return Status(eNotSupported); }
class AclNetCreator : public Creator<Net> {
public:
const char* GetName() const override { return "ascend"; }
int GetVersion() const override { return 0; }
std::unique_ptr<Net> Create(const Value& args) override {
try {
auto p = std::make_unique<AclNet>();
if (auto r = p->Init(args)) {
return p;
} else {
MMDEPLOY_ERROR("error creating AclNet: {}", r.error().message().c_str());
return nullptr;
}
} catch (const std::exception& e) {
MMDEPLOY_ERROR("unhandled exception when creating AclNet: {}", e.what());
return nullptr;
}
}
};
REGISTER_MODULE(Net, AclNetCreator);
} // namespace mmdeploy
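
ReshapeDynamicBatchSize picks the smallest profiled batch gear that can hold the requested batch. The selection step in isolation:

```cpp
// Standalone illustration of the gear selection in ReshapeDynamicBatchSize above:
// dynamic_batch_size_ is kept sorted and the smallest gear >= the requested batch
// is chosen; the extra padded slots are simply unused.
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<size_t> gears{1, 2, 4, 8};  // sorted dynamic batch gears
  size_t requested = 3;
  auto it = std::lower_bound(gears.begin(), gears.end(), requested);
  assert(it != gears.end() && *it == 4);  // the model is executed with batch 4
  return 0;
}
```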

View File

@ -0,0 +1,70 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_NET_ACL_ACL_NET_H_
#define MMDEPLOY_SRC_NET_ACL_ACL_NET_H_
#include "acl/acl.h"
#include "mmdeploy/core/net.h"
#include "mmdeploy/core/status_code.h"
namespace mmdeploy {
class AclNet : public Net {
public:
~AclNet() override;
Result<void> Init(const Value& cfg) override;
Result<void> Deinit() override;
Result<Span<Tensor>> GetInputTensors() override;
Result<Span<Tensor>> GetOutputTensors() override;
Result<void> Reshape(Span<TensorShape> input_shapes) override;
Result<void> Forward() override;
Result<void> ForwardAsync(Event* event) override;
private:
enum InputShapeType { kStatic, kDynamicBatchSize, kDynamicImageSize, kDynamicDims };
Result<void> ReshapeStatic(Span<TensorShape> input_shapes);
Result<void> ReshapeDynamicBatchSize(Span<TensorShape> input_shapes);
Result<void> ReshapeDynamicImageSize(Span<TensorShape> input_shapes);
Result<void> ReshapeDynamicDims(Span<TensorShape> input_shapes);
struct Buffers {
aclDataBuffer* device_buffer;
Tensor host_tensor;
};
Result<Buffers> CreateBuffers(const aclmdlIODims& dims, aclDataType data_type);
Result<Buffers> CreateBuffersDynamicBatchSize(aclmdlIODims dims, aclDataType data_type);
Result<Buffers> CreateBuffersDynamicImageSize(int index, aclmdlIODims dims,
aclDataType data_type);
Result<Buffers> CreateBuffersDynamicDims(int index, int dim_count, const aclmdlIODims& dims,
aclDataType data_type);
Result<void> ConfigDynamicShapes();
Result<void> CreateInputBuffers();
Result<void> CreateOutputBuffers();
std::shared_ptr<void> acl_context_;
Stream cpu_stream_;
int32_t device_id_{0};
uint32_t model_id_{(uint32_t)-1};
aclmdlDesc* model_desc_{nullptr};
int dynamic_tensor_index_{-1};
InputShapeType input_shape_type_{kStatic};
std::vector<size_t> dynamic_batch_size_;
std::vector<aclmdlIODims> dynamic_input_dims_;
aclmdlDataset* input_dataset_{nullptr};
aclmdlDataset* output_dataset_{nullptr};
std::vector<aclmdlIODims> input_dims_;
std::vector<aclmdlIODims> output_dims_;
std::vector<aclDataType> input_data_type_;
std::vector<aclDataType> output_data_type_;
std::vector<Tensor> input_tensor_;
std::vector<Tensor> output_tensor_;
};
} // namespace mmdeploy
#endif // MMDEPLOY_SRC_NET_ACL_ACL_NET_H_

View File

@ -0,0 +1,14 @@
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_coreml_net)
if ("cpu" IN_LIST MMDEPLOY_TARGET_DEVICES)
find_library(CORE_ML CoreML)
find_library(FOUNDATION Foundation)
mmdeploy_add_module(${PROJECT_NAME} coreml_net.mm)
target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${PROJECT_NAME} PRIVATE ${CORE_ML} ${FOUNDATION})
add_library(mmdeploy::coreml_net ALIAS ${PROJECT_NAME})
else ()
message(FATAL_ERROR "'coreml_net' is NOT supported in target devices: ${MMDEPLOY_TARGET_DEVICES}")
endif ()

View File

@ -0,0 +1,37 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_NET_COREML_COREML_NET_H_
#define MMDEPLOY_SRC_NET_COREML_COREML_NET_H_
#include "mmdeploy/core/net.h"
namespace mmdeploy {
namespace coreml {
class Execution;
} // namespace coreml
class CoreMLNet : public Net {
public:
~CoreMLNet() override = default;
Result<void> Init(const Value& cfg) override;
Result<void> Deinit() override;
Result<Span<Tensor>> GetInputTensors() override;
Result<Span<Tensor>> GetOutputTensors() override;
Result<void> Reshape(Span<TensorShape> input_shapes) override;
Result<void> Forward() override;
Result<void> ForwardAsync(Event* event) override;
private:
std::unique_ptr<coreml::Execution> execution_;
std::vector<Tensor> input_tensors_;
std::vector<Tensor> output_tensors_;
Device device_;
Stream stream_;
friend class coreml::Execution;
};
} // namespace mmdeploy
#endif  // MMDEPLOY_SRC_NET_COREML_COREML_NET_H_

View File

@ -0,0 +1,326 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "coreml_net.h"
#include "mmdeploy/core/model.h"
#include "mmdeploy/core/status_code.h"
#include "mmdeploy/core/utils/filesystem.h"
#include <fstream>
#import <CoreML/CoreML.h>
#import <Foundation/Foundation.h>
#include <memory>
@interface MMBatchTensorFeatureProvider : NSObject <MLBatchProvider> {
const std::vector<mmdeploy::Tensor> *inputs_;
}
- (instancetype)initWithInputs:(const std::vector<mmdeploy::Tensor> &)inputs;
- (NSInteger)count;
- (id<MLFeatureProvider>)featuresAtIndex:(NSInteger)index;
@end
@implementation MMBatchTensorFeatureProvider
- (instancetype)initWithInputs:(const std::vector<mmdeploy::Tensor> &)inputs {
inputs_ = &inputs;
return self;
}
- (NSInteger)count {
return (*inputs_)[0].shape(0);
}
- (id<MLFeatureProvider>)featuresAtIndex:(NSInteger)index {
MLDictionaryFeatureProvider *feature = nil;
NSMutableDictionary<NSString *, id> *input_dict =
[[NSMutableDictionary<NSString *, id> alloc] init];
for (auto x : *inputs_) {
auto in = x.Slice(index);
NSMutableArray *shape = [[NSMutableArray alloc] init];
for (const auto dim : in.shape()) {
[shape addObject:[NSNumber numberWithLongLong:dim]];
}
NSMutableArray *strides = [[NSMutableArray alloc] init];
int64_t stride = 1;
for (int i = in.shape().size() - 1; i >= 0; i--) {
[strides insertObject:[NSNumber numberWithLongLong:stride] atIndex:0];
stride *= in.shape()[i];
}
MLMultiArrayDataType data_type = MLMultiArrayDataTypeFloat32;
NSError *error = nil;
MLMultiArray *mlArray =
[[MLMultiArray alloc] initWithDataPointer:in.data()
shape:shape
dataType:data_type
strides:strides
deallocator:(^(void *){
})error:&error];
if (error != nil) {
MMDEPLOY_ERROR("init MLMultiArray failed with key: {}, error message: {}",
in.name(), [[error localizedDescription] UTF8String]);
return nil;
}
NSString *key = [NSString stringWithUTF8String:in.name()];
input_dict[key] = mlArray;
}
NSError *error = nil;
feature = [[MLDictionaryFeatureProvider alloc] initWithDictionary:input_dict
error:&error];
if (error != nil) {
MMDEPLOY_ERROR("init MLDictionaryFeatureProvider failed with index: {}, "
"error message: {}",
index, [[error localizedDescription] UTF8String]);
return nil;
}
return feature;
}
@end
namespace mmdeploy {
namespace coreml {
static Result<void> CheckInputOutputFeatureType(MLFeatureType type) {
if (type != MLFeatureTypeMultiArray) {
MMDEPLOY_ERROR("unsupported feature type: {}", type);
return Status(eInvalidArgument);
}
return success();
}
static TensorShape to_shape(NSArray<NSNumber *> *shape) {
TensorShape _shape;
for (int i = 0; i < shape.count; i++) {
_shape.push_back(shape[i].intValue);
}
return _shape;
}
static Result<DataType> ConvertElementType(MLMultiArrayDataType type) {
switch (type) {
case MLMultiArrayDataTypeFloat32:
return DataType::kFLOAT;
case MLMultiArrayDataTypeFloat16:
return DataType::kHALF;
case MLMultiArrayDataTypeInt32:
return DataType::kINT32;
default:
MMDEPLOY_ERROR("unsupported MLMultiArrayDataType: {}",
static_cast<int>(type));
return Status(eNotSupported);
}
}
static Result<Tensor> AsTensor(MLMultiArray *mlArray, const Device &device) {
TensorDesc desc;
desc.device = device;
desc.shape = to_shape(mlArray.shape);
OUTCOME_TRY(desc.data_type, ConvertElementType(mlArray.dataType));
std::shared_ptr<void> data(const_cast<void *>(mlArray.dataPointer),
[](void *) {});
return Tensor(desc, data);
}
class Execution {
public:
Execution(const std::string &path, CoreMLNet *net) : path_(path), net_(net) {}
~Execution() { RemoveModel(); }
Result<void> Init() {
OUTCOME_TRY(LoadModel());
OUTCOME_TRY(SetInputOutputTensor());
return success();
}
Result<void> Forward() {
int batch_size = net_->input_tensors_[0].shape(0);
// prepare input
NSError *error = nil;
MMBatchTensorFeatureProvider *input_feature =
[[MMBatchTensorFeatureProvider alloc]
initWithInputs:net_->input_tensors_];
id<MLBatchProvider> output_feature =
[model_ predictionsFromBatch:input_feature error:&error];
if (error != nil) {
MMDEPLOY_ERROR("coreml forward failed, error message: {}",
[[error localizedDescription] UTF8String]);
return Status(eFail);
}
// extract output
for (size_t i = 0; i < net_->output_tensors_.size(); ++i) {
auto &out = net_->output_tensors_[i];
for (int bid = 0; bid < output_feature.count; bid++) {
NSString *name =
[NSString stringWithCString:out.name()
encoding:[NSString defaultCStringEncoding]];
if (name == nil) {
MMDEPLOY_ERROR("output name must not be nil");
return Status(eFail);
}
MLFeatureValue *output_value =
[[output_feature featuresAtIndex:bid] featureValueForName:name];
if (output_value == nil) {
MMDEPLOY_ERROR("model output doesn't have name tensort: {}",
out.name());
return Status(eFail);
}
MLMultiArray *mlArray = [output_value multiArrayValue];
OUTCOME_TRY(auto tmp, AsTensor(mlArray, out.device()));
if (bid == 0) {
TensorShape batch_shape = tmp.shape();
batch_shape[0] = batch_size;
out.Reshape(batch_shape);
}
auto slice = out.Slice(bid);
OUTCOME_TRY(tmp.CopyTo(slice, net_->stream_));
}
}
return success();
}
Result<void> SetInputOutputTensor() {
// input
auto input_desc = model_.modelDescription.inputDescriptionsByName;
for (NSString *name in input_desc) {
MLFeatureDescription *value = input_desc[name];
OUTCOME_TRY(CheckInputOutputFeatureType(value.type));
// use default shape
auto shape = to_shape(value.multiArrayConstraint.shape);
OUTCOME_TRY(auto data_type,
ConvertElementType(value.multiArrayConstraint.dataType));
net_->input_tensors_.emplace_back(
TensorDesc{net_->device_, data_type, shape, [name UTF8String]});
}
// output
auto output_desc = model_.modelDescription.outputDescriptionsByName;
for (NSString *name in output_desc) {
MLFeatureDescription *value = output_desc[name];
OUTCOME_TRY(auto data_type,
ConvertElementType(value.multiArrayConstraint.dataType));
// can't get output shape
net_->output_tensors_.emplace_back(
TensorDesc{net_->device_, data_type, {}, [name UTF8String]});
}
return success();
}
Result<void> Reshape(Span<TensorShape> input_shapes) {
for (size_t i = 0; i < input_shapes.size(); ++i) {
net_->input_tensors_[i].Reshape(input_shapes[i]);
}
return success();
}
Result<void> LoadModel() {
NSString *model_path = [NSString stringWithUTF8String:path_.c_str()];
NSError *error = nil;
NSURL *model_url = [NSURL URLWithString:model_path];
compiled_model_url_ = [MLModel compileModelAtURL:model_url error:&error];
if (error != nil) {
MMDEPLOY_ERROR("failed to compile model, error message: {}",
[[error localizedDescription] UTF8String]);
return Status(eFail);
}
MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
config.computeUnits = MLComputeUnitsAll;
model_ = [MLModel modelWithContentsOfURL:compiled_model_url_
configuration:config
error:&error];
if (error != nil) {
MMDEPLOY_ERROR("failed to construct model, error message: {}",
[[error localizedDescription] UTF8String]);
return Status(eFail);
}
return success();
}
void RemoveModel() {
NSError *error = nil;
if (compiled_model_url_ != nil) {
[[NSFileManager defaultManager] removeItemAtURL:compiled_model_url_
error:&error];
if (error != nil) {
MMDEPLOY_ERROR("failed to remove compiled model, error message: {}",
[[error localizedDescription] UTF8String]);
}
compiled_model_url_ = nil;
}
}
NSURL *compiled_model_url_{nil};
MLModel *model_{nil};
std::string path_;
CoreMLNet *net_{nullptr};
};
} // namespace coreml
Result<void> CoreMLNet::Init(const Value &cfg) {
auto &context = cfg["context"];
device_ = context["device"].get<Device>();
stream_ = context["stream"].get<Stream>();
auto name = cfg["name"].get<std::string>();
auto model = context["model"].get<Model>();
OUTCOME_TRY(auto config, model.GetModelConfig(name));
std::string coreml_tmp_path =
(fs::path(model.GetModelPath()) / config.net).string();
execution_ = std::make_unique<coreml::Execution>(coreml_tmp_path, this);
OUTCOME_TRY(execution_->Init());
return success();
}
Result<void> CoreMLNet::Deinit() { return success(); }
Result<Span<Tensor>> CoreMLNet::GetInputTensors() { return input_tensors_; }
Result<Span<Tensor>> CoreMLNet::GetOutputTensors() { return output_tensors_; }
Result<void> CoreMLNet::Reshape(Span<TensorShape> input_shapes) {
return execution_->Reshape(input_shapes);
}
Result<void> CoreMLNet::Forward() { return execution_->Forward(); }
Result<void> CoreMLNet::ForwardAsync(Event *event) {
return Status(eNotSupported);
}
class CoreMLNetCreator : public Creator<Net> {
public:
const char *GetName() const override { return "coreml"; }
int GetVersion() const override { return 0; }
std::unique_ptr<Net> Create(const Value &args) override {
auto p = std::make_unique<CoreMLNet>();
if (auto r = p->Init(args)) {
return p;
} else {
MMDEPLOY_ERROR("error creating CoreMLNet: {}",
r.error().message().c_str());
return nullptr;
}
}
};
REGISTER_MODULE(Net, CoreMLNetCreator);
} // namespace mmdeploy
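
featuresAtIndex builds C-contiguous strides for each MLMultiArray from the sliced tensor's shape. The same computation in isolation:

```cpp
// The stride computation used in featuresAtIndex above, in isolation:
// row-major (C-contiguous) strides derived from the tensor shape.
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> shape{1, 3, 224, 224};
  std::vector<int64_t> strides(shape.size());
  int64_t stride = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
    strides[i] = stride;
    stride *= shape[i];
  }
  // strides == {150528, 50176, 224, 1}
  return 0;
}
```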

View File

@ -14,7 +14,11 @@ if (PPLNN_USE_CUDA AND ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES))
target_compile_definitions(${PROJECT_NAME} PRIVATE -DPPL_NN_HAS_CUDA=1)
target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/include)
target_link_directories(${PROJECT_NAME} PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
target_link_libraries(${PROJECT_NAME} PRIVATE nvrtc)
endif ()
target_link_libraries(${PROJECT_NAME}
PRIVATE ${PPLNN_LIBRARIES} nvrtc)
PRIVATE ${PPLNN_LIBRARIES})
if (PPLNN_USE_RISCV AND ("cpu" IN_LIST MMDEPLOY_TARGET_DEVICES))
target_compile_definitions(${PROJECT_NAME} PRIVATE -DPPL_NN_HAS_RISCV=1)
endif ()
add_library(mmdeploy::pplnn_net ALIAS ${PROJECT_NAME})

View File

@ -18,6 +18,11 @@
#include "ppl/nn/engines/cuda/ops.h"
#define PPL_CUDA_IMPORT_FROM_BUFFER 1
#endif
#if PPL_NN_HAS_RISCV
#include "ppl/nn/engines/riscv/engine_factory.h"
#include "ppl/nn/engines/riscv/engine_options.h"
#include "ppl/nn/engines/riscv/ops.h"
#endif
namespace mmdeploy {
@ -92,6 +97,18 @@ Result<void> PPLNet::Init(const Value& args) {
engines_.emplace_back(ppl::nn::x86::EngineFactory::Create({}));
}
#endif
#if PPL_NN_HAS_RISCV
if (device_.is_host()) {
ppl::nn::riscv::RegisterBuiltinOpImpls();
ppl::nn::riscv::EngineOptions options{};
// TODO:
// FP16 -> postprocess
options.forward_precision = ppl::common::DATATYPE_FLOAT32;
options.dynamic_tuning_level = 0;
options.winograd_level = 1;
engines_.emplace_back(ppl::nn::riscv::EngineFactory::Create(options));
}
#endif
std::vector<ppl::nn::Engine*> engines;
for (const auto& engine : engines_) {

View File

@ -0,0 +1,28 @@
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_torch_net)
option(MMDEPLOY_TORCHSCRIPT_SDK_BACKEND "Build TorchScript SDK backend" OFF)
if (MMDEPLOY_TORCHSCRIPT_SDK_BACKEND)
find_package(Torch REQUIRED)
find_package(TorchVision QUIET)
mmdeploy_add_module(${PROJECT_NAME} torch_net.cpp)
target_link_libraries(${PROJECT_NAME} PRIVATE
${TORCH_LIBRARIES})
target_link_directories(${PROJECT_NAME} INTERFACE
$<BUILD_INTERFACE:${Torch_DIR}/../../../lib>)
target_link_libraries(${PROJECT_NAME} PRIVATE
mmdeploy_torchscript_ops_obj)
if (TorchVision_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE TorchVision::TorchVision)
target_compile_definitions(${PROJECT_NAME} PRIVATE -DMMDEPLOY_USE_TORCHVISION=1)
endif ()
add_library(mmdeploy::torch_net ALIAS ${PROJECT_NAME})
endif ()

View File

@ -0,0 +1,237 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/net/torchscript/torch_net.h"
#include "mmdeploy/core/model.h"
#include "mmdeploy/core/utils/formatter.h"
#include "torch/torch.h"
#if MMDEPLOY_USE_CUDA
#include "c10/cuda/CUDAGuard.h"
#include "c10/cuda/CUDAStream.h"
#endif
#if MMDEPLOY_USE_TORCHVISION
#include "torchvision/vision.h"
MMDEPLOY_API void _mmdeploy_force_link_torchvision() { vision::detail::_register_ops(); }
#endif
namespace mmdeploy {
namespace {
class InferenceMode {
#if TORCH_VERSION_MAJOR == 1 && TORCH_VERSION_MINOR >= 10
c10::InferenceMode guard_;
#else
at::AutoNonVariableTypeMode guard_;
#endif
};
class StreamGuard {
public:
StreamGuard(const torch::Device& device, Stream stream)
: device_(device), stream_(std::move(stream)), device_guard_(device) {
stream_.Wait().value();
}
~StreamGuard() {
#if MMDEPLOY_USE_CUDA
auto device = stream_.GetDevice();
if (device.is_device()) {
Stream stream(device, (cudaStream_t)c10::cuda::getCurrentCUDAStream(device_.index()));
stream.Wait().value();
}
#endif
}
private:
torch::Device device_;
Stream stream_;
c10::DeviceGuard device_guard_;
};
Result<torch::ScalarType> FromDataType(DataType data_type) {
switch (data_type) {
case DataType::kFLOAT:
return torch::ScalarType::Float;
case DataType::kHALF:
return torch::ScalarType::Half;
case DataType::kINT32:
return torch::ScalarType::Int;
case DataType::kINT64:
return torch::ScalarType::Long;
case DataType::kINT8:
return torch::ScalarType::Char;
default:
MMDEPLOY_ERROR("Unsupported mmdeploy::DataType: {}", to_string(data_type));
return Status(eNotSupported);
}
}
Result<DataType> ToDataType(torch::ScalarType scalar_type) {
switch (scalar_type) {
case torch::ScalarType::Float:
return DataType::kFLOAT;
case torch::ScalarType::Half:
return DataType::kHALF;
case torch::ScalarType::Int:
return DataType::kINT32;
case torch::ScalarType::Long:
return DataType::kINT64;
case torch::ScalarType::Char:
return DataType::kINT8;
default:
MMDEPLOY_ERROR("Unsupported torch::ScalarType: {}", toString(scalar_type));
return Status(eNotSupported);
}
}
} // namespace
TorchNet::~TorchNet() = default;
Result<void> TorchNet::Init(const Value& cfg) {
auto& context = cfg["context"];
device_ = context["device"].get<Device>();
stream_ = context["stream"].get<Stream>();
auto name = cfg["name"].get<std::string>();
auto model = context["model"].get<Model>();
OUTCOME_TRY(auto config, model.GetModelConfig(name));
OUTCOME_TRY(auto bytes, model.ReadFile(config.net));
auto platform = Platform(device_.platform_id());
auto device_name = platform.GetPlatformName();
try {
{
using namespace std::string_literals;
if (device_name == "cpu"s) {
torch_device_ = torch::Device(device_name);
} else {
torch_device_ = torch::Device(device_name + ":"s + std::to_string(device_.device_id()));
}
}
std::istringstream iss(bytes);
InferenceMode guard;
module_ = torch::jit::load(iss);
module_.eval();
module_.to(*torch_device_);
auto forward = module_.get_method("forward");
auto ToDesc = [&](torch::jit::Value* value, const char* type, int index) {
MMDEPLOY_INFO("Found {}: {}", type, value->debugNameBase());
return TensorDesc{device_, DataType::kFLOAT, {}, "#" + std::to_string(index)};
};
auto inputs = forward.graph()->inputs();
int input_count = 0;
for (int i = 1; i < inputs.size(); ++i) {
if (inputs[i]->type()->kind() == c10::TypeKind::TensorType) {
input_tensor_.emplace_back(ToDesc(inputs[i], "input", input_count++));
} else {
MMDEPLOY_ERROR("Unsupported input type: {}", typeKindToString(inputs[i]->type()->kind()));
return Status(eNotSupported);
}
}
auto outputs = forward.graph()->outputs();
int output_count = 0;
for (const auto& output : outputs) {
auto kind = output->type()->kind();
if (kind == c10::TypeKind::TensorType) {
output_tensor_.emplace_back(ToDesc(output, "output", output_count++));
} else if (output->type()->kind() == c10::TypeKind::TupleType) {
for (const auto& v : output->node()->inputs()) {
if (v->type()->kind() == c10::TypeKind::TensorType) {
output_tensor_.emplace_back(ToDesc(v, "output", output_count++));
} else {
MMDEPLOY_ERROR("Unsupported output type: {}", typeKindToString(v->type()->kind()));
return Status(eNotSupported);
}
}
} else {
MMDEPLOY_ERROR("Unsupported output type: {}", typeKindToString(kind));
return Status(eNotSupported);
}
}
return success();
} catch (const std::exception& e) {
MMDEPLOY_ERROR("unhandled exception: {}", e.what());
return Status(eFail);
}
}
Result<void> TorchNet::Deinit() { return success(); }
Result<Span<Tensor>> TorchNet::GetInputTensors() { return input_tensor_; }
Result<Span<Tensor>> TorchNet::GetOutputTensors() { return output_tensor_; }
Result<void> TorchNet::Reshape(Span<TensorShape> input_shapes) {
if (input_shapes.size() != input_tensor_.size()) {
return Status(eInvalidArgument);
}
for (size_t i = 0; i < input_shapes.size(); ++i) {
input_tensor_[i].Reshape(input_shapes[i]);
}
return success();
}
Result<void> TorchNet::Forward() {
try {
StreamGuard stream_guard(*torch_device_, stream_);
InferenceMode inference_guard;
std::vector<torch::jit::IValue> inputs;
for (auto& v : input_tensor_) {
OUTCOME_TRY(auto data_type, FromDataType(v.data_type()));
auto tensor = torch::from_blob(v.data(), v.shape(),
c10::TensorOptions(*torch_device_).dtype(data_type));
inputs.emplace_back(tensor);
}
auto outputs = module_.forward(inputs);
if (outputs.isTensor()) {
OUTCOME_TRY(output_tensor_[0], FromTorchTensor(outputs.toTensor(), output_tensor_[0].name()));
} else if (outputs.isTuple()) {
auto tuple = outputs.toTuple();
size_t index = 0;
for (const auto& x : tuple->elements()) {
OUTCOME_TRY(output_tensor_[index],
FromTorchTensor(x.toTensor(), output_tensor_[index].name()));
++index;
}
} else {
MMDEPLOY_ERROR("{}", toString(outputs.type()));
return Status(eNotSupported);
}
} catch (const std::exception& e) {
MMDEPLOY_ERROR("unhandled exception: {}", e.what());
return Status(eFail);
}
return success();
}
Result<void> TorchNet::ForwardAsync(Event* event) { return success(); }
Result<Tensor> TorchNet::FromTorchTensor(const torch::Tensor& tensor, const std::string& name) {
OUTCOME_TRY(auto data_type, ToDataType(tensor.scalar_type()));
auto shape = tensor.sizes();
TensorDesc desc{device_, data_type, {shape.begin(), shape.end()}, name};
return Tensor(desc, std::shared_ptr<void>(tensor.data_ptr(), [tensor](auto) {}));
}
class TorchNetCreator : public Creator<Net> {
public:
const char* GetName() const override { return "torchscript"; }
std::unique_ptr<Net> Create(const Value& cfg) override {
auto p = std::make_unique<TorchNet>();
if (auto status = p->Init(cfg)) {
return p;
} else {
MMDEPLOY_ERROR("Failed to created TorchNet with config: {}", cfg);
}
return nullptr;
}
};
REGISTER_MODULE(Net, TorchNetCreator);
} // namespace mmdeploy
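
TorchNet implements the same Net interface as the other backends added in this change, so callers drive it through Reshape/GetInputTensors/Forward/GetOutputTensors. A hedged usage sketch against that interface (how the net instance is created from the registry and how inputs are filled is omitted; the implicit vector-to-Span conversion is the one already relied on by GetInputTensors above):

```cpp
// Hedged sketch of driving a Net (such as TorchNet above) through its interface.
#include <vector>
#include "mmdeploy/core/net.h"
#include "mmdeploy/core/status_code.h"

mmdeploy::Result<void> RunOnce(mmdeploy::Net& net) {
  using namespace mmdeploy;
  std::vector<TensorShape> shapes{{1, 3, 224, 224}};
  OUTCOME_TRY(net.Reshape(shapes));  // Span<TensorShape> from the vector
  OUTCOME_TRY(auto inputs, net.GetInputTensors());
  // ... fill inputs[0], e.g. via Tensor::CopyFrom(host_ptr, stream) ...
  OUTCOME_TRY(net.Forward());
  OUTCOME_TRY(auto outputs, net.GetOutputTensors());
  (void)outputs;
  return success();
}
```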

View File

@ -0,0 +1,35 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CSRC_MMDEPLOY_NET_TORCHSCRIPT_TORCH_NET_H_
#define MMDEPLOY_CSRC_MMDEPLOY_NET_TORCHSCRIPT_TORCH_NET_H_
#include "mmdeploy/core/net.h"
#include "torch/script.h"
namespace mmdeploy {
class TorchNet : public Net {
public:
~TorchNet() override;
Result<void> Init(const Value& cfg) override;
Result<void> Deinit() override;
Result<Span<Tensor>> GetInputTensors() override;
Result<Span<Tensor>> GetOutputTensors() override;
Result<void> Reshape(Span<TensorShape> input_shapes) override;
Result<void> Forward() override;
Result<void> ForwardAsync(Event* event) override;
private:
Result<Tensor> FromTorchTensor(const torch::Tensor& tensor, const std::string& name);
torch::jit::script::Module module_;
std::vector<Tensor> input_tensor_;
std::vector<Tensor> output_tensor_;
Device device_;
Stream stream_;
std::optional<torch::Device> torch_device_;
};
} // namespace mmdeploy
#endif // MMDEPLOY_CSRC_MMDEPLOY_NET_TORCHSCRIPT_TORCH_NET_H_

View File

@ -4,6 +4,9 @@ project(mmdeploy_transform_module)
add_subdirectory(transform)
add_subdirectory(cpu)
if (MMDEPLOY_ELENA_FUSION)
add_subdirectory(elena)
endif ()
if ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES)
add_subdirectory(cuda)
endif ()


@ -5,6 +5,7 @@ project(mmdeploy_cuda_transform_impl CUDA CXX)
find_package(pplcv REQUIRED)
set(SRCS
collect_impl.cpp
crop_impl.cpp
image2tensor_impl.cpp
default_format_bundle_impl.cpp


@ -0,0 +1,28 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/preprocess/transform/collect.h"
namespace mmdeploy {
namespace cuda {
class CollectImpl : public ::mmdeploy::CollectImpl {
public:
CollectImpl(const Value& args) : ::mmdeploy::CollectImpl(args) {}
~CollectImpl() = default;
};
class CollectImplCreator : public Creator<::mmdeploy::CollectImpl> {
public:
const char* GetName() const override { return "cuda"; }
int GetVersion() const override { return 1; }
std::unique_ptr<::mmdeploy::CollectImpl> Create(const Value& args) override {
return std::make_unique<CollectImpl>(args);
}
};
} // namespace cuda
} // namespace mmdeploy
using mmdeploy::CollectImpl;
using mmdeploy::cuda::CollectImplCreator;
REGISTER_MODULE(CollectImpl, CollectImplCreator);
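The file above follows the backend-impl pattern used across the preprocessing module: a device-specific impl plus a Creator registered under a platform name ("cuda" here), so the matching implementation can be selected by name at run time. The sketch below shows the general shape of such a name-keyed factory; it is a simplified stand-in with assumed names (TransformRegistry, CudaCollect), not the actual mmdeploy registry API:

#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Transform {
  virtual ~Transform() = default;
  virtual void Apply() = 0;
};

// Minimal name-keyed factory; the real registry additionally tracks a version.
class TransformRegistry {
 public:
  using Factory = std::function<std::unique_ptr<Transform>()>;
  static TransformRegistry& Get() {
    static TransformRegistry instance;
    return instance;
  }
  void Add(const std::string& name, Factory f) { factories_[name] = std::move(f); }
  std::unique_ptr<Transform> Create(const std::string& name) const {
    auto it = factories_.find(name);
    return it == factories_.end() ? nullptr : it->second();
  }

 private:
  std::map<std::string, Factory> factories_;
};

struct CudaCollect : Transform {
  void Apply() override { std::cout << "collect running on cuda\n"; }
};

int main() {
  TransformRegistry::Get().Add("cuda", [] { return std::make_unique<CudaCollect>(); });
  if (auto t = TransformRegistry::Get().Create("cuda")) t->Apply();  // picked by platform name
}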


@ -0,0 +1,31 @@
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_elena_transform_impl)
set(SRCS
crop_impl.cpp
collect_impl.cpp
image2tensor_impl.cpp
default_format_bundle_impl.cpp
load_impl.cpp
normalize_impl.cpp
pad_impl.cpp
resize_impl.cpp
elena_registry.cpp)
file(GLOB CPU_KERNEL_SRCS "cpu_kernel/*.cpp")
set(ALL_SRCS ${SRCS} ${CPU_KERNEL_SRCS})
if ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES)
file(GLOB CUDA_KERNEL_SRCS "cuda_kernel/*.cu")
set(ALL_SRCS ${ALL_SRCS} ${CUDA_KERNEL_SRCS})
endif ()
mmdeploy_add_module(${PROJECT_NAME} "${ALL_SRCS}")
target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${PROJECT_NAME}
PRIVATE mmdeploy::transform)
if ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES)
target_link_libraries(${PROJECT_NAME} PRIVATE cuda)
endif ()
add_library(mmdeploy::transform_impl::elena ALIAS ${PROJECT_NAME})


@ -0,0 +1,145 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include <algorithm>
#include <array>
#include <cassert>
#include <iostream>
#include <set>
#include <string>
#include "elena_registry.h"
#include "mmdeploy/archive/json_archive.h"
#include "mmdeploy/core/mat.h"
#include "mmdeploy/core/tensor.h"
#include "mmdeploy/core/utils/device_utils.h"
#include "mmdeploy/core/utils/formatter.h"
#include "mmdeploy/preprocess/transform/collect.h"
#include "mmdeploy/preprocess/transform/tracer.h"
namespace mmdeploy {
namespace elena {
using namespace trace;
struct ExtractTransParamVisitor {
bool valid{true};
std::set<std::string> st;
std::array<float, 3> mean;
std::array<float, 3> std;
std::array<int, 2> resize_hw;
std::string resize_mode;
float pad_val;
std::array<int, 4> pad_tlbr;
std::array<int, 2> pad_hw;
std::array<int, 4> crop_tlbr;
std::array<int, 2> crop_hw;
void CheckValid(const std::string& name) {
if (st.count(name)) {
valid = false;
return;
}
st.insert(name);
}
void operator()(CvtColorParam&) {}
void operator()(CastParam&) {}
void operator()(HWC2CHWParam&) {}
void operator()(ResizeParam& param) {
CheckValid("Resize");
resize_hw = {param.size[0], param.size[1]};
resize_mode = param.mode;
}
void operator()(PadParam& param) {
CheckValid("Pad");
pad_val = param.pad_val;
std::copy_n(param.tlbr.begin(), 4, pad_tlbr.begin());
std::copy_n(param.size.begin(), 2, pad_hw.begin());
}
void operator()(NormParam& param) {
CheckValid("Normalize");
std::copy(param.mean.begin(), param.mean.end(), mean.begin());
std::copy(param.std.begin(), param.std.end(), std.begin());
}
void operator()(CropParam& param) {
CheckValid("CenterCrop");
std::copy_n(param.tlbr.begin(), 4, crop_tlbr.begin());
std::copy_n(param.size.begin(), 2, crop_hw.begin());
}
};
class CollectImpl : public ::mmdeploy::CollectImpl {
public:
CollectImpl(const Value& args) : ::mmdeploy::CollectImpl(args) {
Platform platform(device_.platform_id());
device_name_ = platform.GetPlatformName();
sha256_ = args["context"].value("sha256", std::string(""));
}
~CollectImpl() = default;
Result<Value> Process(const Value& input) override {
auto tracer = input["__tracer__"].get<Tracer>();
Mat _src_mat = input["ori_img"].get<Mat>();
OUTCOME_TRY(auto src_mat, MakeAvailableOnDevice(_src_mat, device_, stream_));
OUTCOME_TRY(stream_.Wait());
ExtractTransParamVisitor visitor{};
for (auto&& trans : tracer.trans_) {
std::visit(visitor, trans);
}
std::string tag = sha256_ + "_" + device_name_;
FuseFunc func = FuseKernel::Get().GetFunc(tag);
if (!visitor.valid) {
MMDEPLOY_ERROR("unsupported fuse transform");
throw std::invalid_argument("");
}
if (src_mat.type() != DataType::kINT8) {
MMDEPLOY_ERROR("unsupported data type in fuse transform");
throw std::invalid_argument("");
}
if (!func) {
MMDEPLOY_ERROR("can't find fuse function with tag: {}", tag);
throw std::invalid_argument("");
}
Value output = input;
auto img_fields = GetImageFields(input);
for (auto& key : img_fields) {
assert(input.contains(key));
auto src_tensor = input[key].get<Tensor>();
auto desc = src_tensor.desc();
desc.device = device_;
Tensor dst_tensor{desc};
func(stream_.GetNative(), src_mat.data<uint8_t>(), src_mat.height(), src_mat.width(),
to_string(src_mat.pixel_format()).c_str(), visitor.resize_hw[0], visitor.resize_hw[1],
visitor.resize_mode.c_str(), visitor.crop_tlbr[0], visitor.crop_tlbr[1],
visitor.crop_hw[0], visitor.crop_hw[1], visitor.mean[0], visitor.mean[1],
visitor.mean[2], visitor.std[0], visitor.std[1], visitor.std[2], visitor.pad_tlbr[0],
visitor.pad_tlbr[1], visitor.pad_tlbr[2], visitor.pad_tlbr[3], visitor.pad_hw[0],
visitor.pad_hw[1], visitor.pad_val, dst_tensor.data<float>(), dst_tensor.shape(2),
dst_tensor.shape(3));
output[key] = std::move(dst_tensor);
}
return ::mmdeploy::CollectImpl::Process(output);
}
std::string sha256_;
std::string device_name_;
};
class CollectImplCreator : public Creator<::mmdeploy::CollectImpl> {
public:
const char* GetName() const override { return "elena"; }
int GetVersion() const override { return 1; }
std::unique_ptr<::mmdeploy::CollectImpl> Create(const Value& args) override {
return std::make_unique<CollectImpl>(args);
}
};
} // namespace elena
} // namespace mmdeploy
using mmdeploy::CollectImpl;
using mmdeploy::elena::CollectImplCreator;
REGISTER_MODULE(CollectImpl, CollectImplCreator);
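CollectImpl above recovers the fused-kernel arguments by walking the tracer's recorded transforms with std::visit and ExtractTransParamVisitor. A stripped-down, self-contained sketch of that visitor-over-variant pass (the param structs and values below are illustrative, not mmdeploy's trace types):

#include <array>
#include <cstdio>
#include <variant>
#include <vector>

// Illustrative trace entries standing in for the real trace parameter types.
struct ResizeParam { std::array<int, 2> size; };
struct NormParam { std::array<float, 3> mean, stddev; };
using TraceEntry = std::variant<ResizeParam, NormParam>;

// Visitor that collects kernel arguments in a single pass over the trace.
struct ParamCollector {
  std::array<int, 2> resize_hw{};
  std::array<float, 3> mean{}, stddev{};
  void operator()(const ResizeParam& p) { resize_hw = p.size; }
  void operator()(const NormParam& p) { mean = p.mean; stddev = p.stddev; }
};

int main() {
  std::vector<TraceEntry> trace{ResizeParam{{224, 224}},
                                NormParam{{123.7f, 116.3f, 103.5f}, {58.4f, 57.1f, 57.4f}}};
  ParamCollector collector;
  for (const auto& entry : trace) std::visit(collector, entry);
  std::printf("resize %dx%d, mean[0]=%.1f\n", collector.resize_hw[0], collector.resize_hw[1],
              collector.mean[0]);
}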


@ -0,0 +1,44 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/preprocess/transform/crop.h"
using namespace std;
namespace mmdeploy {
namespace elena {
class CenterCropImpl : public ::mmdeploy::CenterCropImpl {
public:
explicit CenterCropImpl(const Value& args) : ::mmdeploy::CenterCropImpl(args) {}
protected:
Result<Tensor> CropImage(const Tensor& tensor, int top, int left, int bottom,
int right) override {
auto& src_desc = tensor.desc();
auto data_type = src_desc.data_type;
auto shape = src_desc.shape;
shape[1] = bottom - top + 1; // h
shape[2] = right - left + 1; // w
TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
Tensor dummy(dummy_desc, dummy_buffer_);
return dummy;
}
Buffer dummy_buffer_{Device{"cpu"}, 0, nullptr};
};
class CenterCropImplCreator : public Creator<::mmdeploy::CenterCropImpl> {
public:
const char* GetName() const override { return "elena"; }
int GetVersion() const override { return 1; }
ReturnType Create(const Value& args) override { return make_unique<CenterCropImpl>(args); }
};
} // namespace elena
} // namespace mmdeploy
using ::mmdeploy::CenterCropImpl;
using ::mmdeploy::elena::CenterCropImplCreator;
REGISTER_MODULE(CenterCropImpl, CenterCropImplCreator);
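This and the following elena implementations deliberately skip the pixel work: each transform only propagates the output shape, backed by an empty dummy buffer, because the actual computation is performed once by the fused kernel that CollectImpl invokes. A minimal sketch of that shape-only pass, using hypothetical helpers rather than mmdeploy types:

#include <array>
#include <cstdio>

// NHWC shape; only this metadata flows through the "dummy" pass.
using Shape = std::array<long, 4>;

// Hypothetical shape-only steps mirroring CenterCrop and HWC2CHW above.
Shape CenterCrop(Shape s, long top, long left, long bottom, long right) {
  s[1] = bottom - top + 1;  // h
  s[2] = right - left + 1;  // w
  return s;
}
Shape HWC2CHW(Shape s) { return {s[0], s[3], s[1], s[2]}; }

int main() {
  Shape s{1, 256, 256, 3};
  s = CenterCrop(s, 16, 16, 239, 239);  // -> 1x224x224x3
  s = HWC2CHW(s);                       // -> 1x3x224x224
  std::printf("%ldx%ldx%ldx%ld\n", s[0], s[1], s[2], s[3]);
}

The dummy tensors carry this shape information downstream but are never read for data.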


@ -0,0 +1,56 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/preprocess/transform/default_format_bundle.h"
namespace mmdeploy {
namespace elena {
class DefaultFormatBundleImpl : public ::mmdeploy::DefaultFormatBundleImpl {
public:
explicit DefaultFormatBundleImpl(const Value& args) : ::mmdeploy::DefaultFormatBundleImpl(args) {}
protected:
Result<Tensor> ToFloat32(const Tensor& tensor, const bool& img_to_float) override {
auto& src_desc = tensor.desc();
auto data_type = src_desc.data_type;
auto shape = src_desc.shape;
if (img_to_float && data_type == DataType::kINT8) {
data_type = DataType::kFLOAT;
}
TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
Tensor dummy(dummy_desc, dummy_buffer_);
return dummy;
}
Result<Tensor> HWC2CHW(const Tensor& tensor) override {
auto& src_desc = tensor.desc();
auto data_type = src_desc.data_type;
auto shape = src_desc.shape;
shape = {shape[0], shape[3], shape[1], shape[2]};
TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
Tensor dummy(dummy_desc, dummy_buffer_);
return dummy;
}
Buffer dummy_buffer_{Device{"cpu"}, 0, nullptr};
};
class DefaultFormatBundleImplCreator : public Creator<::mmdeploy::DefaultFormatBundleImpl> {
public:
const char* GetName() const override { return "elena"; }
int GetVersion() const override { return 1; }
ReturnType Create(const Value& args) override {
return std::make_unique<DefaultFormatBundleImpl>(args);
}
};
} // namespace elena
} // namespace mmdeploy
using mmdeploy::DefaultFormatBundleImpl;
using mmdeploy::elena::DefaultFormatBundleImplCreator;
REGISTER_MODULE(DefaultFormatBundleImpl, DefaultFormatBundleImplCreator);


@ -0,0 +1,32 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "elena_registry.h"
#include "mmdeploy/core/logger.h"
namespace mmdeploy {
namespace elena {
FuseKernel& FuseKernel::Get() {
static FuseKernel fuse_kernel;
return fuse_kernel;
}
FuseFunc FuseKernel::GetFunc(const std::string& name) {
if (entries_.count(name)) {
return entries_[name];
}
return nullptr;
}
int FuseKernel::Register(const std::string& name, FuseFunc func) {
if (entries_.count(name)) {
return -1;
}
MMDEPLOY_DEBUG("Register fuse kernel: '{}'", name);
entries_.emplace(name, func);
return 0;
}
} // namespace elena
} // namespace mmdeploy


@ -0,0 +1,45 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_ELENA_REGISTRY_H_
#define MMDEPLOY_ELENA_REGISTRY_H_
#include <cstdint>
#include <map>
#include <string>
#include "mmdeploy/core/macro.h"
namespace mmdeploy {
namespace elena {
using FuseFunc = void (*)(void* stream, uint8_t* data_in, int src_h, int src_w, const char* format,
int resize_h, int resize_w, const char* interpolation, int crop_top,
int crop_left, int crop_h, int crop_w, float mean0, float mean1,
float mean2, float std0, float std1, float std2, int pad_top,
int pad_left, int pad_bottom, int pad_right, int pad_h, int pad_w,
float pad_value, float* data_out, int dst_h, int dst_w);
class MMDEPLOY_API FuseKernel {
public:
static FuseKernel& Get();
int Register(const std::string& name, FuseFunc func);
FuseFunc GetFunc(const std::string& name);
private:
FuseKernel() = default;
std::map<std::string, FuseFunc> entries_;
};
class MMDEPLOY_API FuseKernelRegister {
public:
FuseKernelRegister(const std::string& name, FuseFunc func) {
FuseKernel::Get().Register(name, func);
}
};
} // namespace elena
} // namespace mmdeploy
#define REGISTER_FUSE_KERNEL(name, module_name, func) \
static ::mmdeploy::elena::FuseKernelRegister g_register_##name##_##func(module_name, func);
#endif  // MMDEPLOY_ELENA_REGISTRY_H_
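A code-generated fused kernel is expected to register itself through REGISTER_FUSE_KERNEL and is later looked up by tag — the sha256 read from the pipeline context plus the platform name, as built in CollectImpl::Process. A hedged usage sketch follows; the kernel name, tag value, and empty body are hypothetical, only the macro and the FuseFunc signature come from this header:

// elena_codegen_example.cpp -- illustrative only.
#include <cstdint>

#include "elena_registry.h"

namespace {
// Hypothetical fused preprocess kernel as the code generator might emit it; the
// real body would do resize + crop + normalize + pad in a single pass.
void FusedPreprocessCPU(void* stream, uint8_t* data_in, int src_h, int src_w, const char* format,
                        int resize_h, int resize_w, const char* interpolation, int crop_top,
                        int crop_left, int crop_h, int crop_w, float mean0, float mean1,
                        float mean2, float std0, float std1, float std2, int pad_top, int pad_left,
                        int pad_bottom, int pad_right, int pad_h, int pad_w, float pad_value,
                        float* data_out, int dst_h, int dst_w) {
  // ... generated element-wise loop would go here ...
}
}  // namespace

// The "<sha256>_<platform>" key is what CollectImpl::Process asks FuseKernel for at run time.
REGISTER_FUSE_KERNEL(fused_preprocess_cpu, "0123abcd_cpu", FusedPreprocessCPU);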


@ -0,0 +1,41 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/preprocess/transform/image2tensor.h"
namespace mmdeploy {
namespace elena {
class ImageToTensorImpl : public ::mmdeploy::ImageToTensorImpl {
public:
explicit ImageToTensorImpl(const Value& args) : ::mmdeploy::ImageToTensorImpl(args) {}
protected:
Result<Tensor> HWC2CHW(const Tensor& tensor) override {
auto& src_desc = tensor.desc();
auto data_type = src_desc.data_type;
auto shape = src_desc.shape;
shape = {shape[0], shape[3], shape[1], shape[2]};
TensorDesc dummy_desc = {Device{"cpu"}, data_type, shape};
Tensor dummy(dummy_desc, dummy_buffer_);
return dummy;
}
Buffer dummy_buffer_{Device{"cpu"}, 0, nullptr};
};
class ImageToTensorImplCreator : public Creator<::mmdeploy::ImageToTensorImpl> {
public:
const char* GetName() const override { return "elena"; }
int GetVersion() const override { return 1; }
ReturnType Create(const Value& args) override {
return std::make_unique<ImageToTensorImpl>(args);
}
};
} // namespace elena
} // namespace mmdeploy
using mmdeploy::ImageToTensorImpl;
using mmdeploy::elena::ImageToTensorImplCreator;
REGISTER_MODULE(ImageToTensorImpl, ImageToTensorImplCreator);

Some files were not shown because too many files have changed in this diff.