[Feature] Merge NCNN deployment to grimoire based on mmcls - revert [#25](https://github.com/grimoire/deploy_prototype/pull/25) (#30)
* add
* change VulkanSDK to 1.2.176.1
* add ncnn cmakelist
* add ncnn source code as third party
* add all ncnn
* ncnn compile passed
* onnx2ncnn correctly
* fix code style
* merge_as_grimoire_design, only backend_ops, manually register.
* remove data and test sh
* remove build example
* remove config ncnn
* remove onnx2ncnn intermediate files
* remove other files auto-generated
* remove vulkan tools
* remove Vulkan, gitignore new rules, __init__ new lines
* rollback __init__ to grimoire
* remove pytorch version pending
* grimoire comments reply 1, 3, 4
* reply comment 5,6,7
* add auto definer, add python register
* fix lint
* add ncnn deploy support
* add model_wrapper, fix a typo bug, and add code comment for onnx2ncnn(WIP)
* add model wrapper ncnn
* fix lint
* fix pep8
* fix pre-commit-config.yaml paths
* fix import
* fix lint
* remove sys.path.append
* remove sys
* isort fix
* fix double quoted
* fix trailing space
* try fix isort
* fix clang-format-9
* fix requests
* fix all comments
* Fix typo
* test code for grimoire
* fix ops register
* new definere
* fix visualization of mmcls
* remove temp
* fix flake8
* fix seed-isort-config
* fix thirdparty
* fix thirdparty
* fix yapf
* fix third_party_sort
* fix third party
* fix clang-format
* try fix clang-format
* try to fix clang format 9 customreshape
* try fix clang-format-9
* try fix clang-format-9
* try fix clang-format-9
* try fix ext
* fix onnx2ncnn
* Fix comments
* Fix Comments
* Fix Comments
* Fix Comments
* Fix conflict
* Fix flake8
* Update .isort.cfg
* Update ncnn_ext.cpp
* Update ncnn_ext.cpp
* fix missing ncnn backend code
* delete out of date comments of gather.cpp
* add DeployBaseClassifier
* add return -100 error
* clear out-of-date to do comments

Co-authored-by: 韩睿 <SENSETIME\hanrui1@cn0614008774l.domain.sensetime.com>
Co-authored-by: grimoire <yaoqian@sensetime.com>
Co-authored-by: grimoire <streetyao@live.com>
parent f607f1965b
commit e05521c933
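For orientation, the deployment flow this commit enables looks roughly as follows once the project has been built with -DBUILD_NCNN_OPS=ON and a valid NCNN_DIR (a sketch based on the APIs added below; the ONNX/param/bin file names are placeholders):

    import subprocess

    from mmdeploy.apis.ncnn import get_onnx2ncnn_path, is_available

    assert is_available(), 'ncnn backend ops or pyncnn are missing'

    # convert an exported ONNX model into the ncnn param/bin pair
    # using the onnx2ncnn tool built by this commit
    onnx2ncnn = get_onnx2ncnn_path()
    subprocess.call([onnx2ncnn, 'end2end.onnx', 'end2end.param', 'end2end.bin'])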
@@ -1,2 +1,2 @@
[settings]
known_third_party = mmcls,mmcv,mmdet,numpy,onnx,onnxruntime,packaging,pytest,setuptools,tensorrt,torch
known_third_party = mmcls,mmcv,mmdet,numpy,onnx,packaging,pytest,setuptools,tensorrt,torch
@@ -26,4 +26,18 @@ if (BUILD_TENSORRT_OPS)
    endif()
endif()

# NCNN config

# enable ncnn
option(BUILD_NCNN_OPS "enable NCNN ops" OFF)
# NCNN search path
if (BUILD_NCNN_OPS)
    if (NOT DEFINED NCNN_DIR)
        set(NCNN_DIR $ENV{NCNN_DIR})
    endif()
    if (NOT NCNN_DIR)
        message(FATAL_ERROR "NCNN_DIR is not found.")
    endif()
endif()

add_subdirectory (backend_ops)
@@ -16,3 +16,9 @@ if (BUILD_TENSORRT_OPS)
    message("Build TensorRT custom ops.")
    add_subdirectory (tensorrt)
endif()

# build NCNN ops
if (BUILD_NCNN_OPS)
    message("Build NCNN custom ops")
    add_subdirectory (ncnn)
endif()
@@ -0,0 +1,16 @@
set(TARGET_NAME mmlab_ncnn_ops)
set(SHARED_TARGET ${TARGET_NAME})

# ncnn
set(ncnn_DIR ${NCNN_DIR}/build/install/lib/cmake/ncnn)
find_package(ncnn)

if (ncnn_FOUND)
    message(STATUS "ncnn library found!")
else ()
    message(FATAL_ERROR "Could not locate ncnn" \n)
endif()

add_subdirectory (ops)
add_subdirectory (onnx2ncnn)
add_subdirectory (pyncnn_ext)
@@ -0,0 +1,15 @@

find_package(Protobuf)

if(PROTOBUF_FOUND)
    protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS ${NCNN_DIR}/tools/onnx/onnx.proto)
    add_executable(onnx2ncnn onnx2ncnn.cpp ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS})
    target_include_directories(onnx2ncnn
        PRIVATE
            ${PROTOBUF_INCLUDE_DIR}
            ${CMAKE_CURRENT_BINARY_DIR})
    target_link_libraries(onnx2ncnn PRIVATE ${PROTOBUF_LIBRARIES})

else()
    message(FATAL_ERROR "Protobuf not found, onnx model convert tool won't be built")
endif()
File diff suppressed because it is too large
@@ -0,0 +1,15 @@
# add plugin source
set(PLUGIN_LISTS custom_reshape
                 gather
                 shape)

foreach(PLUGIN_ITER ${PLUGIN_LISTS})
    file(GLOB PLUGIN_OPS_SRCS ${PLUGIN_ITER}/*.cpp)
    file(GLOB PLUGIN_OPS_HEADS ${PLUGIN_ITER}/*.h)
    set(BACKEND_OPS_SRCS ${BACKEND_OPS_SRCS} ${PLUGIN_OPS_SRCS} ${PLUGIN_OPS_HEADS})
endforeach(PLUGIN_ITER)

set(BACKEND_OPS_SRCS ${BACKEND_OPS_SRCS} ncnn_ops_register.cpp)

add_library(${SHARED_TARGET} SHARED ${BACKEND_OPS_SRCS})
target_link_libraries(${SHARED_TARGET} ncnn)
@@ -0,0 +1,217 @@
#include "custom_reshape.h"

#include "../ncnn_ops_definer.h"

namespace mmlab {
using namespace ncnn;
DEFINE_LAYER_CREATOR(CustomReshape)
DEFINE_NCNN_OPS(CustomReshape, CustomReshape)
CustomReshape::CustomReshape() {
  one_blob_only = false;
  support_inplace = false;
}

int CustomReshape::load_param(const ParamDict &pd) {
  permute = pd.get(0, 0);

  return 0;
}

int CustomReshape::forward(const std::vector<Mat> &bottom_blobs,
                           std::vector<Mat> &top_blobs,
                           const Option &opt) const {
  const Mat &bottom_blob = bottom_blobs[0];
  Mat &top_blob = top_blobs[0];
  int ndim = bottom_blobs[1].w;
  int w = 0;
  int h = 0;
  int c = 0;
  if (ndim == 1) {
    w = (int)(bottom_blobs[1].row(0)[0] + 0.5);
  }
  if (ndim == 2) {
    h = (int)(bottom_blobs[1].row(0)[0] + 0.5);
    w = (int)(bottom_blobs[1].row(0)[1] + 0.5);
  }
  if (ndim == 3) {
    c = (int)(bottom_blobs[1].row(0)[0] + 0.5);
    h = (int)(bottom_blobs[1].row(0)[1] + 0.5);
    w = (int)(bottom_blobs[1].row(0)[2] + 0.5);
  }

  size_t elemsize = bottom_blob.elemsize;
  int total = bottom_blob.w * bottom_blob.h * bottom_blob.c;

  int dims = bottom_blob.dims;

  // resolve out shape
  int outw = w;
  int outh = h;
  int outc = c;

  if (ndim == 1) {
    if (outw == 0)
      outw = bottom_blob.w;

    else if (outw == -1)
      outw = total;

    else {
      fprintf(stderr,
              "Warning: custom shape memory maybe invalid, using "
              "bottom_blob shape!\n");
      outw = bottom_blob.w;
    }

    if (dims == 1 && bottom_blob.w == outw) {
      top_blob = bottom_blob;
      return 0;
    }
  }
  if (ndim == 2) {
    if (outw == 0) outw = bottom_blob.w;
    if (outh == 0) outh = bottom_blob.h;

    if (outw == -1) outw = total / outh;
    if (outh == -1) outh = total / outw;

    if (dims == 2 && bottom_blob.h == outh) {
      top_blob = bottom_blob;
      return 0;
    }
  }
  if (ndim == 3) {
    if (outw == 0) outw = bottom_blob.w;
    if (outh == 0) outh = bottom_blob.h;
    if (outc == 0) outc = bottom_blob.c;

    if (outw == -1) outw = total / outc / outh;
    if (outh == -1) outh = total / outc / outw;
    if (outc == -1) outc = total / outh / outw;

    if (dims == 3 && bottom_blob.c == outc) {
      top_blob = bottom_blob;
      top_blob.w = outw;
      top_blob.h = outh;
      return 0;
    }
  }

  bool need_permute = permute == 1;
  if (dims == 2 && ndim == 2 && bottom_blob.h == outh) need_permute = false;
  if (dims == 3 && ndim == 3 && bottom_blob.c == outc) need_permute = false;

  if (need_permute) {
    Mat bottom_blob_permuted = bottom_blob;

    if (dims == 2) {
      // hw -> wh
      int _w = bottom_blob.w;
      int _h = bottom_blob.h;

      bottom_blob_permuted.create(_h, _w, elemsize, opt.workspace_allocator);
      if (bottom_blob_permuted.empty()) return -100;
      const float *ptr = bottom_blob;
      float *outptr = bottom_blob_permuted;

      for (int i = 0; i < _w; i++) {
        for (int j = 0; j < _h; j++) {
          outptr[i * _h + j] = ptr[j * _w + i];
        }
      }
    }
    if (dims == 3) {
      // chw -> hwc
      int _w = bottom_blob.w;
      int _h = bottom_blob.h;
      int channels = bottom_blob.c;

      bottom_blob_permuted.create(channels, _w, _h, elemsize,
                                  opt.workspace_allocator);
      if (bottom_blob_permuted.empty()) return -100;

#pragma omp parallel for num_threads(opt.num_threads)
      for (int q = 0; q < _h; q++) {
        float *outptr = bottom_blob_permuted.channel(q);

        for (int i = 0; i < _w; i++) {
          for (int j = 0; j < channels; j++) {
            const float *ptr = bottom_blob.channel(j).row(q);
            outptr[i * channels + j] = ptr[i];
          }
        }
      }
    }

    if (ndim == 1) {
      top_blob = bottom_blob_permuted.reshape(outw, opt.blob_allocator);
      if (top_blob.empty()) return -100;

      return 0;
    }

    // permute on nhwc/nhc
    Mat top_blob_permuted;
    if (ndim == 2) {
      top_blob_permuted =
          bottom_blob_permuted.reshape(outh, outw, opt.workspace_allocator);
    }
    if (ndim == 3) {
      top_blob_permuted = bottom_blob_permuted.reshape(outc, outw, outh,
                                                       opt.workspace_allocator);
    }

    if (top_blob_permuted.empty()) return -100;

    if (ndim == 2) {
      // wh -> hw
      top_blob.create(outw, outh, elemsize, opt.blob_allocator);
      if (top_blob.empty()) return -100;

      const float *ptr = top_blob_permuted;
      float *outptr = top_blob;

      for (int i = 0; i < outh; i++) {
        for (int j = 0; j < outw; j++) {
          outptr[i * outw + j] = ptr[j * outh + i];
        }
      }
    }
    if (ndim == 3) {
      // chw -> hwc
      top_blob.create(outw, outh, outc, elemsize, opt.blob_allocator);
      if (top_blob.empty()) return -100;

#pragma omp parallel for num_threads(opt.num_threads)
      for (int q = 0; q < outc; q++) {
        float *outptr = top_blob.channel(q);

        for (int i = 0; i < outh; i++) {
          const float *ptr = top_blob_permuted.channel(i);

          for (int j = 0; j < outw; j++) {
            outptr[i * outw + j] = ptr[j * outc + q];
          }
        }
      }
    }

    return 0;
  }

  if (ndim == 1) {
    top_blob = bottom_blob.reshape(outw, opt.blob_allocator);
  }
  if (ndim == 2) {
    top_blob = bottom_blob.reshape(outw, outh, opt.blob_allocator);
  }
  if (ndim == 3) {
    top_blob = bottom_blob.reshape(outw, outh, outc, opt.blob_allocator);
  }

  if (top_blob.empty()) return -100;

  return 0;
}

}  // namespace mmlab
@@ -0,0 +1,30 @@
#ifndef LAYER_CUSTOMRESHAPE_H
#define LAYER_CUSTOMRESHAPE_H

#include "layer.h"

namespace mmlab {

class CustomReshape : public ncnn::Layer {
 public:
  CustomReshape();

  virtual int load_param(const ncnn::ParamDict& pd);

  virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs,
                      std::vector<ncnn::Mat>& top_blobs,
                      const ncnn::Option& opt) const;

 public:
  // reshape flag
  // 0 = copy from bottom
  // -1 = remaining
  // -233 = drop this dim (default)

  // flag permute chw->hwc or hw->wh before and after reshape
  int permute;
};

}  // namespace mmlab

#endif  // LAYER_CUSTOMRESHAPE_H
@@ -0,0 +1,246 @@
#include "gather.h"

#include "../ncnn_ops_definer.h"

namespace mmlab {
using namespace ncnn;
DEFINE_LAYER_CREATOR(Gather)
DEFINE_NCNN_OPS(Gather, Gather)
Gather::Gather() {
  one_blob_only = false;
  support_inplace = false;
}

int Gather::load_param(const ParamDict &pd) {
  axis = pd.get(0, 0);

  return 0;
}

int Gather::forward(const std::vector<Mat> &bottom_blobs,
                    std::vector<Mat> &top_blobs, const Option &opt) const {
  const Mat &bottom_blob = bottom_blobs[0];
  const Mat &indices = bottom_blobs[1];
  int dims = bottom_blob.dims;
  int indices_dims = indices.dims;
  size_t elemsize = bottom_blob.elemsize;
  int positive_axis = axis < 0 ? dims + axis : axis;
  Mat &top_blob = top_blobs[0];

  const float *indices_ptr = indices;

  if (dims == 1 && indices_dims == 1)  // positive_axis == 0
  {
    int w = indices.w;
    top_blob.create(w, elemsize, opt.blob_allocator);
    if (top_blob.empty()) {
      return -100;
    }
    const float *ptr = bottom_blob;
    float *outptr = top_blob;
    for (int i = 0; i < w; i++) {
      float indice = indices_ptr[i];
      outptr[i] = ptr[(int)(indice + 0.5)];
    }

    return 0;
  }

  if (dims == 1 && indices_dims == 2)  // positive_axis == 0
  {
    int w = indices.w;
    int h = indices.h;
    top_blob.create(w, h, elemsize, opt.blob_allocator);
    if (top_blob.empty()) {
      return -100;
    }
    const float *ptr = bottom_blob;
    float *outptr = top_blob;
    for (int j = 0; j < h; j++) {
      for (int i = 0; i < w; i++) {
        int indice = (int)(indices_ptr[j * w + i] + 0.5);
        outptr[j * w + i] = ptr[indice];
      }
    }
    return 0;
  }
  if (dims == 1 && indices_dims == 3)  // positive_axis == 0
  {
    int c = indices.c;
    int w = indices.w;
    int h = indices.h;
    top_blob.create(c, w, h, elemsize, opt.blob_allocator);
    if (top_blob.empty()) {
      return -100;
    }
    const float *ptr = bottom_blob;

    for (int page = 0; page < c; page++) {
      indices_ptr = indices.channel(page);
      float *outptr = top_blob.channel(page);
      for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
          int indice = (int)(indices_ptr[j * w + i] + 0.5);
          outptr[j * w + i] = ptr[indice];
        }
      }
    }

    return 0;
  }

  if (dims == 2 && positive_axis == 0 && indices_dims == 1) {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    top_blob.create(w, indices.w, elemsize, opt.blob_allocator);
    // w -> w
    // h -> indices.w
    // h * w -> indices.w * w
    if (top_blob.empty()) {
      return -100;
    }
    const float *ptr = bottom_blob;
    float *outptr = top_blob;
    for (int i = 0; i < indices.w; i++) {
      for (int j = 0; j < w; j++) {
        int selected = (int)(indices_ptr[i] + 0.5);
        outptr[i * w + j] = ptr[selected * w + j];
      }
    }

    return 0;
  }

  if (dims == 2 && positive_axis == 1 && indices_dims == 1) {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    top_blob.create(h, indices.w, elemsize, opt.blob_allocator);
    // w -> h
    // h -> indices.w
    // h * w -> indices.w * h
    if (top_blob.empty()) {
      return -100;
    }
    const float *ptr = bottom_blob;
    float *outptr = top_blob;
    for (int i = 0; i < indices.w; i++) {
      for (int j = 0; j < h; j++) {
        int selected = (int)(indices_ptr[i] + 0.5);
        outptr[i * h + j] = ptr[j * w + selected];
      }
    }
    return 0;
  }

  if (dims == 2 && positive_axis == 0 && indices_dims == 2) {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    top_blob.create(w, indices.w, indices.h, elemsize, opt.blob_allocator);

    if (top_blob.empty()) {
      return -100;
    }
    const float *ptr = bottom_blob;

    for (int k = 0; k < indices.h; k++) {
      float *outptr = top_blob.channel(k);
      for (int i = 0; i < indices.w; i++) {
        for (int j = 0; j < w; j++) {
          int selected = (int)(indices_ptr[k * indices.w + i] + 0.5);
          outptr[i * w + j] = ptr[selected * w + j];
        }
      }
    }

    return 0;
  }

  if (dims == 2 && positive_axis == 1 && indices_dims == 2) {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    top_blob.create(h, indices.w, indices.h, elemsize, opt.blob_allocator);

    if (top_blob.empty()) {
      return -100;
    }
    const float *ptr = bottom_blob;
    for (int k = 0; k < indices.h; k++) {
      float *outptr = top_blob.channel(k);
      for (int i = 0; i < indices.w; i++) {
        for (int j = 0; j < h; j++) {
          int selected = (int)(indices_ptr[k * indices.w + i] + 0.5);
          outptr[i * h + j] = ptr[j * w + selected];
        }
      }
    }

    return 0;
  }

  if (dims == 3 && positive_axis == 0 && indices_dims == 1) {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    top_blob.create(w, h, indices.w, elemsize, opt.blob_allocator);

    if (top_blob.empty()) {
      return -100;
    }
    for (int i = 0; i < indices.w; i++) {
      int selected = (int)(indices_ptr[i] + 0.5);
      const unsigned char *ptr = bottom_blob.channel(selected);
      unsigned char *outptr = top_blob.channel(i);

      memcpy(outptr, ptr, w * h * elemsize);
    }
    return 0;
  }

  if (dims == 3 && positive_axis == 1 && indices_dims == 1) {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    top_blob.create(w, channels, indices.w, elemsize, opt.blob_allocator);
#pragma omp parallel for num_threads(opt.num_threads)
    // use parallel programming
    for (int i = 0; i < indices.w; i++) {
      int selected = (int)(indices_ptr[i] + 0.5);
      float *outptr = top_blob.channel(i);
      for (int j = 0; j < channels; j++) {
        const float *ptr = bottom_blob.channel(j);
        for (int k = 0; k < w; k++) {
          outptr[j * w + k] = ptr[selected * w + k];
        }
      }
    }

    return 0;
  }

  if (dims == 3 && positive_axis == 2 && indices_dims == 1) {
    fprintf(stderr, "gather: dim = 3\n");
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    top_blob.create(h, channels, indices.w, elemsize, opt.blob_allocator);
#pragma omp parallel for num_threads(opt.num_threads)
    // use parallel programming
    for (int i = 0; i < indices.w; i++) {
      int selected = (int)(indices_ptr[i] + 0.5);
      float *outptr = top_blob.channel(i);
      for (int j = 0; j < channels; j++) {
        const float *ptr = bottom_blob.channel(j);
        for (int k = 0; k < h; k++) {
          outptr[j * h + k] = ptr[k * w + selected];
        }
      }
    }
    fprintf(stderr, "top_blob.size: (%d %d %d)\n", top_blob.c, top_blob.h,
            top_blob.w);
    return 0;
  }

  return 0;
}

}  // namespace mmlab
@@ -0,0 +1,24 @@
#ifndef LAYER_GATHER_H
#define LAYER_GATHER_H

#include "layer.h"

namespace mmlab {

class Gather : public ncnn::Layer {
 public:
  Gather();

  virtual int load_param(const ncnn::ParamDict& pd);

  virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs,
                      std::vector<ncnn::Mat>& top_blobs,
                      const ncnn::Option& opt) const;

 public:
  int axis;
};

}  // namespace mmlab

#endif  // LAYER_GATHER_H
@@ -0,0 +1,30 @@
#ifndef NCNN_OPS_DEFINER_H
#define NCNN_OPS_DEFINER_H

#include <string>

#include "layer.h"
#include "ncnn_ops_register.h"

namespace mmlab {

class NCNNOpsDefiner {
 public:
  NCNNOpsDefiner(const std::string& ops_name,
                 const ncnn::layer_creator_func& creator_func = 0,
                 const ncnn::layer_destroyer_func& destroyer_func = 0)
      : _ops_name(ops_name) {
    get_mm_layer_creator()[_ops_name.c_str()] = creator_func;
  }

 private:
  const std::string _ops_name;
};

#define DEFINE_NCNN_OPS(ops_name, OpsLayer)               \
  static mmlab::NCNNOpsDefiner NCNNOpsDefiner##ops_name{ \
      #ops_name, OpsLayer##_layer_creator};

}  // namespace mmlab

#endif
@@ -0,0 +1,35 @@
#include "ncnn_ops_register.h"

#include <iostream>

std::map<const char *, ncnn::layer_creator_func> &get_mm_layer_creator() {
  static std::map<const char *, ncnn::layer_creator_func> _layer_creator_map;
  return _layer_creator_map;
}

std::map<const char *, ncnn::layer_destroyer_func> &get_mm_layer_destroyer() {
  static std::map<const char *, ncnn::layer_destroyer_func>
      _layer_destroyer_map;
  return _layer_destroyer_map;
}

int register_mm_custom_layers(ncnn::Net &net) {
  auto &layer_creator_map = get_mm_layer_creator();
  auto &layer_destroyer_map = get_mm_layer_destroyer();

  for (auto const &creator_pair : layer_creator_map) {
    auto creator_name = creator_pair.first;
    auto creator_func = creator_pair.second;

    ncnn::layer_destroyer_func destroyer_func = 0;
    if (layer_destroyer_map.find(creator_name) != layer_destroyer_map.end()) {
      destroyer_func = layer_destroyer_map[creator_name];
    }
    int ret =
        net.register_custom_layer(creator_name, creator_func, destroyer_func);
    if (0 != ret) {
      return ret;
    }
  }
  return 0;
}
@@ -0,0 +1,16 @@
#ifndef NCNN_OPS_REGISTER_H
#define NCNN_OPS_REGISTER_H

#include <map>
#include <string>

#include "net.h"

extern "C" {
std::map<const char*, ncnn::layer_creator_func>& get_mm_layer_creator();
std::map<const char*, ncnn::layer_destroyer_func>& get_mm_layer_destroyer();

int register_mm_custom_layers(ncnn::Net& net);
}

#endif
@@ -0,0 +1,44 @@
#include "shape.h"

#include "../ncnn_ops_definer.h"

namespace mmlab {
using namespace ncnn;
DEFINE_LAYER_CREATOR(Shape)
DEFINE_NCNN_OPS(Shape, Shape)
Shape::Shape() {
  one_blob_only = true;
  support_inplace = false;
}

int Shape::forward(const Mat &bottom_blob, Mat &top_blob,
                   const Option &opt) const {
  int dims = bottom_blob.dims;
  int w = bottom_blob.w;
  size_t elemsize = sizeof(bottom_blob.w);
  top_blob.create(dims, elemsize, opt.blob_allocator);
  if (top_blob.empty()) {
    return -100;
  }
  float *outptr = top_blob;
  if (dims == 1) {
    outptr[0] = w;
    return 0;
  }
  if (dims == 2) {
    int h = bottom_blob.h;
    outptr[0] = h;
    outptr[1] = w;
    return 0;
  }
  if (dims == 3) {
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    outptr[0] = channels;
    outptr[1] = h;
    outptr[2] = w;
    return 0;
  }
}

}  // namespace mmlab
@@ -0,0 +1,18 @@
#ifndef LAYER_SHAPE_H
#define LAYER_SHAPE_H

#include "layer.h"

namespace mmlab {

class Shape : public ncnn::Layer {
 public:
  Shape();

  virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob,
                      const ncnn::Option& opt) const;
};

}  // namespace mmlab

#endif  // LAYER_SHAPE_H
@@ -0,0 +1,12 @@
# ncnn
set(ncnn_DIR ${NCNN_DIR}/build/install/lib/cmake/ncnn)
find_package(ncnn)

# pybind11
set(PYBIND11_DIR ${NCNN_DIR}/python/pybind11)
add_subdirectory(${PYBIND11_DIR} pybind11)

include_directories(${pybind11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS})
pybind11_add_module(ncnn_ext ncnn_ext.cpp)
target_link_libraries(ncnn_ext PUBLIC ncnn ${SHARED_TARGET})
set_target_properties(ncnn_ext PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/mmdeploy/apis/ncnn)
@@ -0,0 +1,11 @@
#include <pybind11/pybind11.h>

#include "../ops/ncnn_ops_register.h"
#include "net.h"

PYBIND11_MODULE(ncnn_ext, m) {
  m.def(
      "register_mm_custom_layers",
      [](ncnn::Net &net) { return register_mm_custom_layers(net); },
      "register all mmlab custom ncnn layers.");
}
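The pybind11 module above is installed next to the Python API (see the LIBRARY_OUTPUT_DIRECTORY rule in the pyncnn_ext CMakeLists), so the custom layers can be registered from Python before a converted model is loaded; a minimal sketch, assuming the extension was built (file names are placeholders):

    import ncnn

    from mmdeploy.apis.ncnn import ncnn_ext

    net = ncnn.Net()
    # Shape/Gather/CustomReshape must be registered before load_param,
    # otherwise ncnn cannot resolve the custom layer types in the .param file
    ncnn_ext.register_mm_custom_layers(net)
    net.load_param('end2end.param')
    net.load_model('end2end.bin')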
@@ -0,0 +1 @@
backend = 'ncnn'
@@ -0,0 +1 @@
_base_ = ['./mmcls_base.py', '../_base_/backends/ncnn.py']
@@ -0,0 +1,17 @@
import importlib
import os.path as osp

from .init_plugins import get_onnx2ncnn_path, get_ops_path

__all__ = ['get_ops_path', 'get_onnx2ncnn_path']


def is_available():
    ncnn_ops_path = get_ops_path()
    if not osp.exists(ncnn_ops_path):
        return False
    has_pyncnn = importlib.util.find_spec('ncnn') is not None
    has_pyncnn_ext = importlib.util.find_spec(
        'mmdeploy.apis.ncnn.ncnn_ext') is not None

    return has_pyncnn and has_pyncnn_ext
@@ -0,0 +1,25 @@
import glob
import os


def get_ops_path():
    """Get NCNN custom ops library path."""
    wildcard = os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            '../../../build/lib/libmmlab_ncnn_ops.so'))

    paths = glob.glob(wildcard)
    lib_path = paths[0] if len(paths) > 0 else ''
    return lib_path


def get_onnx2ncnn_path():
    """Get onnx2ncnn path."""
    wildcard = os.path.abspath(
        os.path.join(
            os.path.dirname(__file__), '../../../build/bin/onnx2ncnn'))

    paths = glob.glob(wildcard)
    lib_path = paths[0] if len(paths) > 0 else ''
    return lib_path
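Both helpers simply glob the build tree and return an empty string when nothing was built, so callers can fail early; a small guard in that spirit (assumed usage, mirroring is_available above):

    from mmdeploy.apis.ncnn import get_onnx2ncnn_path, get_ops_path

    if not get_ops_path() or not get_onnx2ncnn_path():
        raise RuntimeError('build mmdeploy with -DBUILD_NCNN_OPS=ON and a '
                           'valid NCNN_DIR before using the ncnn backend')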
@@ -15,8 +15,8 @@ def is_available():

if is_available():
    from .onnx2tensorrt import onnx2tensorrt
    from .tensorrt_utils import (TRTWrapper, load_trt_engine,
                                 create_trt_engine, save_trt_engine)
    from .tensorrt_utils import (TRTWrapper, create_trt_engine,
                                 load_trt_engine, save_trt_engine)

    # load tensorrt plugin lib
    load_tensorrt_plugin()
@@ -104,6 +104,13 @@ def init_backend_model(model_files: Sequence[str],
        from mmdeploy.mmcls.export import TensorRTClassifier
        backend_model = TensorRTClassifier(
            model_files[0], class_names=class_names, device_id=device_id)
    elif backend == 'ncnn':
        from mmdeploy.mmcls.export import NCNNClassifier
        backend_model = NCNNClassifier(
            model_files[0],
            model_files[1],
            class_names=class_names,
            device_id=device_id)
    else:
        raise NotImplementedError(f'Unsupported backend type: {backend}')
    return backend_model

@@ -1,4 +1,8 @@
from .model_wrappers import ONNXRuntimeClassifier, TensorRTClassifier
from .model_wrappers import (NCNNClassifier, ONNXRuntimeClassifier,
                             TensorRTClassifier)
from .prepare_input import create_input

__all__ = ['create_input', 'ONNXRuntimeClassifier', 'TensorRTClassifier']
__all__ = [
    'create_input', 'NCNNClassifier', 'ONNXRuntimeClassifier',
    'TensorRTClassifier'
]
@@ -5,12 +5,34 @@ import torch
from mmcls.models import BaseClassifier


class ONNXRuntimeClassifier(BaseClassifier):
class DeployBaseClassifier(BaseClassifier):
    """Base Class of Wrapper for classifier's inference."""

    def __init__(self, class_names, device_id):
        super(DeployBaseClassifier, self).__init__()
        self.CLASSES = class_names
        self.device_id = device_id

    def simple_test(self, img, *args, **kwargs):
        raise NotImplementedError('This method is not implemented.')

    def extract_feat(self, imgs):
        raise NotImplementedError('This method is not implemented.')

    def forward_train(self, imgs, **kwargs):
        raise NotImplementedError('This method is not implemented.')

    def forward_test(self, imgs, *args, **kwargs):
        raise NotImplementedError('This method is not implemented.')


class ONNXRuntimeClassifier(DeployBaseClassifier):
    """Wrapper for classifier's inference with ONNXRuntime."""

    def __init__(self, onnx_file, class_names, device_id):
        super(ONNXRuntimeClassifier, self).__init__()
        super(ONNXRuntimeClassifier, self).__init__(class_names, device_id)
        import onnxruntime as ort

        sess = ort.InferenceSession(onnx_file)

        providers = ['CPUExecutionProvider']

@@ -22,21 +44,10 @@ class ONNXRuntimeClassifier(BaseClassifier):
        sess.set_providers(providers, options)

        self.sess = sess
        self.CLASSES = class_names
        self.device_id = device_id
        self.io_binding = sess.io_binding()
        self.output_names = [_.name for _ in sess.get_outputs()]
        self.is_cuda_available = is_cuda_available

    def simple_test(self, img, *args, **kwargs):
        raise NotImplementedError('This method is not implemented.')

    def extract_feat(self, imgs):
        raise NotImplementedError('This method is not implemented.')

    def forward_train(self, imgs, **kwargs):
        raise NotImplementedError('This method is not implemented.')

    def forward_test(self, imgs, *args, **kwargs):
        input_data = imgs
        # set io binding for inputs/outputs

@@ -59,10 +70,10 @@ class ONNXRuntimeClassifier(BaseClassifier):
        return list(results)


class TensorRTClassifier(BaseClassifier):
class TensorRTClassifier(DeployBaseClassifier):

    def __init__(self, trt_file, class_names, device_id):
        super(TensorRTClassifier, self).__init__()
        super(TensorRTClassifier, self).__init__(class_names, device_id)
        from mmdeploy.apis.tensorrt import TRTWrapper, load_tensorrt_plugin
        try:
            load_tensorrt_plugin()

@@ -72,17 +83,6 @@ class TensorRTClassifier(BaseClassifier):
        model = TRTWrapper(trt_file)

        self.model = model
        self.device_id = device_id
        self.CLASSES = class_names

    def simple_test(self, img, *args, **kwargs):
        raise NotImplementedError('This method is not implemented.')

    def extract_feat(self, imgs):
        raise NotImplementedError('This method is not implemented.')

    def forward_train(self, imgs, **kwargs):
        raise NotImplementedError('This method is not implemented.')

    def forward_test(self, imgs, *args, **kwargs):
        input_data = imgs

@@ -91,3 +91,33 @@
        results = results.detach().cpu().numpy()

        return list(results)


class NCNNClassifier(DeployBaseClassifier):

    def __init__(self, ncnn_param_file, ncnn_bin_file, class_names, device_id):
        super(NCNNClassifier, self).__init__(class_names, device_id)
        import ncnn
        from mmdeploy.apis.ncnn import ncnn_ext
        self.net = ncnn.Net()
        ncnn_ext.register_mm_custom_layers(self.net)
        self.net.load_param(ncnn_param_file)
        self.net.load_model(ncnn_bin_file)

    def forward_test(self, imgs, *args, **kwargs):
        import ncnn
        assert len(imgs.shape) == 4
        # Only for batch == 1 now.
        assert imgs.shape[0] == 1
        input_data = imgs[0].cpu().numpy()
        input_data = ncnn.Mat(input_data)
        if self.device_id == -1:
            ex = self.net.create_extractor()
            ex.input('input', input_data)
            ret, results = ex.extract('output')
            results = np.array(results)
            assert ret != -100, 'Memory allocation failed in ncnn layers'
            assert ret == 0
            return [results]
        else:
            raise NotImplementedError('GPU device is not implemented.')
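Putting the new wrapper to work only needs the converted param/bin pair; a minimal CPU-only sketch (the file names, class list, and input shape are placeholders, and only batch size 1 is supported):

    import torch

    from mmdeploy.mmcls.export import NCNNClassifier

    model = NCNNClassifier('end2end.param', 'end2end.bin',
                           class_names=['cat', 'dog'], device_id=-1)
    # forward_test expects a 4-D NCHW tensor with batch size 1
    scores = model.forward_test(torch.rand(1, 3, 224, 224))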
@@ -55,7 +55,7 @@ def instance_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled):

    # Norm has shape [N, C, *] so we reshape weight and bias to [C, *]
    axes = list(range(1, input_rank - 1))
    from torch.onnx.symbolic_opset9 import mul, add
    from torch.onnx.symbolic_opset9 import add, mul
    return add(g, mul(g, norm, _unsqueeze_helper(g, weight, axes)),
               _unsqueeze_helper(g, bias, axes))

@@ -69,9 +69,10 @@ def test_function_rewriter():


def test_module_rewriter():
    from mmdeploy.core import MODULE_REWRITER, patch_model
    from torchvision.models.resnet import resnet50

    from mmdeploy.core import MODULE_REWRITER, patch_model

    @MODULE_REWRITER.register_rewrite_module(
        module_type='torchvision.models.resnet.Bottleneck', backend='tensorrt')
    class BottleneckWrapper(torch.nn.Module):

@@ -105,10 +106,11 @@ def test_module_rewriter():


def test_symbolic_register():
    import onnx
    from torch.autograd import Function

    import mmdeploy
    from mmdeploy.core import SYMBOLIC_REGISTER, register_extra_symbolics
    from torch.autograd import Function
    import onnx

    class TestFunc(Function):

@@ -1,6 +1,7 @@
import argparse
import logging
import os.path as osp
import subprocess
from functools import partial

import mmcv

@@ -140,6 +141,29 @@ def main():

            backend_files.append(osp.join(args.work_dir, save_file))

    elif backend == 'ncnn':
        from mmdeploy.apis.ncnn import get_onnx2ncnn_path
        from mmdeploy.apis.ncnn import is_available as is_available_ncnn

        if not is_available_ncnn():
            logging.error('ncnn support is not available.')
            exit(-1)

        onnx2ncnn_path = get_onnx2ncnn_path()

        backend_files = []
        for onnx_path in onnx_files:
            onnx_name = osp.splitext(osp.split(onnx_path)[1])[0]
            save_param = onnx_name + '.param'
            save_bin = onnx_name + '.bin'

            save_param = osp.join(args.work_dir, save_param)
            save_bin = osp.join(args.work_dir, save_bin)

            subprocess.call([onnx2ncnn_path, onnx_path, save_param, save_bin])

            backend_files += [save_param, save_bin]

    # check model outputs by visualization
    codebase = deploy_cfg['codebase']