[Feature] Multi level roi align (#9)

* add multi level roi align * better trt plugin code * fix lint * optimizer trt_multi_level_roi_align.cpp * fix lint
2021-07-09 13:56:50 +08:00 · 2021-07-09 13:56:50 +08:00 · 8408ae2bcf
parent dae6a8ccf9
commit 8408ae2bcf
23 changed files with 733 additions and 85 deletions
--- a/backend_ops/tensorrt/CMakeLists.txt
+++ b/backend_ops/tensorrt/CMakeLists.txt
@ -51,7 +51,8 @@ INCLUDE_DIRECTORIES(${CUB_ROOT_DIR})
 set(PLUGIN_LISTS scatternd
                 nms
                 roi_align
-                 batched_nms)
+                 batched_nms
+                 multi_level_roi_align)

 foreach(PLUGIN_ITER ${PLUGIN_LISTS})
        file(GLOB PLUGIN_OPS_SRCS ${PLUGIN_ITER}/*.cpp ${PLUGIN_ITER}/*.cu)
@ -59,11 +60,10 @@ foreach(PLUGIN_ITER ${PLUGIN_LISTS})
        set(BACKEND_OPS_SRCS ${BACKEND_OPS_SRCS} ${PLUGIN_OPS_SRCS} ${PLUGIN_OPS_HEADS})
 endforeach(PLUGIN_ITER)

-list(APPEND BACKEND_OPS_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/trt_plugin.cpp")
 list(APPEND BACKEND_OPS_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/common_impl/trt_cuda_helper.cu")

 set(INFER_PLUGIN_LIB ${TENSORRT_LIBRARY})

-cuda_add_library(${SHARED_TARGET} SHARED ${BACKEND_OPS_SRCS})
+cuda_add_library(${SHARED_TARGET} MODULE ${BACKEND_OPS_SRCS})
 target_link_libraries(${SHARED_TARGET} ${INFER_PLUGIN_LIB})
 target_include_directories(${SHARED_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common)
--- a/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp
+++ b/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp
@ -16,9 +16,6 @@ static const char* NMS_PLUGIN_VERSION{"1"};
 static const char* NMS_PLUGIN_NAME{"TRTBatchedNMS"};
 }  // namespace

-PluginFieldCollection TRTBatchedNMSPluginDynamicCreator::mFC{};
-std::vector<PluginField> TRTBatchedNMSPluginDynamicCreator::mPluginAttributes;
-
 TRTBatchedNMSPluginDynamic::TRTBatchedNMSPluginDynamic(NMSParameters params)
    : param(params) {}

@ -268,3 +265,5 @@ void TRTBatchedNMSPluginDynamicCreator::setPluginNamespace(
 const char* TRTBatchedNMSPluginDynamicCreator::getPluginNamespace() const {
  return mNamespace.c_str();
 }
+
+REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSPluginDynamicCreator);
--- a/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
+++ b/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp
@ -109,9 +109,9 @@ class TRTBatchedNMSPluginDynamicCreator : public nvinfer1::IPluginCreator {
  const char* getPluginNamespace() const override;

 private:
-  static nvinfer1::PluginFieldCollection mFC;
+  nvinfer1::PluginFieldCollection mFC;
  nvinfer1::plugin::NMSParameters params;
-  static std::vector<nvinfer1::PluginField> mPluginAttributes;
+  std::vector<nvinfer1::PluginField> mPluginAttributes;
  std::string mNamespace;
 };

--- a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
+++ b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp
@ -0,0 +1,260 @@
+
+#include "trt_multi_level_roi_align.hpp"
+
+#include <assert.h>
+
+#include <chrono>
+
+#include "trt_multi_level_roi_align_kernel.hpp"
+#include "trt_serialize.hpp"
+
+namespace {
+static const char *PLUGIN_VERSION{"1"};
+static const char *PLUGIN_NAME{"MMCVMultiLevelRoiAlign"};
+}  // namespace
+
+MultiLevelRoiAlignPluginDynamic::MultiLevelRoiAlignPluginDynamic(
+    const std::string &name, int alignedHeight, int alignedWidth, int sampleNum,
+    const std::vector<float> &featmapStrides, float roiScaleFactor,
+    int finestScale, bool aligned)
+    : mLayerName(name),
+      mAlignedHeight(alignedHeight),
+      mAlignedWidth(alignedWidth),
+      mSampleNum(sampleNum),
+      mFeatmapStrides(featmapStrides),
+      mRoiScaleFactor(roiScaleFactor),
+      mFinestScale(finestScale),
+      mAligned(aligned) {}
+
+MultiLevelRoiAlignPluginDynamic::MultiLevelRoiAlignPluginDynamic(
+    const std::string name, const void *data, size_t length)
+    : mLayerName(name) {
+  deserialize_value(&data, &length, &mAlignedHeight);
+  deserialize_value(&data, &length, &mAlignedWidth);
+  deserialize_value(&data, &length, &mSampleNum);
+  deserialize_value(&data, &length, &mRoiScaleFactor);
+  deserialize_value(&data, &length, &mFinestScale);
+  deserialize_value(&data, &length, &mAligned);
+  deserialize_value(&data, &length, &mFeatmapStrides);
+}
+
+nvinfer1::IPluginV2DynamicExt *MultiLevelRoiAlignPluginDynamic::clone() const {
+  MultiLevelRoiAlignPluginDynamic *plugin = new MultiLevelRoiAlignPluginDynamic(
+      mLayerName, mAlignedHeight, mAlignedWidth, mSampleNum, mFeatmapStrides,
+      mRoiScaleFactor, mFinestScale, mAligned);
+  plugin->setPluginNamespace(getPluginNamespace());
+
+  return plugin;
+}
+
+nvinfer1::DimsExprs MultiLevelRoiAlignPluginDynamic::getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
+    nvinfer1::IExprBuilder &exprBuilder) {
+  assert(nbInputs == mFeatmapStrides.size() + 1);
+
+  nvinfer1::DimsExprs ret;
+  ret.nbDims = 4;
+  ret.d[0] = inputs[0].d[0];
+  ret.d[1] = inputs[1].d[1];
+  ret.d[2] = exprBuilder.constant(mAlignedHeight);
+  ret.d[3] = exprBuilder.constant(mAlignedWidth);
+
+  return ret;
+}
+
+bool MultiLevelRoiAlignPluginDynamic::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc *inOut, int nbInputs,
+    int nbOutputs) {
+  const auto *in = inOut;
+  const auto *out = inOut + nbInputs;
+  return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
+         inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
+}
+
+void MultiLevelRoiAlignPluginDynamic::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) {
+  // Validate input arguments
+  assert(nbOutputs == 1);
+  assert(nbInputs == mFeatmapStrides.size() + 1);
+}
+
+size_t MultiLevelRoiAlignPluginDynamic::getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
+    const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const {
+  return 0;
+}
+
+int MultiLevelRoiAlignPluginDynamic::enqueue(
+    const nvinfer1::PluginTensorDesc *inputDesc,
+    const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
+    void *const *outputs, void *workSpace, cudaStream_t stream) {
+  int num_rois = inputDesc[0].dims.d[0];
+  int batch_size = inputDesc[1].dims.d[0];
+  int channels = inputDesc[1].dims.d[1];
+
+  const int kMaxFeatMap = 10;
+  int heights[kMaxFeatMap];
+  int widths[kMaxFeatMap];
+  float strides[kMaxFeatMap];
+
+  int num_feats = mFeatmapStrides.size();
+  for (int i = 0; i < num_feats; ++i) {
+    heights[i] = inputDesc[i + 1].dims.d[2];
+    widths[i] = inputDesc[i + 1].dims.d[3];
+    strides[i] = mFeatmapStrides[i];
+  }
+
+  const void *rois = inputs[0];
+  const void *const *feats = inputs + 1;
+
+  multi_level_roi_align<float>((float *)outputs[0], (const float *)rois,
+                               num_rois, feats, num_feats, batch_size, channels,
+                               &heights[0], &widths[0], &strides[0],
+                               mAlignedHeight, mAlignedWidth, mSampleNum,
+                               mRoiScaleFactor, mFinestScale, mAligned, stream);
+
+  return 0;
+}
+
+nvinfer1::DataType MultiLevelRoiAlignPluginDynamic::getOutputDataType(
+    int index, const nvinfer1::DataType *inputTypes, int nbInputs) const {
+  return nvinfer1::DataType::kFLOAT;
+}
+
+// IPluginV2 Methods
+const char *MultiLevelRoiAlignPluginDynamic::getPluginType() const {
+  return PLUGIN_NAME;
+}
+
+const char *MultiLevelRoiAlignPluginDynamic::getPluginVersion() const {
+  return PLUGIN_VERSION;
+}
+
+int MultiLevelRoiAlignPluginDynamic::getNbOutputs() const { return 1; }
+
+int MultiLevelRoiAlignPluginDynamic::initialize() { return 0; }
+
+void MultiLevelRoiAlignPluginDynamic::terminate() {}
+
+size_t MultiLevelRoiAlignPluginDynamic::getSerializationSize() const {
+  return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) +
+         serialized_size(mAlignedWidth) + serialized_size(mSampleNum) +
+         serialized_size(mRoiScaleFactor) + serialized_size(mFinestScale) +
+         serialized_size(mAligned);
+}
+
+void MultiLevelRoiAlignPluginDynamic::serialize(void *buffer) const {
+  serialize_value(&buffer, mAlignedHeight);
+  serialize_value(&buffer, mAlignedWidth);
+  serialize_value(&buffer, mSampleNum);
+  serialize_value(&buffer, mRoiScaleFactor);
+  serialize_value(&buffer, mFinestScale);
+  serialize_value(&buffer, mAligned);
+  serialize_value(&buffer, mFeatmapStrides);
+}
+
+void MultiLevelRoiAlignPluginDynamic::destroy() {
+  // This gets called when the network containing plugin is destroyed
+  delete this;
+}
+
+void MultiLevelRoiAlignPluginDynamic::setPluginNamespace(
+    const char *libNamespace) {
+  mNamespace = libNamespace;
+}
+
+const char *MultiLevelRoiAlignPluginDynamic::getPluginNamespace() const {
+  return mNamespace.c_str();
+}
+
+MultiLevelRoiAlignPluginDynamicCreator::
+    MultiLevelRoiAlignPluginDynamicCreator() {
+  mPluginAttributes = std::vector<nvinfer1::PluginField>(
+      {nvinfer1::PluginField("output_height"),
+       nvinfer1::PluginField("output_width"),
+       nvinfer1::PluginField("sampling_ratio"),
+       nvinfer1::PluginField("featmap_strides"),
+       nvinfer1::PluginField("roi_scale_factor"),
+       nvinfer1::PluginField("finest_scale"),
+       nvinfer1::PluginField("aligned")});
+  mFC.nbFields = mPluginAttributes.size();
+  mFC.fields = mPluginAttributes.data();
+}
+
+const char *MultiLevelRoiAlignPluginDynamicCreator::getPluginName() const {
+  return PLUGIN_NAME;
+}
+
+const char *MultiLevelRoiAlignPluginDynamicCreator::getPluginVersion() const {
+  return PLUGIN_VERSION;
+}
+
+const nvinfer1::PluginFieldCollection *
+MultiLevelRoiAlignPluginDynamicCreator::getFieldNames() {
+  return &mFC;
+}
+
+nvinfer1::IPluginV2 *MultiLevelRoiAlignPluginDynamicCreator::createPlugin(
+    const char *name, const nvinfer1::PluginFieldCollection *fc) {
+  int alignedHeight = 7;
+  int alignedWidth = 7;
+  int sampleNum = 2;
+  std::vector<float> featmapStrides;
+  float roiScaleFactor = -1;
+  int finestScale = 56;
+  bool aligned = false;
+
+  for (int i = 0; i < fc->nbFields; i++) {
+    if (fc->fields[i].data == nullptr) {
+      continue;
+    }
+    std::string field_name(fc->fields[i].name);
+
+    if (field_name.compare("output_height") == 0) {
+      alignedHeight = static_cast<const int *>(fc->fields[i].data)[0];
+    } else if (field_name.compare("output_width") == 0) {
+      alignedWidth = static_cast<const int *>(fc->fields[i].data)[0];
+    } else if (field_name.compare("sampling_ratio") == 0) {
+      sampleNum = static_cast<const int *>(fc->fields[i].data)[0];
+    } else if (field_name.compare("roi_scale_factor") == 0) {
+      roiScaleFactor = static_cast<const float *>(fc->fields[i].data)[0];
+    } else if (field_name.compare("finest_scale") == 0) {
+      finestScale = static_cast<const int *>(fc->fields[i].data)[0];
+    } else if (field_name.compare("featmap_strides") == 0) {
+      int data_size = (fc->fields[i].length) / sizeof(float);
+      const float *data_start = static_cast<const float *>(fc->fields[i].data);
+      featmapStrides = std::vector<float>(data_start, data_start + data_size);
+    } else if (field_name.compare("aligned") == 0) {
+      int aligned_int = static_cast<const int *>(fc->fields[i].data)[0];
+      aligned = aligned_int != 0;
+    }
+  }
+
+  assert(featmapStrides.size() != 0);
+
+  MultiLevelRoiAlignPluginDynamic *plugin = new MultiLevelRoiAlignPluginDynamic(
+      name, alignedHeight, alignedWidth, sampleNum, featmapStrides,
+      roiScaleFactor, finestScale, aligned);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+nvinfer1::IPluginV2 *MultiLevelRoiAlignPluginDynamicCreator::deserializePlugin(
+    const char *name, const void *serialData, size_t serialLength) {
+  auto plugin =
+      new MultiLevelRoiAlignPluginDynamic(name, serialData, serialLength);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+void MultiLevelRoiAlignPluginDynamicCreator::setPluginNamespace(
+    const char *libNamespace) {
+  mNamespace = libNamespace;
+}
+
+const char *MultiLevelRoiAlignPluginDynamicCreator::getPluginNamespace() const {
+  return mNamespace.c_str();
+}
+
+REGISTER_TENSORRT_PLUGIN(MultiLevelRoiAlignPluginDynamicCreator);
--- a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
+++ b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp
@ -0,0 +1,115 @@
+#ifndef TRT_MULTI_LEVEL_ROI_ALIGN_HPP
+#define TRT_MULTI_LEVEL_ROI_ALIGN_HPP
+
+#include <cublas_v2.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "trt_plugin_helper.hpp"
+
+class MultiLevelRoiAlignPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
+ public:
+  MultiLevelRoiAlignPluginDynamic(const std::string &name, int alignedHeight,
+                                  int alignedWidth, int sampleNum,
+                                  const std::vector<float> &featmapStrides,
+                                  float roiScaleFactor = -1,
+                                  int finestScale = 56, bool aligned = false);
+
+  MultiLevelRoiAlignPluginDynamic(const std::string name, const void *data,
+                                  size_t length);
+
+  // It doesn't make sense to make MultiLevelRoiAlignPluginDynamic without
+  // arguments, so we delete default constructor.
+  MultiLevelRoiAlignPluginDynamic() = delete;
+
+  // IPluginV2DynamicExt Methods
+  nvinfer1::IPluginV2DynamicExt *clone() const override;
+  nvinfer1::DimsExprs getOutputDimensions(
+      int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
+      nvinfer1::IExprBuilder &exprBuilder) override;
+  bool supportsFormatCombination(int pos,
+                                 const nvinfer1::PluginTensorDesc *inOut,
+                                 int nbInputs, int nbOutputs) override;
+  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in,
+                       int nbInputs,
+                       const nvinfer1::DynamicPluginTensorDesc *out,
+                       int nbOutputs) override;
+  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
+                          int nbInputs,
+                          const nvinfer1::PluginTensorDesc *outputs,
+                          int nbOutputs) const override;
+  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
+              const nvinfer1::PluginTensorDesc *outputDesc,
+              const void *const *inputs, void *const *outputs, void *workspace,
+              cudaStream_t stream) override;
+
+  // IPluginV2Ext Methods
+  nvinfer1::DataType getOutputDataType(int index,
+                                       const nvinfer1::DataType *inputTypes,
+                                       int nbInputs) const override;
+
+  // IPluginV2 Methods
+  const char *getPluginType() const override;
+  const char *getPluginVersion() const override;
+  int getNbOutputs() const override;
+  int initialize() override;
+  void terminate() override;
+  size_t getSerializationSize() const override;
+  void serialize(void *buffer) const override;
+  void destroy() override;
+  void setPluginNamespace(const char *pluginNamespace) override;
+  const char *getPluginNamespace() const override;
+
+ private:
+  const std::string mLayerName;
+  std::string mNamespace;
+
+  int mAlignedHeight;
+  int mAlignedWidth;
+  int mSampleNum;
+  std::vector<float> mFeatmapStrides;
+  float mRoiScaleFactor;
+  int mFinestScale;
+  bool mAligned;
+
+ protected:
+  // To prevent compiler warnings.
+  using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
+  using nvinfer1::IPluginV2DynamicExt::configurePlugin;
+  using nvinfer1::IPluginV2DynamicExt::enqueue;
+  using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
+  using nvinfer1::IPluginV2DynamicExt::getWorkspaceSize;
+  using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
+  using nvinfer1::IPluginV2DynamicExt::supportsFormat;
+};
+
+class MultiLevelRoiAlignPluginDynamicCreator : public nvinfer1::IPluginCreator {
+ public:
+  MultiLevelRoiAlignPluginDynamicCreator();
+
+  const char *getPluginName() const override;
+
+  const char *getPluginVersion() const override;
+
+  const nvinfer1::PluginFieldCollection *getFieldNames() override;
+
+  nvinfer1::IPluginV2 *createPlugin(
+      const char *name, const nvinfer1::PluginFieldCollection *fc) override;
+
+  nvinfer1::IPluginV2 *deserializePlugin(const char *name,
+                                         const void *serialData,
+                                         size_t serialLength) override;
+
+  void setPluginNamespace(const char *pluginNamespace) override;
+
+  const char *getPluginNamespace() const override;
+
+ private:
+  nvinfer1::PluginFieldCollection mFC;
+  std::vector<nvinfer1::PluginField> mPluginAttributes;
+  std::string mNamespace;
+};
+
+#endif  // TRT_ROI_ALIGN_HPP
--- a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu
+++ b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu
@ -0,0 +1,212 @@
+#include <stdio.h>
+
+#include <algorithm>
+#include <cmath>
+
+#include "common_cuda_helper.hpp"
+#include "trt_cuda_helper.cuh"
+#include "trt_multi_level_roi_align_kernel.hpp"
+#include "trt_plugin_helper.hpp"
+
+const int kMAX_FEATMAP_SIZE = 10;
+struct FeatData {
+  const void *data[kMAX_FEATMAP_SIZE];
+  int batch_size;
+  int channels;
+  int h[kMAX_FEATMAP_SIZE];
+  int w[kMAX_FEATMAP_SIZE];
+  float spatial_scale[kMAX_FEATMAP_SIZE];
+  int num_featmap;
+};
+
+template <typename scalar_t>
+__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
+                                         const int height, const int width,
+                                         scalar_t y, scalar_t x) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    return 0;
+  }
+
+  if (y <= 0) y = 0;
+  if (x <= 0) x = 0;
+
+  int y_low = (int)y;
+  int x_low = (int)x;
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (scalar_t)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (scalar_t)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  scalar_t ly = y - y_low;
+  scalar_t lx = x - x_low;
+  scalar_t hy = 1. - ly;
+  scalar_t hx = 1. - lx;
+  // do bilinear interpolation
+  scalar_t lt = bottom_data[y_low * width + x_low];
+  scalar_t rt = bottom_data[y_low * width + x_high];
+  scalar_t lb = bottom_data[y_high * width + x_low];
+  scalar_t rb = bottom_data[y_high * width + x_high];
+  scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb);
+
+  return val;
+}
+
+template <typename scalar_t>
+__device__ scalar_t roi_align_single(
+    const scalar_t *bottom_data, const int roi_batch_ind,
+    const scalar_t roi_start_w, const scalar_t roi_start_h,
+    const scalar_t roi_end_w, const scalar_t roi_end_h,
+    const scalar_t spatial_scale, const int pw, const int ph, const int c,
+    const int sample_num, const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const bool aligned) {
+  // Force malformed ROIs to be 1x1
+  scalar_t roi_width = fmaxf((scalar_t)roi_end_w - (scalar_t)roi_start_w, 0.);
+  scalar_t roi_height = fmaxf((scalar_t)roi_end_h - (scalar_t)roi_start_h, 0.);
+  if (!aligned) {
+    roi_width = max(roi_width, (scalar_t)1.);
+    roi_height = max(roi_height, (scalar_t)1.);
+  }
+
+  const scalar_t bin_size_h = roi_height / pooled_height;
+  const scalar_t bin_size_w = roi_width / pooled_width;
+
+  const scalar_t *offset_bottom_data =
+      bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+  int sample_num_h = (sample_num > 0)
+                         ? sample_num
+                         : ceil(roi_height / pooled_height);  // e.g., = 2
+  int sample_num_w =
+      (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
+
+  scalar_t output_val = 0;
+#pragma unroll
+  for (int iy = 0; iy < sample_num_h; iy++) {
+    const scalar_t y =
+        roi_start_h + ph * bin_size_h +
+        (scalar_t)(iy + scalar_t(.5f)) * bin_size_h / (scalar_t)(sample_num_h);
+#pragma unroll
+    for (int ix = 0; ix < sample_num_w; ix++) {
+      const scalar_t x = roi_start_w + pw * bin_size_w +
+                         (scalar_t)(ix + scalar_t(.5f)) * bin_size_w /
+                             (scalar_t)(sample_num_w);
+      scalar_t val = bilinear_interpolate<scalar_t>(offset_bottom_data, height,
+                                                    width, y, x);
+      output_val += val;
+    }
+  }
+  output_val /= max(sample_num_h * sample_num_w, 1);
+
+  return output_val;
+}
+
+template <typename scalar_t>
+__global__ void roi_extractor_kernel(
+    scalar_t *output, const scalar_t *bottom_rois, FeatData feat_data,
+    const int sample_num, const float roi_scale_factor, const int finest_scale,
+    const int pooled_height, const int pooled_width, const bool aligned,
+    int nThreads) {
+  CUDA_1D_KERNEL_LOOP(index, nThreads) {
+    const int channels = feat_data.channels;
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+
+    const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
+
+    scalar_t roi_offset_x0 = offset_bottom_rois[1];
+    scalar_t roi_offset_y0 = offset_bottom_rois[2];
+    scalar_t roi_offset_x1 = offset_bottom_rois[3];
+    scalar_t roi_offset_y1 = offset_bottom_rois[4];
+
+    const scalar_t scale = sqrtf((roi_offset_y1 - roi_offset_y0) *
+                                 (roi_offset_x1 - roi_offset_x0));
+
+    const int target_lvls =
+        fminf(feat_data.num_featmap - 1,
+              fmaxf(0, floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6))));
+
+    if (roi_scale_factor > 0.) {
+      const scalar_t roi_off_cx = (roi_offset_x0 + roi_offset_x1) * 0.5;
+      const scalar_t roi_off_cy = (roi_offset_y0 + roi_offset_y1) * 0.5;
+      const scalar_t roi_off_w =
+          (roi_offset_x1 - roi_offset_x0 + 1) * roi_scale_factor;
+      const scalar_t roi_off_h =
+          (roi_offset_y1 - roi_offset_y0 + 1) * roi_scale_factor;
+
+      roi_offset_x0 = roi_off_cx - roi_off_w * 0.5 + 0.5;
+      roi_offset_x1 = roi_off_cx + roi_off_w * 0.5 - 0.5;
+      roi_offset_y0 = roi_off_cy - roi_off_h * 0.5 + 0.5;
+      roi_offset_y1 = roi_off_cy + roi_off_h * 0.5 - 0.5;
+    }
+
+    const scalar_t spatial_scale =
+        (scalar_t)feat_data.spatial_scale[target_lvls];
+    const int height = feat_data.h[target_lvls];
+    const int width = feat_data.w[target_lvls];
+    const scalar_t *bottom_data = (scalar_t *)feat_data.data[target_lvls];
+
+    const int roi_batch_ind = offset_bottom_rois[0];
+    const scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0;
+    const scalar_t roi_start_w = roi_offset_x0 * spatial_scale - offset;
+    const scalar_t roi_start_h = roi_offset_y0 * spatial_scale - offset;
+    const scalar_t roi_end_w = (roi_offset_x1)*spatial_scale - offset;
+    const scalar_t roi_end_h = (roi_offset_y1)*spatial_scale - offset;
+
+    const scalar_t output_val = roi_align_single<scalar_t>(
+        bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w,
+        roi_end_h, spatial_scale, pw, ph, c, sample_num, channels, height,
+        width, pooled_height, pooled_width, aligned);
+
+    output[index] = output_val;
+  }
+}
+
+template <typename T>
+void multi_level_roi_align(T *output, const T *rois, int num_rois,
+                           const void *const *feats, int num_feats, int n,
+                           int c, int *h, int *w, float *strides,
+                           int aligned_height, int aligned_width,
+                           int sample_num, float roi_scale_factor,
+                           int finest_scale, bool aligned,
+                           cudaStream_t stream) {
+  FeatData feat_data;
+  feat_data.batch_size = n;
+  feat_data.channels = c;
+  feat_data.num_featmap = num_feats;
+  for (int i = 0; i < num_feats; ++i) {
+    feat_data.data[i] = feats[i];
+    feat_data.h[i] = h[i];
+    feat_data.w[i] = w[i];
+    feat_data.spatial_scale[i] = 1. / float(strides[i]);
+  }
+  int nThreads = num_rois * c * aligned_height * aligned_width;
+  // bool aligned = true;
+  roi_extractor_kernel<T>
+      <<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
+          output, rois, feat_data, sample_num, roi_scale_factor, finest_scale,
+          aligned_height, aligned_width, aligned, nThreads);
+}
+
+template void multi_level_roi_align<float>(
+    float *output, const float *rois, int num_rois, const void *const *feats,
+    int num_feats, int n, int c, int *h, int *w, float *strides,
+    int aligned_height, int aligned_width, int sample_num,
+    float roi_scale_factor, int finest_scale, bool aligned,
+    cudaStream_t stream);
--- a/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.hpp
+++ b/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.hpp
@ -0,0 +1,13 @@
+#ifndef TRT_MULTI_LEVEL_ROI_ALIGN_KERNEL_HPP
+#define TRT_MULTI_LEVEL_ROI_ALIGN_KERNEL_HPP
+#include <cuda_runtime.h>
+
+template <typename T>
+void multi_level_roi_align(T *output, const T *rois, int num_rois,
+                           const void *const *feats, int num_feats, int n,
+                           int c, int *h, int *w, float *strides,
+                           int aligned_height, int aligned_width,
+                           int sample_num, float roi_scale_factor,
+                           int finest_scale, bool aligned, cudaStream_t stream);
+
+#endif  // TRT_MULTI_LEVEL_ROI_ALIGN_KERNEL_HPP
--- a/backend_ops/tensorrt/nms/trt_nms.cpp
+++ b/backend_ops/tensorrt/nms/trt_nms.cpp
@ -24,10 +24,6 @@ static const char *PLUGIN_VERSION{"1"};
 static const char *PLUGIN_NAME{"NonMaxSuppression"};
 }  // namespace

-nvinfer1::PluginFieldCollection NonMaxSuppressionDynamicCreator::mFC{};
-std::vector<nvinfer1::PluginField>
-    NonMaxSuppressionDynamicCreator::mPluginAttributes;
-
 NonMaxSuppressionDynamic::NonMaxSuppressionDynamic(
    const std::string &name, int centerPointBox, int maxOutputBoxesPerClass,
    float iouThreshold, float scoreThreshold, int offset)
@ -168,8 +164,10 @@ int NonMaxSuppressionDynamic::initialize() { return 0; }
 void NonMaxSuppressionDynamic::terminate() {}

 size_t NonMaxSuppressionDynamic::getSerializationSize() const {
-  return sizeof(mCenterPointBox) + sizeof(mMaxOutputBoxesPerClass) +
-         sizeof(mIouThreshold) + sizeof(mScoreThreshold) + sizeof(mOffset);
+  return serialized_size(mCenterPointBox) +
+         serialized_size(mMaxOutputBoxesPerClass) +
+         serialized_size(mIouThreshold) + serialized_size(mScoreThreshold) +
+         serialized_size(mOffset);
 }

 void NonMaxSuppressionDynamic::serialize(void *buffer) const {
@ -276,3 +274,5 @@ void NonMaxSuppressionDynamicCreator::setPluginNamespace(
 const char *NonMaxSuppressionDynamicCreator::getPluginNamespace() const {
  return mNamespace.c_str();
 }
+
+REGISTER_TENSORRT_PLUGIN(NonMaxSuppressionDynamicCreator);
--- a/backend_ops/tensorrt/nms/trt_nms.hpp
+++ b/backend_ops/tensorrt/nms/trt_nms.hpp
@ -100,8 +100,8 @@ class NonMaxSuppressionDynamicCreator : public nvinfer1::IPluginCreator {
  const char *getPluginNamespace() const override;

 private:
-  static nvinfer1::PluginFieldCollection mFC;
-  static std::vector<nvinfer1::PluginField> mPluginAttributes;
+  nvinfer1::PluginFieldCollection mFC;
+  std::vector<nvinfer1::PluginField> mPluginAttributes;
  std::string mNamespace;
 };
 #endif  // TRT_NMS_HPP
--- a/backend_ops/tensorrt/nms/trt_nms_kernel.cu
+++ b/backend_ops/tensorrt/nms/trt_nms_kernel.cu
@ -9,8 +9,8 @@
 #include <vector>

 #include "common_cuda_helper.hpp"
-#include "nms_cuda_kernel.cuh"
 #include "trt_cuda_helper.cuh"
+#include "trt_nms_kernel.cuh"
 #include "trt_plugin_helper.hpp"

 struct NMSBox {
@ -72,8 +72,8 @@ __global__ void mask_to_output_kernel(const unsigned long long* dev_mask,
  int start = *output_count;
  int out_per_class_count = 0;
  for (int i = 0; i < spatial_dimension; i++) {
-    const int nblock = i / threadsPerBlock;
-    const int inblock = i % threadsPerBlock;
+    const int nblock = i / THREADS_PER_BLOCK;
+    const int inblock = i % THREADS_PER_BLOCK;
    if (!(remv[nblock] & (1ULL << inblock))) {
      if (threadIdx.x == 0) {
        output[start * 3 + 0] = batch_id;
@ -113,7 +113,7 @@ size_t get_onnxnms_workspace_size(size_t num_batches, size_t spatial_dimension,
  size_t scores_workspace = getAlignedSize(spatial_dimension * boxes_word_size);
  size_t boxes_workspace =
      getAlignedSize(spatial_dimension * 4 * boxes_word_size);
-  const int col_blocks = DIVUP(spatial_dimension, threadsPerBlock);
+  const int col_blocks = DIVUP(spatial_dimension, THREADS_PER_BLOCK);
  size_t mask_workspace = getAlignedSize(spatial_dimension * col_blocks *
                                         sizeof(unsigned long long));
  size_t index_template_workspace =
@ -162,7 +162,7 @@ void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
                                    size_t output_length, void* workspace,
                                    cudaStream_t stream) {
  using mmlab::getAlignedSize;
-  const int col_blocks = DIVUP(spatial_dimension, threadsPerBlock);
+  const int col_blocks = DIVUP(spatial_dimension, THREADS_PER_BLOCK);
  float* boxes_sorted = (float*)workspace;
  workspace = static_cast<char*>(workspace) +
              getAlignedSize(spatial_dimension * 4 * sizeof(float));
@ -214,7 +214,7 @@ void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
  cudaCheckError();

  dim3 blocks(col_blocks, col_blocks);
-  dim3 threads(threadsPerBlock);
+  dim3 threads(THREADS_PER_BLOCK);

  for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
    for (int cls_id = 0; cls_id < num_classes; ++cls_id) {
@ -261,7 +261,7 @@ void TRTNMSCUDAKernelLauncher_float(const float* boxes, const float* scores,
                                               offset, boxes_sorted, dev_mask);

      // will be performed when dev_mask is full.
-      mask_to_output_kernel<<<1, threadsPerBlock,
+      mask_to_output_kernel<<<1, THREADS_PER_BLOCK,
                              col_blocks * sizeof(unsigned long long),
                              stream>>>(
          dev_mask, index_cache, output, output_count, batch_id, cls_id,
--- a/backend_ops/tensorrt/nms/nms_cuda_kernel.cuh
+++ b/backend_ops/tensorrt/nms/nms_cuda_kernel.cuh
--- a/backend_ops/tensorrt/roi_align/trt_roi_align.cpp
+++ b/backend_ops/tensorrt/roi_align/trt_roi_align.cpp
@ -17,10 +17,6 @@ static const char *PLUGIN_VERSION{"1"};
 static const char *PLUGIN_NAME{"MMCVRoiAlign"};
 }  // namespace

-nvinfer1::PluginFieldCollection RoIAlignPluginDynamicCreator::mFC{};
-std::vector<nvinfer1::PluginField>
-    RoIAlignPluginDynamicCreator::mPluginAttributes;
-
 RoIAlignPluginDynamic::RoIAlignPluginDynamic(const std::string &name,
                                             int outWidth, int outHeight,
                                             float spatialScale,
@ -162,8 +158,9 @@ int RoIAlignPluginDynamic::initialize() { return 0; }
 void RoIAlignPluginDynamic::terminate() {}

 size_t RoIAlignPluginDynamic::getSerializationSize() const {
-  return sizeof(mOutWidth) + sizeof(mOutHeight) + sizeof(mSpatialScale) +
-         sizeof(mSampleRatio) + sizeof(mPoolMode) + sizeof(mAligned);
+  return serialized_size(mOutWidth) + serialized_size(mOutHeight) +
+         serialized_size(mSpatialScale) + serialized_size(mSampleRatio) +
+         serialized_size(mPoolMode) + serialized_size(mAligned);
 }

 void RoIAlignPluginDynamic::serialize(void *buffer) const {
@ -291,3 +288,5 @@ void RoIAlignPluginDynamicCreator::setPluginNamespace(
 const char *RoIAlignPluginDynamicCreator::getPluginNamespace() const {
  return mNamespace.c_str();
 }
+
+REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator);
--- a/backend_ops/tensorrt/roi_align/trt_roi_align.hpp
+++ b/backend_ops/tensorrt/roi_align/trt_roi_align.hpp
@ -101,8 +101,8 @@ class RoIAlignPluginDynamicCreator : public nvinfer1::IPluginCreator {
  const char *getPluginNamespace() const override;

 private:
-  static nvinfer1::PluginFieldCollection mFC;
-  static std::vector<nvinfer1::PluginField> mPluginAttributes;
+  nvinfer1::PluginFieldCollection mFC;
+  std::vector<nvinfer1::PluginField> mPluginAttributes;
  std::string mNamespace;
 };
 #endif  // TRT_ROI_ALIGN_HPP
--- a/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu
+++ b/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu
@ -1,5 +1,5 @@
 #include "common_cuda_helper.hpp"
-#include "roi_align_cuda_kernel.cuh"
+#include "trt_roi_align_kernel.cuh"

 template <typename scalar_t>
 void TRTRoIAlignForwardCUDAKernelLauncher(
--- a/backend_ops/tensorrt/roi_align/roi_align_cuda_kernel.cuh
+++ b/backend_ops/tensorrt/roi_align/roi_align_cuda_kernel.cuh
--- a/backend_ops/tensorrt/scatternd/trt_scatternd.cpp
+++ b/backend_ops/tensorrt/scatternd/trt_scatternd.cpp
@ -5,27 +5,14 @@

 #include <chrono>

+#include "trt_scatternd_kernel.hpp"
 #include "trt_serialize.hpp"

-extern void TRTONNXScatterNDKernelLauncher_float(
-    const float *data, const int *indices, const float *update, const int *dims,
-    int nbDims, const int *indices_dims, int indice_nbDims, float *output,
-    cudaStream_t stream);
-
-extern void TRTONNXScatterNDKernelLauncher_int32(
-    const int *data, const int *indices, const int *update, const int *dims,
-    int nbDims, const int *indices_dims, int indice_nbDims, int *output,
-    cudaStream_t stream);
-
 namespace {
 static const char *PLUGIN_VERSION{"1"};
 static const char *PLUGIN_NAME{"ScatterND"};
 }  // namespace

-nvinfer1::PluginFieldCollection ONNXScatterNDDynamicCreator::mFC{};
-std::vector<nvinfer1::PluginField>
-    ONNXScatterNDDynamicCreator::mPluginAttributes;
-
 ONNXScatterNDDynamic::ONNXScatterNDDynamic(const std::string &name)
    : mLayerName(name) {}

@ -110,13 +97,13 @@ int ONNXScatterNDDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,

  switch (data_type) {
    case nvinfer1::DataType::kFLOAT:
-      TRTONNXScatterNDKernelLauncher_float(
+      TRTONNXScatterNDKernelLauncher<float>(
          (float *)data, (int *)indices, (float *)update, dims, nbDims,
          indices_dims, indice_nbDims, (float *)output, stream);
      break;

    case nvinfer1::DataType::kINT32:
-      TRTONNXScatterNDKernelLauncher_int32(
+      TRTONNXScatterNDKernelLauncher<int>(
          (int *)data, (int *)indices, (int *)update, dims, nbDims,
          indices_dims, indice_nbDims, (int *)output, stream);
      break;
@ -204,3 +191,5 @@ void ONNXScatterNDDynamicCreator::setPluginNamespace(const char *libNamespace) {
 const char *ONNXScatterNDDynamicCreator::getPluginNamespace() const {
  return mNamespace.c_str();
 }
+
+REGISTER_TENSORRT_PLUGIN(ONNXScatterNDDynamicCreator);
--- a/backend_ops/tensorrt/scatternd/trt_scatternd.hpp
+++ b/backend_ops/tensorrt/scatternd/trt_scatternd.hpp
@ -91,8 +91,8 @@ class ONNXScatterNDDynamicCreator : public nvinfer1::IPluginCreator {
  const char *getPluginNamespace() const override;

 private:
-  static nvinfer1::PluginFieldCollection mFC;
-  static std::vector<nvinfer1::PluginField> mPluginAttributes;
+  nvinfer1::PluginFieldCollection mFC;
+  std::vector<nvinfer1::PluginField> mPluginAttributes;
  std::string mNamespace;
 };
 #endif  // TRT_SCATTERND_HPP
--- a/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu
+++ b/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu
@ -6,8 +6,6 @@
 #include "trt_cuda_helper.cuh"
 #include "trt_plugin_helper.hpp"

-static int const threadsPerBlock = sizeof(unsigned long long int) * 8;
-
 using mmlab::TensorDesc;

 template <typename T>
@ -66,27 +64,17 @@ void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices,
    num_update_indice *= indice_desc.shape[i];
  }
  // scatter
-  const int col_block = DIVUP(num_update_indice, threadsPerBlock);
-  onnx_scatternd_kernel<<<col_block, threadsPerBlock, 0, stream>>>(
+  const int col_block = DIVUP(num_update_indice, THREADS_PER_BLOCK);
+  onnx_scatternd_kernel<<<col_block, THREADS_PER_BLOCK, 0, stream>>>(
      num_update_indice, indices, update, output, tensor_desc, indice_desc);
 }

-void TRTONNXScatterNDKernelLauncher_float(const float* data, const int* indices,
-                                          const float* update, const int* dims,
-                                          int nbDims, const int* indices_dims,
-                                          int indice_nbDims, float* output,
-                                          cudaStream_t stream) {
-  TRTONNXScatterNDKernelLauncher<float>(data, indices, update, dims, nbDims,
-                                        indices_dims, indice_nbDims, output,
-                                        stream);
-}
+template void TRTONNXScatterNDKernelLauncher<float>(
+    const float* data, const int* indices, const float* update, const int* dims,
+    int nbDims, const int* indices_dims, int indice_nbDims, float* output,
+    cudaStream_t stream);

-void TRTONNXScatterNDKernelLauncher_int32(const int* data, const int* indices,
-                                          const int* update, const int* dims,
-                                          int nbDims, const int* indices_dims,
-                                          int indice_nbDims, int* output,
-                                          cudaStream_t stream) {
-  TRTONNXScatterNDKernelLauncher<int>(data, indices, update, dims, nbDims,
-                                      indices_dims, indice_nbDims, output,
-                                      stream);
-}
+template void TRTONNXScatterNDKernelLauncher<int>(
+    const int* data, const int* indices, const int* update, const int* dims,
+    int nbDims, const int* indices_dims, int indice_nbDims, int* output,
+    cudaStream_t stream);
--- a/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.hpp
+++ b/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.hpp
@ -0,0 +1,12 @@
+#ifndef TRT_SCATTERND_KERNEL_HPP
+#define TRT_SCATTERND_KERNEL_HPP
+#include <cuda_runtime.h>
+
+template <typename T>
+void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices,
+                                    const T* update, const int* dims,
+                                    int nbDims, const int* indices_dims,
+                                    int indice_nbDims, T* output,
+                                    cudaStream_t stream);
+
+#endif  // TRT_SCATTERND_KERNEL_HPP
--- a/backend_ops/tensorrt/trt_plugin.cpp
+++ b/backend_ops/tensorrt/trt_plugin.cpp
@ -1,13 +0,0 @@
-#include "batched_nms/trt_batched_nms.hpp"
-#include "nms/trt_nms.hpp"
-#include "roi_align/trt_roi_align.hpp"
-#include "scatternd/trt_scatternd.hpp"
-
-REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSPluginDynamicCreator);
-REGISTER_TENSORRT_PLUGIN(NonMaxSuppressionDynamicCreator);
-REGISTER_TENSORRT_PLUGIN(ONNXScatterNDDynamicCreator);
-REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator);
-
-extern "C" {
-bool initLibMMCVInferPlugins() { return true; }
-}  // extern "C"
--- a/mmdeploy/mmdet/models/roi_heads/init.py
+++ b/mmdeploy/mmdet/models/roi_heads/init.py
@ -0,0 +1 @@
+from .roi_extractors import *  # noqa: F401, F403
--- a/mmdeploy/mmdet/models/roi_heads/roi_extractors/init.py
+++ b/mmdeploy/mmdet/models/roi_heads/roi_extractors/init.py
@ -0,0 +1 @@
+from .base_roi_extractor import *  # noqa: F401, F403
--- a/mmdeploy/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py
+++ b/mmdeploy/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py
@ -0,0 +1,72 @@
+from torch.autograd import Function
+
+from mmdeploy.utils import FUNCTION_REWRITERS
+
+
+class MultiLevelRoiAlign(Function):
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    @staticmethod
+    def symbolic(g, *args):
+        aligned = args[-1]
+        featmap_strides = args[-2]
+        finest_scale = args[-3]
+        roi_scale_factor = args[-4]
+        sampling_ratio = args[-5]
+        output_size = args[-6]
+        inputs = args[:len(featmap_strides)]
+        rois = args[len(featmap_strides)]
+        return g.op(
+            'mmlab::MMCVMultiLevelRoiAlign',
+            rois,
+            *inputs,
+            output_height_i=output_size[1],
+            output_width_i=output_size[0],
+            sampling_ratio_i=sampling_ratio,
+            roi_scale_factor_f=roi_scale_factor,
+            finest_scale_i=finest_scale,
+            featmap_strides_f=featmap_strides,
+            aligned_i=aligned)
+
+    @staticmethod
+    def forward(g, *args):
+        # aligned = args[-1]
+        featmap_strides = args[-2]
+        # finest_scale = args[-3]
+        # roi_scale_factor = args[-4]
+        # sampling_ratio = args[-5]
+        output_size = args[-6]
+        inputs = args[:len(featmap_strides)]
+        rois = args[len(featmap_strides)]
+
+        num_proposals = rois.shape[0]
+        channel = inputs[0].shape[1]
+
+        return rois.new_zeros(
+            (num_proposals, channel, output_size[1], output_size[0]))
+
+
+@FUNCTION_REWRITERS.register_rewriter(
+    func_name='mmdet.models.roi_heads.SingleRoIExtractor.forward',
+    backend='tensorrt')
+def SingleRoIExtractor_forward_static(rewriter,
+                                      self,
+                                      feats,
+                                      rois,
+                                      roi_scale_factor=None):
+    featmap_strides = self.featmap_strides
+    finest_scale = self.finest_scale
+
+    roi_layer = self.roi_layers[0]
+    out_size = roi_layer.output_size
+    sampling_ratio = roi_layer.sampling_ratio
+    aligned = roi_layer.aligned
+    if roi_scale_factor is None:
+        roi_scale_factor = 1.0
+
+    featmap_strides = [float(s) for s in featmap_strides]
+    return MultiLevelRoiAlign.apply(*feats, rois, out_size, sampling_ratio,
+                                    roi_scale_factor, finest_scale,
+                                    featmap_strides, aligned)
				`@ -0,0 +1 @@`
				`from .roi_extractors import * # noqa: F401, F403`
				`@ -0,0 +1 @@`
				`from .base_roi_extractor import * # noqa: F401, F403`