[Feature] Support NMS with cambricon MLU590 backend (#2401)

* [Feature] Support NMS with Cambricon MLU590 backend * support 590 for NMS * add blank
2022-11-11 16:44:44 +08:00 · 2022-11-11 16:44:44 +08:00 · 652b1bf207
parent 3efe957f40
commit 652b1bf207
3 changed files with 54 additions and 8 deletions
--- a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
@ -234,7 +234,7 @@ __mlu_func__ void nms_detection_ux(
    IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
    const int input_num_boxes, const int max_output_size,
    const float thresh_iou, const float thresh_score, const float offset,
-    const int output_mode, const int algo) {
+    const int output_mode, const int algo, char *cdma_gdram) {
  exit_flag[0] = 0;

  IN_DT *sram = (IN_DT *)sram_buffer;
@ -321,7 +321,25 @@ __mlu_func__ void nms_detection_ux(
      __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
    }
    __sync_all();
-#if __BANG_ARCH__ <= 372
+#if __BANG_ARCH__ >= 590
+    __memcpy((char *)cdma_gdram + REDUCE_NUM * clusterId * sizeof(IN_DT), sram,
+             REDUCE_NUM * sizeof(IN_DT), SRAM2GDRAM);
+    __sync_all();
+    if (clusterId == 0 && coreId == 0) {
+      __bang_write_zero(inter_x1, NMS_SIZE);
+      __memcpy((char *)inter_x1, (char *)cdma_gdram, sizeof(IN_DT), GDRAM2NRAM,
+               sizeof(IN_DT), REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
+      __bang_max(max_box, inter_x1, NMS_SIZE);
+      int max_cluster = (sizeof(IN_DT) == sizeof(half))
+                            ? ((uint16_t *)max_box)[1]
+                            : ((uint32_t *)max_box)[1];
+      __memcpy((char *)cdma_gdram,
+               (char *)cdma_gdram + max_cluster * REDUCE_NUM * sizeof(IN_DT),
+               REDUCE_NUM * sizeof(IN_DT), GDRAM2GDRAM);
+    }
+    __sync_all();
+    __memcpy(max_box, cdma_gdram, REDUCE_NUM * sizeof(IN_DT), GDRAM2NRAM);
+#else
    findGlobalMaxBox(max_box, sram, inter_x1);
 #endif

@ -380,6 +398,7 @@ __mlu_global__ void MLUUionXKernelNMS(
  int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
  int32_t *exit_flag = (int32_t *)((char *)workspace +
                                   INFO_NUM * input_num_boxes * input_dwidth);
+  char *cdma_addr = (char *)exit_flag + sizeof(int32_t);
  int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
  int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size;

@ -409,24 +428,26 @@ __mlu_global__ void MLUUionXKernelNMS(
      nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
                       score_data, boxes_data, input_ram, input_num_boxes,
                       max_output_size, iou_threshold, confidence_threshold,
-                       offset, output_mode, algo);
+                       offset, output_mode, algo, cdma_addr);
    } else {
      nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
                       (half *)score_data, (half *)boxes_data, input_ram,
                       input_num_boxes, max_output_size, iou_threshold,
-                       confidence_threshold, offset, output_mode, algo);
+                       confidence_threshold, offset, output_mode, algo,
+                       cdma_addr);
    }
  } else {
    if (data_type_input == CNRT_FLOAT32) {
      nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data,
                       boxes_data, input_ram, input_num_boxes, max_output_size,
                       iou_threshold, confidence_threshold, offset, output_mode,
-                       algo);
+                       algo, cdma_addr);
    } else {
      nms_detection_ux(exit_flag, output_box_num, (half *)output,
                       (half *)score_data, (half *)boxes_data, input_ram,
                       input_num_boxes, max_output_size, iou_threshold,
-                       confidence_threshold, offset, output_mode, algo);
+                       confidence_threshold, offset, output_mode, algo,
+                       cdma_addr);
    }
  }
  ((uint32_t *)result_num)[0] = output_box_num;
--- a/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp
+++ b/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp
@ -36,6 +36,26 @@ inline int32_t getJobLimitCapability() {
  return (int32_t)ctx_conf_param.unionLimit;
 }

+inline int32_t getCoreNumOfJobLimitCapability() {  // Translates the value returned by getJobLimitCapability() into a core count.
+  switch (getJobLimitCapability()) {  // every label below returns, so there is no fall-through between cases
+    default:  // any value not matched by a case below: cnrtAttrMcorePerCluster attribute times the raw job-limit value
+      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) *
+             getJobLimitCapability();
+    case CN_KERNEL_CLASS_BLOCK:  // BLOCK class: always 1
+      return 1;
+    case CN_KERNEL_CLASS_UNION:  // UNION class: the cnrtAttrMcorePerCluster device attribute unchanged
+      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
+    case CN_KERNEL_CLASS_UNION2:  // UNION2 class: that attribute times 2
+      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 2;
+    case CN_KERNEL_CLASS_UNION4:  // UNION4 class: that attribute times 4
+      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 4;
+    case CN_KERNEL_CLASS_UNION8:  // UNION8 class: that attribute times 8
+      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 8;
+    case CN_KERNEL_CLASS_UNION16:  // UNION16 class: that attribute times 16
+      return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 16;
+  }
+}
+
 #endif  // MMCV_WITH_MLU

 #endif  // PYTORCH_MLU_HELPER_HPP_
--- a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp
@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (C) 2021 by Cambricon.
+ * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
@ -34,6 +34,7 @@ static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
                               int &core_num_per_class,
                               const int input_box_num) {
  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
+  uint32_t cluster_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  uint32_t job_limit = getJobLimitCapability();
  uint32_t core_number = job_limit;

@ -116,7 +117,11 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
  } else {
    space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float);
  }
-
+#if __BANG_ARCH__ > 370
+  int cluster_num = getCoreNumOfJobLimitCapability() /
+                    torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
+  space_size += cluster_number * sizeof(float) * 7;
+#endif
  auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));

  // get compute queue