[Improve] Improve the performance of roi_align, nms and focal loss with MLU backend (#1572)

* [Improve] Improve the performance of roi_align with MLU backend

* replace CHECK_MLU with CHECK_MLU_INPUT

* [Improve] Improve the performance of nms and sigmoid focal loss with MLU backend
Authored by zhouchenyang on 2021-12-21 21:29:10 +08:00; committed by GitHub
parent e1c73552a4
commit 9d007f951f
7 changed files with 1822 additions and 600 deletions


@ -24,8 +24,21 @@ __mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size,
const int32_t dst_stride = 0,
const int32_t src_stride = 0,
const int32_t count = 1) {
__memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride,
src_stride, count - 1);
if (dst_stride == src_stride) {
__memcpy_async(nram_input, dram_input, size * count, GDRAM2NRAM);
} else {
__memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride,
src_stride, count - 1);
}
}
template <typename T>
__mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t,
const int32_t c, const int32_t has_weight,
const int32_t partition_nc) {
if (has_weight && partition_nc && t >= 0 && t < c) {
__memcpy_async(nram_input, (T *)dram_input + t, sizeof(T), GDRAM2NRAM);
}
}
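The new branch above is a fast path: when the destination and source strides match, which at these call sites means the rows are already packed back to back (or count is 1), the segmented copy collapses into one contiguous transfer of size * count bytes and saves per-segment DMA overhead. A minimal host-side sketch of that equivalence in plain C++ (strided_copy is a hypothetical stand-in for __memcpy_async, with the same assumed stride/count semantics):

#include <cassert>
#include <cstring>
#include <vector>

// Reference semantics: copy `count` rows of `size` bytes, with rows laid out
// `dst_stride` / `src_stride` bytes apart on each side.
void strided_copy(char *dst, const char *src, int size, int dst_stride,
                  int src_stride, int count) {
  if (dst_stride == src_stride && dst_stride == size) {
    // rows are contiguous on both sides: one big copy. The kernel's fast path
    // only tests stride equality, since its equal-stride call sites are packed.
    std::memcpy(dst, src, static_cast<size_t>(size) * count);
    return;
  }
  for (int i = 0; i < count; ++i)  // general strided path
    std::memcpy(dst + i * dst_stride, src + i * src_stride, size);
}

int main() {
  std::vector<char> src(64), fast(64), slow(64);
  for (int i = 0; i < 64; ++i) src[i] = static_cast<char>(i);
  strided_copy(fast.data(), src.data(), 8, 8, 8, 8);  // fast path
  for (int i = 0; i < 8; ++i)
    std::memcpy(slow.data() + i * 8, src.data() + i * 8, 8);
  assert(fast == slow);  // both paths move the same bytes
  return 0;
}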
template <typename T>
@ -33,152 +46,117 @@ __mlu_func__ void storeOutput(T *dram_output, char *nram_output,
const int32_t size, const int32_t dst_stride = 0,
const int32_t src_stride = 0,
const int32_t count = 1) {
__memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride,
src_stride, count - 1);
if (dst_stride == src_stride) {
__memcpy_async(dram_output, nram_output, size * count, NRAM2GDRAM);
} else {
__memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride,
src_stride, count - 1);
}
}
template <typename T>
__mlu_func__ void compute(T *input, const int32_t *target, const T *weight,
const int32_t has_weight, const int32_t deal_num,
const int32_t n_seg, const int32_t C, float alpha,
float gamma, T *scalar_temp, T *tensor_max,
T *tensor_temp, T *output) {
const int32_t scalar_elem_num = NFU_ALIGN_SIZE / sizeof(T);
const int32_t has_weight, const int32_t partition_nc,
const int32_t deal_num, const int32_t n_seg,
const int32_t c, const int32_t c_seg,
const int32_t c_start_index, const float alpha,
const float gamma, T *compute_a, T *compute_b,
T *output) {
// set params
const int32_t c_num =
has_weight ? PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)) : c_seg;
const int32_t c_end_index = c_start_index + c_seg;
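// Note: 0x0400 reinterpreted as half is the smallest positive normal value
// (2^-14, about 6.1e-5), i.e. the half counterpart of FLT_MIN used below.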
const int32_t half_epsilon = 0x0400;
const T epsilon_f =
sizeof(T) == sizeof(float) ? FLT_MIN : *((half *)&half_epsilon);
// 0. n_max = max(0, x)
__nramset((T *)tensor_max, deal_num, (T)0);
__bang_cycle_maxequal((T *)tensor_max, (T *)tensor_max, (T *)input, deal_num,
deal_num);
// 1. ln(1+e^x) = ln(e^(-max) + e^(x-max)) + max
__nramset((T *)scalar_temp, scalar_elem_num, (T)-1);
__bang_cycle_mul((T *)tensor_temp, (T *)tensor_max, (T *)scalar_temp,
deal_num, scalar_elem_num);
__bang_cycle_add((T *)output, (T *)input, (T *)tensor_temp, deal_num,
deal_num);
__bang_active_exphp((T *)output, (T *)output, deal_num);
__bang_active_exphp((T *)tensor_temp, (T *)tensor_temp, deal_num);
__bang_cycle_add((T *)output, (T *)output, (T *)tensor_temp, deal_num,
deal_num);
__bang_active_loghp((T *)output, (T *)output, deal_num);
__bang_cycle_add((T *)output, (T *)output, (T *)tensor_max, deal_num,
deal_num);
// 2. temp = [1 + e^(-x)] ^ (-r)
__nramset((T *)scalar_temp, scalar_elem_num, (T)-1);
__bang_cycle_mul((T *)tensor_temp, (T *)input, (T *)scalar_temp, deal_num,
scalar_elem_num);
__bang_active_exphp((T *)tensor_temp, (T *)tensor_temp, deal_num);
__nramset((T *)scalar_temp, scalar_elem_num, (T)1);
__bang_cycle_add((T *)tensor_temp, (T *)tensor_temp, (T *)scalar_temp,
deal_num, scalar_elem_num);
__bang_active_loghp((T *)tensor_temp, (T *)tensor_temp, deal_num);
__nramset((T *)scalar_temp, scalar_elem_num, (T)(-gamma));
__bang_cycle_mul((T *)tensor_temp, (T *)tensor_temp, (T *)scalar_temp,
deal_num, scalar_elem_num);
__bang_active_exphp((T *)tensor_temp, (T *)tensor_temp, deal_num);
// 3.1 output: target != j
__nramset((T *)scalar_temp, scalar_elem_num, (T)(1 - alpha));
__bang_cycle_mul((T *)output, (T *)output, (T *)scalar_temp, deal_num,
scalar_elem_num);
__bang_cycle_mul((T *)output, (T *)output, (T *)tensor_temp, deal_num,
deal_num);
// 3.2 output: target == j
const int32_t c_align_size = PAD_UP((sizeof(T) * C), NFU_ALIGN_SIZE);
// 0. alpha_t * p_t^r = alpha * (1 - p) ^ gamma if t == c_i
// = (1 - alpha) * p ^ gamma if t != c_i
__nramset((T *)output, deal_num, (T)(1 - alpha));
__bang_active_sigmoid((T *)compute_b, (T *)input, deal_num);
for (int32_t i = 0; i < n_seg; ++i) {
const int32_t target_value = *((int32_t *)target + i);
if (target_value >= 0 && target_value < C) {
const int32_t offset = i * c_align_size + target_value * sizeof(T);
char *addr_input = (char *)input + offset;
char *addr_output = (char *)output + offset;
const float x = *(T *)addr_input;
const float p = 1. / (1. + exp(-x));
*(T *)addr_output = -alpha * pow(1. - p, gamma) * log(fmax(p, FLT_MIN));
const int32_t t = *((uint32_t *)target + i);
if (t >= c_start_index && t < c_end_index) {
const uint32_t index = i * c_num + t - c_start_index;
*((T *)input + index) = -1.0 * (*((T *)input + index));
*((T *)compute_b + index) = 1.0 - (*((T *)compute_b + index)) + epsilon_f;
*((T *)output + index) = alpha;
}
}
if (sizeof(T) == sizeof(half)) {
__bang_half2float((float *)compute_a, (half *)compute_b, deal_num);
__bang_active_loghp((float *)compute_a, (float *)compute_a, deal_num);
__bang_mul_const((float *)compute_a, (float *)compute_a, (float)gamma,
deal_num);
__bang_active_exphp((float *)compute_a, (float *)compute_a, deal_num);
__bang_float2half_rd((half *)compute_a, (float *)compute_a, deal_num);
} else {
__bang_active_loghp((T *)compute_a, (T *)compute_b, deal_num);
__bang_mul_const((T *)compute_a, (T *)compute_a, (T)gamma, deal_num);
__bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num);
}
__bang_mul((T *)output, (T *)compute_a, (T *)output, deal_num);
// with weight
if (has_weight > 0) {
int32_t row_num_elem = deal_num / n_seg;
// 1. max = max(0, -x) if t == c_i
// = max(0, x) if t != c_i
__nramset((T *)compute_b, deal_num, (T)0);
__bang_maxequal((T *)compute_b, (T *)compute_b, (T *)input, deal_num);
// 2. -log(p_t) = ln(e^(-max)+ e^(-max-x) + max if t == c_i
// = ln(e^(-max)+ e^(-max+x) + max if t != c_i
__bang_mul_const((T *)compute_a, (T *)compute_b, (T)-1.0, deal_num);
__bang_add((T *)input, (T *)compute_a, (T *)input, deal_num);
__bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num);
__bang_active_exphp((T *)input, (T *)input, deal_num);
__bang_add((T *)compute_a, (T *)compute_a, (T *)input, deal_num);
__bang_active_loghp((T *)compute_a, (T *)compute_a, deal_num);
__bang_add((T *)input, (T *)compute_a, (T *)compute_b, deal_num);
// 3. output = alpha_t * p_t^r * [-log(p_t)]
__bang_mul((T *)output, (T *)output, (T *)input, deal_num);
// 4. with weight
if (has_weight) {
for (int32_t i = 0; i < n_seg; ++i) {
const int32_t t = *((int32_t *)target + i);
__nramset((T *)scalar_temp, scalar_elem_num, *((T *)weight + t));
__bang_cycle_mul((T *)output + i * row_num_elem,
(T *)output + i * row_num_elem, (T *)scalar_temp,
row_num_elem, scalar_elem_num);
int32_t t = *((int32_t *)target + i);
if (t >= 0 && t < c) {
t = partition_nc ? 0 : t;
__bang_mul_const((T *)output + i * c_num, (T *)output + i * c_num,
*((T *)weight + t), c_num);
}
}
}
}
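For reference, the quantity the rewritten compute() produces per element is the standard sigmoid focal loss FL = -alpha_t * (1 - p_t)^gamma * log(p_t), where p_t is sigmoid(x) for the target class and 1 - sigmoid(x) otherwise, and -log(p_t) is evaluated in the max-shifted form ln(e^-max + e^(x - max)) + max for numerical stability. A scalar host-side sketch in plain C++ (focal_loss_ref and its arguments are illustrative, not part of the kernel):

#include <algorithm>
#include <cmath>

// Reference sigmoid focal loss for one logit x of class j with target label t.
float focal_loss_ref(float x, int j, int t, float alpha, float gamma) {
  const bool is_target = (j == t);
  const float alpha_t = is_target ? alpha : 1.f - alpha;
  const float z = is_target ? -x : x;           // the kernel negates x when t == c_i
  const float p_t = 1.f / (1.f + std::exp(z));  // sigmoid(x) if target, else 1 - sigmoid(x)
  const float m = std::max(0.f, z);             // max(0, -x) or max(0, x), as in step 1
  const float neg_log_pt = std::log(std::exp(-m) + std::exp(z - m)) + m;  // -log(p_t), step 2
  // the kernel evaluates (1 - p_t)^gamma on-chip as exp(gamma * log(1 - p_t + eps))
  return alpha_t * std::pow(1.f - p_t, gamma) * neg_log_pt;
}

For example, focal_loss_ref(2.0f, 1, 1, 0.25f, 2.0f) is roughly 0.25 * 0.119^2 * 0.127, about 4.5e-4.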
template <typename T>
__mlu_func__ void focalLossSigmoidForwardBlock(
__mlu_func__ void startPipeline(
const T *input, const int32_t *target, const T *weight,
const int32_t row_num, const int32_t C, const float alpha,
const float gamma, T *output) {
/*
* NRAM partition
* |-----------------------------------------------------------------------|
* | scalar |
* |-----------------------------------------------------------------------|
* | weight |
* |------------------------------- COMPUTE -------------------------------|
* | | |
* | computeA | computeB |
* | | |
* |------------- PING ------------------------------- PONG ---------------|
* | | |
* | input | input |
* | | |
* |-----------------------------------|-----------------------------------|
* | | |
* | output | output |
* | | |
* |-----------------------------------|-----------------------------------|
* | target | target |
* |-----------------------------------|-----------------------------------|
*
* split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output),
* PONG(input,output).
* split_target_num is 2: PING(target), PONG(target).
*/
const int32_t c_align = PAD_UP(C, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_align_size = c_align * sizeof(T);
const int32_t scalar_size = NFU_ALIGN_SIZE;
const int32_t weight_size = (weight != NULL) * c_align_size;
const int32_t split_pipeline_num = 6;
const int32_t split_target_num = 2;
char *nram_compute_a, char *nram_compute_b, char *nram_input,
char *nram_target, char *nram_weight, char *nram_output,
const int32_t has_weight, const int32_t partition_nc,
const int32_t pingpong_offset, const int32_t pingpong_weight_offset,
const int32_t c_offset_num, const int32_t n, const int32_t n_seg,
const int32_t c, const int32_t c_seg, const float alpha, const float gamma,
T *output) {
// with offset
input = (T *)((char *)input + c_offset_num * sizeof(T));
output = (T *)((char *)output + c_offset_num * sizeof(T));
const int32_t remain_size = MAX_NRAM_SIZE - scalar_size - weight_size;
const int32_t n_seg = remain_size / (split_pipeline_num * c_align_size +
split_target_num * sizeof(int32_t));
const int32_t deal_num = n_seg * c_align_size / sizeof(T);
const int32_t target_size = n_seg * sizeof(int32_t);
const int32_t c_seg_align_num = PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_num = has_weight ? c_seg_align_num : c_seg;
const int32_t deal_num = PAD_UP(n_seg * c_num, NFU_ALIGN_SIZE / sizeof(T));
const int32_t load_size = c_seg * sizeof(T);
const int32_t dram_stride = c * sizeof(T);
const int32_t nram_stride = c_num * sizeof(T);
// nram scalar,weight
char *nram_scalar = (char *)nram_buffer;
char *nram_weight = (char *)nram_scalar + scalar_size;
if (weight_size > 0) {
loadInput<T>(nram_weight, (T *)weight, C * sizeof(T));
__asm__ volatile("sync;");
if (has_weight && !partition_nc) {
loadInput<T>(nram_weight, (T *)weight, load_size, nram_stride, dram_stride,
1);
__asm__ volatile("sync;\n\t");
}
// nram COMPUTE
const int32_t compute_size = 2 * c_align_size * n_seg;
char *nram_compute_a = (char *)nram_weight + weight_size;
char *nram_compute_b = (char *)nram_compute_a + c_align_size * n_seg;
// nram PING/PONG
const int32_t pingpong_offset = (remain_size - compute_size) / 2;
char *nram_input = (char *)nram_compute_a + 2 * c_align_size * n_seg;
char *nram_output = (char *)nram_compute_a + 3 * c_align_size * n_seg;
char *nram_target = (char *)nram_compute_a + 4 * c_align_size * n_seg;
const int32_t repeat = row_num / n_seg;
const int32_t remain = row_num % n_seg;
const int32_t repeat = n / n_seg;
const int32_t remain = n % n_seg;
/*
* Pipeline: The pipeline is processed in three stages: Load, Compute, Store.
@ -206,80 +184,214 @@ __mlu_func__ void focalLossSigmoidForwardBlock(
// diagram of PINGPONG: L0
if (repeat > 0) {
loadInput<T>(nram_input, (T *)input, C * sizeof(T), c_align * sizeof(T),
C * sizeof(T), n_seg);
loadInput<int32_t>(nram_target, (int32_t *)target, target_size);
__asm__ volatile("sync;");
loadInput<T>(nram_input, (T *)input, load_size, nram_stride, dram_stride,
n_seg);
loadInput<int32_t>(nram_target, (int32_t *)target, n_seg * sizeof(int32_t));
loadWeight<T>(nram_weight, (T *)weight, *((int32_t *)target), c, has_weight,
partition_nc);
__asm__ volatile("sync;\n\t");
}
// diagram of PINGPONG: C0 and L1
if (repeat > 1) {
loadInput<T>(nram_input + pingpong_offset, (T *)input + C * n_seg,
C * sizeof(T), c_align * sizeof(T), C * sizeof(T), n_seg);
loadInput<int32_t>(nram_target + pingpong_offset, (int32_t *)target + n_seg,
target_size);
compute((T *)nram_input, (int32_t *)nram_target, (T *)nram_weight,
weight_size, deal_num, n_seg, C, alpha, gamma, (T *)nram_scalar,
(T *)nram_compute_a, (T *)nram_compute_b, (T *)nram_output);
__asm__ volatile("sync;");
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)nram_output);
loadInput<T>((char *)nram_input + pingpong_offset, (T *)input + c * n_seg,
load_size, nram_stride, dram_stride, n_seg);
loadInput<int32_t>((char *)nram_target + pingpong_offset,
(int32_t *)target + n_seg, n_seg * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + pingpong_weight_offset, (T *)weight,
*((int32_t *)target + n_seg), c, has_weight, partition_nc);
__asm__ volatile("sync;\n\t");
}
for (int32_t i = 0; i < repeat - 2; ++i) {
storeOutput<T>((T *)output + i * C * n_seg,
nram_output + (i % 2) * pingpong_offset, C * sizeof(T),
C * sizeof(T), c_align * sizeof(T), n_seg);
loadInput<T>(nram_input + (i % 2) * pingpong_offset,
(T *)input + (i + 2) * C * n_seg, C * sizeof(T),
c_align * sizeof(T), C * sizeof(T), n_seg);
loadInput<int32_t>(nram_target + (i % 2) * pingpong_offset,
(int32_t *)target + (i + 2) * n_seg, target_size);
storeOutput<T>((T *)output + i * c * n_seg,
nram_output + (i % 2) * pingpong_offset, load_size,
dram_stride, nram_stride, n_seg);
loadInput<T>((char *)nram_input + (i % 2) * pingpong_offset,
(T *)(input) + (i + 2) * c * n_seg, load_size, nram_stride,
dram_stride, n_seg);
loadInput<int32_t>((char *)nram_target + (i % 2) * pingpong_offset,
(int32_t *)target + (i + 2) * n_seg,
n_seg * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + (i % 2) * pingpong_weight_offset,
(T *)weight, *((int32_t *)target + (i + 2) * n_seg), c,
has_weight, partition_nc);
compute((T *)(nram_input + ((i + 1) % 2) * pingpong_offset),
(int32_t *)(nram_target + ((i + 1) % 2) * pingpong_offset),
(T *)nram_weight, weight_size, deal_num, n_seg, C, alpha, gamma,
(T *)nram_scalar, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_weight +
partition_nc * ((i + 1) % 2) * pingpong_weight_offset),
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + ((i + 1) % 2) * pingpong_offset));
__asm__ volatile("sync;");
__asm__ volatile("sync;\n\t");
}
if (repeat > 1) {
storeOutput<T>((T *)output + (repeat - 2) * C * n_seg,
nram_output + (repeat % 2) * pingpong_offset, C * sizeof(T),
C * sizeof(T), c_align * sizeof(T), n_seg);
storeOutput<T>((T *)output + (repeat - 2) * c * n_seg,
(char *)nram_output + (repeat % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, n_seg);
}
if (remain > 0) {
loadInput<T>(nram_input + (repeat % 2) * pingpong_offset,
(T *)input + repeat * C * n_seg, C * sizeof(T),
c_align * sizeof(T), C * sizeof(T), remain);
loadInput<int32_t>(nram_target + (repeat % 2) * pingpong_offset,
loadInput<T>((char *)nram_input + (repeat % 2) * pingpong_offset,
(T *)input + repeat * c * n_seg, load_size, nram_stride,
dram_stride, remain);
loadInput<int32_t>((char *)nram_target + (repeat % 2) * pingpong_offset,
(int32_t *)target + repeat * n_seg,
remain * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + (repeat % 2) * pingpong_weight_offset,
(T *)weight, *((int32_t *)target + repeat * n_seg), c,
has_weight, partition_nc);
}
if (repeat > 0) {
compute((T *)(nram_input + ((repeat - 1) % 2) * pingpong_offset),
(int32_t *)(nram_target + ((repeat - 1) % 2) * pingpong_offset),
(T *)nram_weight, weight_size, deal_num, n_seg, C, alpha, gamma,
(T *)nram_scalar, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_weight +
partition_nc * ((repeat - 1) % 2) * pingpong_weight_offset),
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + ((repeat - 1) % 2) * pingpong_offset));
}
__asm__ volatile("sync;");
__asm__ volatile("sync;\n\t");
if (repeat > 0) {
storeOutput<T>((T *)output + (repeat - 1) * C * n_seg,
nram_output + ((repeat - 1) % 2) * pingpong_offset,
C * sizeof(T), C * sizeof(T), c_align * sizeof(T), n_seg);
storeOutput<T>((T *)output + (repeat - 1) * c * n_seg,
(char *)nram_output + ((repeat - 1) % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, n_seg);
}
if (remain > 0) {
int rem_deal_num = remain * c_align_size / sizeof(T);
int32_t rem_num = PAD_UP(remain * c_num, NFU_ALIGN_SIZE / sizeof(T));
compute((T *)(nram_input + (repeat % 2) * pingpong_offset),
(int32_t *)(nram_target + (repeat % 2) * pingpong_offset),
(T *)nram_weight, weight_size, rem_deal_num, remain, C, alpha,
gamma, (T *)nram_scalar, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_weight +
partition_nc * (repeat % 2) * pingpong_weight_offset),
has_weight, partition_nc, rem_num, remain, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + (repeat % 2) * pingpong_offset));
__asm__ volatile("sync;");
__asm__ volatile("sync;\n\t");
storeOutput<T>((T *)output + repeat * C * n_seg,
nram_output + (repeat % 2) * pingpong_offset, C * sizeof(T),
C * sizeof(T), c_align * sizeof(T), remain);
storeOutput<T>((T *)output + repeat * c * n_seg,
(char *)nram_output + (repeat % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, remain);
}
__asm__ volatile("sync;\n\t");
}
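startPipeline is a three-stage software pipeline: the load of chunk i+1 into one ping-pong buffer overlaps with compute on chunk i in the other buffer and with the store of chunk i-1, with sync barriers between steps. A host-side sketch of the schedule in plain C++ (buffer indices only, no device calls), mirroring the warm-up, steady-state and drain structure above:

#include <cstdio>

// Prints the ping-pong schedule for `repeat` full chunks plus an optional
// remainder; a chunk's buffer is chunk % 2, as in the kernel indexing.
void print_schedule(int repeat, bool remain) {
  if (repeat > 0) std::printf("warm-up: load chunk 0 -> buf 0\n");
  if (repeat > 1) std::printf("warm-up: compute chunk 0 | load chunk 1 -> buf 1\n");
  for (int i = 0; i < repeat - 2; ++i)
    std::printf("steady : store chunk %d | load chunk %d -> buf %d | compute chunk %d\n",
                i, i + 2, (i + 2) % 2, i + 1);
  if (repeat > 1) std::printf("drain  : store chunk %d\n", repeat - 2);
  if (remain)     std::printf("drain  : load remainder -> buf %d\n", repeat % 2);
  if (repeat > 0) std::printf("drain  : compute chunk %d, then store it\n", repeat - 1);
  if (remain)     std::printf("drain  : compute remainder, then store it\n");
}

int main() { print_schedule(/*repeat=*/4, /*remain=*/true); }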
template <typename T>
__mlu_func__ void focalLossSigmoidForwardBlock(
const T *input, const int32_t *target, const T *weight, const int32_t n,
const int32_t c, const float alpha, const float gamma, T *output) {
/*
* NRAM partition
* |-----------------------------------------------------------------------|
* | weight |
* |------------------------------- COMPUTE -------------------------------|
* | | |
* | computeA | computeB |
* | | |
* |------------- PING ------------------------------- PONG ---------------|
* | | |
* | input | input |
* | | |
* |-----------------------------------|-----------------------------------|
* | | |
* | output | output |
* | | |
* |-----------------------------------|-----------------------------------|
* | target | target |
* |-----------------------------------|-----------------------------------|
*
* split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output),
* PONG(input,output).
* split_target_num is 2: PING(target), PONG(target).
* when weight is not NULL:
* the NRAM size of weight equals c_align_size when partitioning along N,
* and NFU_ALIGN_SIZE when partitioning along both N and C.
// calculate threshold of c
const int32_t split_pipeline_num = 6;
const int32_t split_target_num = 2;
const int32_t has_weight = weight != NULL;
const int32_t threshold_c =
PAD_DOWN((MAX_NRAM_SIZE - split_target_num * sizeof(int32_t)) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
sizeof(T);
const int32_t c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_align_size = c_align * sizeof(T);
if (c <= threshold_c) {
// partition input along N
int32_t c_num = c;
int32_t reservered_align_size =
(split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE;
int32_t weight_size = 0;
if (has_weight) {
c_num = c_align;
reservered_align_size = split_target_num * NFU_ALIGN_SIZE;
weight_size = c_align_size;
}
const int32_t remain_size =
MAX_NRAM_SIZE - weight_size - reservered_align_size;
const int32_t n_seg =
remain_size / (split_pipeline_num * c_num * sizeof(T) +
split_target_num * sizeof(int32_t));
const int32_t split_pipeline_size =
PAD_UP(c_num * n_seg * sizeof(T), NFU_ALIGN_SIZE);
const int32_t compute_size = 2 * split_pipeline_size;
const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2;
char *nram_weight = (char *)nram_buffer;
char *nram_compute_a = nram_weight + has_weight * c_align_size;
char *nram_compute_b = nram_compute_a + split_pipeline_size;
char *nram_input = nram_compute_b + split_pipeline_size;
char *nram_output = nram_input + split_pipeline_size;
char *nram_target = nram_output + split_pipeline_size;
startPipeline<T>(input, target, weight, nram_compute_a, nram_compute_b,
nram_input, nram_target, nram_weight, nram_output,
has_weight, 0, pingpong_offset, 0, 0, n, n_seg, c, c,
alpha, gamma, output);
} else {
// partition input along N and C
const int32_t weight_size = has_weight * NFU_ALIGN_SIZE;
const int32_t remain_size = MAX_NRAM_SIZE - weight_size;
const int32_t split_pipeline_size = PAD_DOWN(
(remain_size - split_target_num * NFU_ALIGN_SIZE) / split_pipeline_num,
NFU_ALIGN_SIZE);
const int32_t c_seg = split_pipeline_size / sizeof(T);
const int32_t n_seg = 1;
const int32_t compute_size = 2 * split_pipeline_size;
const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2;
const int32_t pingpong_weight_offset = weight_size / 2;
char *nram_weight = (char *)nram_buffer;
char *nram_compute_a = nram_weight + weight_size;
char *nram_compute_b = nram_compute_a + split_pipeline_size;
char *nram_input = nram_compute_b + split_pipeline_size;
char *nram_output = nram_input + split_pipeline_size;
char *nram_target = nram_output + split_pipeline_size;
const int32_t loop_num = (c + c_seg - 1) / c_seg;
const int32_t partition_nc = 1;
for (int32_t i = 0; i < loop_num; ++i) {
const int32_t c_index = i * c_seg;
const int32_t c_seg_curr = i == (loop_num - 1) ? c - c_index : c_seg;
startPipeline<T>(input, target, weight, nram_compute_a, nram_compute_b,
nram_input, nram_target, nram_weight, nram_output,
has_weight, partition_nc, pingpong_offset,
pingpong_weight_offset, c_index, n, n_seg, c, c_seg_curr,
alpha, gamma, output);
}
}
}
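The new entry point chooses between two NRAM partitions: it first derives threshold_c, the largest class count whose six pipeline buffers (plus an optional weight row) still fit in NRAM; below the threshold whole rows are batched along N, above it n_seg is fixed to 1 and each row is processed in channel segments, reloading only the single needed weight value per row. A host-side sketch of that decision in plain C++ (the NRAM and alignment constants are placeholders, not the real device values):

#include <cstdio>

constexpr int kNramBytes = 384 * 1024;  // placeholder for MAX_NRAM_SIZE
constexpr int kAlign = 128;             // placeholder for NFU_ALIGN_SIZE
constexpr int kPipelineBufs = 6;        // computeA/B + ping/pong input/output
constexpr int kTargetBufs = 2;          // ping/pong target

int pad_down(int x, int a) { return x / a * a; }

// Mirrors the threshold_c formula: the largest C for which one padded row per
// pipeline buffer (plus a weight row, if any) still fits in on-chip memory.
void choose_partition(int c, int elem_bytes, bool has_weight) {
  const int threshold_c =
      pad_down((kNramBytes - kTargetBufs * (int)sizeof(int)) /
                   (kPipelineBufs + (has_weight ? 1 : 0)),
               kAlign) /
      elem_bytes;
  if (c <= threshold_c)
    std::printf("C = %d: partition along N, whole rows per pass\n", c);
  else
    std::printf("C = %d: partition along N and C, one row in channel segments\n", c);
}

int main() {
  choose_partition(80, sizeof(float), true);      // typical detection head
  choose_partition(200000, sizeof(float), true);  // very large C
}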


@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (C) 2021 by Cambricon.
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
@ -15,6 +15,8 @@
#define COORD_DIM (4)
#define MEMORY_CORE (0x80)
#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
@ -551,7 +553,7 @@ __mlu_func__ void nms_detection(
} // for keepNum
}
__mlu_global__ void MLUKernelNMS(
__mlu_global__ void MLUUnion1KernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_stride,
const int max_output_size, const float iou_threshold,
@ -635,15 +637,525 @@ __mlu_global__ void MLUKernelNMS(
}
}
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection_ux(
int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram,
IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
const int input_layout, const int input_num_boxes, const int input_stride,
const int max_output_size, const float thresh_iou, const float thresh_score,
const float offset, const int output_mode, const int algo) {
loop_end_flag[0] = 0;
IN_DT *sram = (IN_DT *)sram_buffer;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
IN_DT *input_score_ptr;
const IN_DT *input_x1_ptr;
const IN_DT *input_y1_ptr;
const IN_DT *input_x2_ptr;
const IN_DT *input_y2_ptr;
input_score_ptr = score_data;
input_x1_ptr = boxes_data;
input_y1_ptr = input_x1_ptr + input_stride;
input_x2_ptr = input_y1_ptr + input_stride;
input_y2_ptr = input_x2_ptr + input_stride;
int limit = 0;        // max boxes per NRAM pass, whether the input is in GDRAM or SRAM
int max_seg_pad = 0;  // the max segment length per repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int nram_save_count = 0;
if (output_mode == 0) {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
// data split
int avg_cluster = input_num_boxes / clusterDim;
int rem_cluster = input_num_boxes % clusterDim;
int len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
int cluster_offset = avg_cluster * clusterId +
(clusterId <= rem_cluster ? clusterId : rem_cluster);
int avg_core = len_cluster / coreDim;
int rem_core = len_cluster % coreDim;
int len_core = avg_core + (coreId < rem_core ? 1 : 0);
int core_offset =
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
int input_offset = cluster_offset + core_offset;
max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
// core 0 of each cluster calculates the max score index
int max_index_avg_core = input_num_boxes / clusterDim;
int max_index_rem_core = input_num_boxes % clusterDim;
int max_index_len_core =
max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0);
int max_index_input_offset =
max_index_avg_core * clusterId +
(clusterId <= max_index_rem_core ? clusterId : max_index_rem_core);
repeat = max_index_len_core / max_seg_pad;
remain = max_index_len_core % max_seg_pad;
remain_pad = PAD_UP(remain, NMS_SIZE);
// if the datatype is fp16, convert to fp32 when computing IoU
int max_seg_iou_compute =
PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
int repeat_iou_compute = len_core / max_seg_iou_compute;
int remain_iou_compute = len_core % max_seg_iou_compute;
int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
// init the nram ptr
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *y1 = x1 + max_seg_pad;
IN_DT *x2 = y1 + max_seg_pad;
IN_DT *y2 = x2 + max_seg_pad;
IN_DT *inter_x1 = y2 + max_seg_pad;
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
mluMemcpyDirection_t input_load_dir = SRAM2NRAM;
mluMemcpyDirection_t input_store_dir = NRAM2SRAM;
input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
input_store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
__sync_all();
/******FIND MAX START******/
int max_index = 0;
int global_max_index = 0; // for Ux
float max_area = 0; // area of the max-score box
max_box[0] = 0; // init 0
if (coreId == 0) {
for (int i = 0; i <= repeat; i++) {
if (i == repeat && remain == 0) {
break;
}
int seg_len = (i == repeat)
? remain_pad
: max_seg_pad; // the segment length for this compute pass
// check whether seg_len exceeds the fp16 limit; 65536 is the largest
// number that fp16 can express.
if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
return;
}
int cpy_len = (i == repeat)
? remain
: max_seg_pad; // the segment length for this memcpy
/******NMS LOAD START******/
__bang_write_zero(score, seg_len);
__memcpy(score,
input_score_ptr + max_index_input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
/******NMS LOAD END******/
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
if (sizeof(IN_DT) == sizeof(half)) {
max_index =
((uint16_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index =
((uint32_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
// the max box's x1, y1, x2, y2 on every cluster
max_box[1] = input_x1_ptr[max_index];
max_box[2] = input_y1_ptr[max_index];
max_box[3] = input_x2_ptr[max_index];
max_box[4] = input_y2_ptr[max_index];
((uint32_t *)(max_box + 5))[0] = max_index;
// copy max box info to sram
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_all();
// copy all partial max to the sram of cluster 0
if (clusterId != 0) {
__memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
SRAM2SRAM, 0);
}
__sync_all();
// reduce between clusters to get the global max box
if (clusterId == 0) {
if (coreId == 0) {
__bang_write_zero(inter_x1, NMS_SIZE);
__memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
__memcpy(max_box, sram + max_cluster * REDUCE_NUM,
REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_cluster();
if (coreId == 0x80 && clusterDim > 1) {
// broadcast global max box to each cluster's sram
for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
__memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
cluster_idx);
}
}
__sync_cluster();
}
__sync_all();
// copy the global max box to max_box
__memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
if (algo == 0 || offset == 0.0) {
max_area = ((float)max_box[3] - (float)max_box[1]) *
((float)max_box[4] - (float)max_box[2]);
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
global_max_index = ((uint32_t *)(max_box + 5))[0];
if (coreId != 0x80) {
input_score_ptr[global_max_index] = 0;
}
// by now, we get: max_score|max_index|max_box|max_area
/******FIND MAX END******/
/******NMS STORE START******/
// store to nram
if (float(max_box[0]) > thresh_score) {
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
4);
}
}
nram_save_count++;
output_box_num++;
}
// store to sram/gdram
if (output_box_num != 0) {
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
if (nram_save_count != 0) {
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * sizeof(uint32_t), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
__memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
NRAM2GDRAM, max_output_size * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_dram += nram_save_count;
}
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if dst
if (float(max_box[0]) <= thresh_score) {
if (clusterId == 0 && coreId == 0) {
loop_end_flag[0] = 1; // dram
}
}
__sync_all();
if (loop_end_flag[0] == 1) {
break;
}
/******NMS STORE END******/
// To preserve fp16 accuracy, convert fp16 to fp32 when calculating IoU.
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
: max_seg_iou_compute;
int cpy_len =
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
/******NMS LOAD START******/
__nramset((float *)score, seg_len, 0.0f);
int dt_offset = 0;
if (sizeof(IN_DT) == sizeof(float)) {
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__nramset(x1, seg_len, half(0));
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3);
/******NMS LOAD END******/
/******NMS COMPUTE START******/
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
seg_len);
}
// 1. compute IoU
// get the area_I
__nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
__bang_active_relu((float *)inter_x1, (float *)inter_x1,
seg_len); // inter_w
__nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
__bang_active_relu((float *)inter_y1, (float *)inter_y1,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2. select the box
// if IoU is greater than thresh, set the score to zero and discard the box:
// area_U > area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******NMS COMPUTE END******/
if (sizeof(IN_DT) == 2) {
__bang_float2half_rd((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), input_store_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
pvUnlock();
} // for repeat
} // for max_output_size
}
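For every surviving box the segment loop above computes the intersection area area_I with the current max box, the union area_U = area + max_area - area_I, and then zeroes the score of any box whose IoU exceeds the threshold; a division per element is avoided by comparing area_U against area_I * (1 / thresh_iou). A scalar reference of that test in plain C++ (Box and suppress are illustrative names; assumes thresh_iou > 0):

#include <algorithm>

struct Box { float x1, y1, x2, y2; };  // illustrative only

// True if `b` should be suppressed by `max_box` at IoU threshold `thresh_iou`.
// `offset` / `algo` follow the kernel: algo == 1 with a non-zero offset widens
// every width and height by `offset`.
bool suppress(const Box &b, const Box &max_box, float thresh_iou, float offset,
              int algo) {
  const float off = (algo == 1) ? offset : 0.f;
  const float iw = std::max(
      0.f, std::min(b.x2, max_box.x2) - std::max(b.x1, max_box.x1) + off);
  const float ih = std::max(
      0.f, std::min(b.y2, max_box.y2) - std::max(b.y1, max_box.y1) + off);
  const float area_i = iw * ih;
  const float area_b = (b.x2 - b.x1 + off) * (b.y2 - b.y1 + off);
  const float area_m =
      (max_box.x2 - max_box.x1 + off) * (max_box.y2 - max_box.y1 + off);
  const float area_u = area_b + area_m - area_i;
  // kernel form: keep iff area_U >= area_I * (1 / thresh_iou), i.e. IoU <= thresh_iou
  return area_u < area_i * (1.f / thresh_iou);
}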
__mlu_global__ void MLUUnionXKernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_layout, const int input_stride,
const int max_output_size, const float iou_threshold,
const float confidence_threshold, const float offset,
const cnrtDataType_t data_type_input, const int output_mode, const int algo,
void *workspace, void *result_num, void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
int32_t *loop_end_flag =
(int32_t *)((char *)workspace +
INFO_NUM * input_num_boxes * input_dwidth);
int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
int available_sram_size = SIZE_SRAM_BUF - reduce_sram_size;
int cluster_score_size = input_num_boxes * input_dwidth;
int cluster_boxes_size = input_num_boxes * 4 * input_dwidth;
char *sram_score = (char *)sram_buffer + reduce_sram_size;
char *sram_boxes =
(char *)sram_buffer + reduce_sram_size + cluster_score_size;
Addr input_ram = GDRAM;
if ((cluster_score_size + cluster_boxes_size) < available_sram_size) {
input_ram = SRAM;
__memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM);
__memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM);
} else {
__memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
}
__sync_cluster();
uint32_t output_box_num = 0;
if (output_mode == 0) {
uint32_t *output_dram = (uint32_t *)output;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *score_data;
half *boxes_data;
score_data =
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
boxes_data =
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
}
} else {
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *output_dram = (half *)output;
half *score_data;
half *boxes_data;
score_data =
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
boxes_data =
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *output_dram = (float *)output;
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
}
}
}
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes,
const int input_stride, const int max_output_boxes,
const float iou_threshold, const float offset,
void *workspace_ptr, void *output_size_ptr, void *output_ptr) {
MLUKernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, input_stride, max_output_boxes,
iou_threshold, /*confidence_threshold=*/0.0, /*output_mode=*/0,
/*input_layout=*/0, workspace_ptr, output_size_ptr, output_ptr,
data_type_input, offset, /*algo=*/1);
switch (k_type) {
default: { return; }
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1: {
MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, input_stride,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
/*output_mode=*/0,
/*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr,
data_type_input, offset, /*algo=*/1);
}; break;
case CNRT_FUNC_TYPE_UNION2:
case CNRT_FUNC_TYPE_UNION4:
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: {
MLUUnionXKernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1,
input_stride, max_output_boxes, iou_threshold,
/*confidence_threshold=*/0.0, offset, data_type_input,
/*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr,
output_ptr);
}; break;
}
}

File diff suppressed because it is too large.


@ -96,8 +96,10 @@ void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
#endif
#ifdef MMCV_WITH_MLU
} else if (input.device().type() == at::kMLU) {
CHECK_MLU(input);
CHECK_MLU(target);
CHECK_MLU_INPUT(input);
CHECK_MLU_INPUT(target);
CHECK_MLU_INPUT(weight);
CHECK_MLU_INPUT(output);
sigmoid_focal_loss_forward_mlu(input, target, weight, output, gamma, alpha);
#endif
} else {
@ -121,10 +123,10 @@ void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
#endif
#ifdef MMCV_WITH_MLU
} else if (input.device().type() == at::kMLU) {
CHECK_MLU(input);
CHECK_MLU(target);
CHECK_MLU(weight);
CHECK_MLU(grad_input);
CHECK_MLU_INPUT(input);
CHECK_MLU_INPUT(target);
CHECK_MLU_INPUT(weight);
CHECK_MLU_INPUT(grad_input);
sigmoid_focal_loss_backward_mlu(input, target, weight, grad_input, gamma,
alpha);


@ -37,24 +37,38 @@ static void policyFuncForward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
auto N = input.size(0);
auto C = input.size(1);
auto nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
auto c_align_size = PAD_UP((C * input.itemsize()), NFU_ALIGN_SIZE);
const size_t nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
const size_t c_align_size = PAD_UP((C * input.itemsize()), NFU_ALIGN_SIZE);
const int split_target_num = 2;
const int split_pipeline_num = 6;
auto scalar_size = NFU_ALIGN_SIZE;
auto weight_size = c_align_size;
const int has_weight = weight.data_ptr() != nullptr;
const int target_data_width = target.scalar_type() == at::kLong
? target.itemsize() / 2
: target.itemsize();
const int threshold_c =
PAD_DOWN((nram_size - split_target_num * sizeof(int)) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
input.itemsize();
// n_seg * c_align_size * split_pipeline_num +
// n_seg * target.itemsize() * split_target_num +
// weight_size + scalar_size <= nram_size
auto n_seg = (nram_size - weight_size - scalar_size) /
(c_align_size * split_pipeline_num +
target_data_width * split_target_num);
auto seg_num = (N + n_seg - 1) / n_seg;
int n_seg = 1;
if (C <= threshold_c) {
int c_size = C * input.itemsize();
int reservered_align_size =
(split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE;
int weight_size = 0;
if (has_weight) {
c_size = c_align_size;
reservered_align_size = split_target_num * NFU_ALIGN_SIZE;
weight_size = c_align_size;
}
// n_seg * c_size * split_pipeline_num + n_seg * target.itemsize() *
// split_target_num
// + weight_size + reservered_align_size <= nram_size
n_seg = (nram_size - weight_size - reservered_align_size) /
(split_pipeline_num * c_size + split_target_num * sizeof(int32_t));
}
auto seg_num = n_seg == 0 ? N : (N + n_seg - 1) / n_seg;
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
auto core_num = core_dim * cluster_num;
@ -103,31 +117,8 @@ void SigmoidFocalLossForwardMLUKernelLauncher(Tensor input, Tensor target,
CNLOG(INFO) << "weight is an empty tensor.";
}
// check C
auto nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
auto input_N = input.size(0);
auto input_C = input.size(1);
const int split_target_num = 2;
const int split_pipeline_num = 6;
const int has_weight = (int)(weight.data_ptr() != nullptr);
// target supports only INT on MLU device while it keeps LONG on host side,
// so target.itemsize() / 2
const int target_data_width = target.scalar_type() == at::kLong
? target.itemsize() / 2
: target.itemsize();
auto threshold_C = PAD_DOWN((nram_size - NFU_ALIGN_SIZE -
split_target_num * target_data_width) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
input.itemsize();
TORCH_CHECK(threshold_C >= input_C,
"input.size(1) should be in the range of [0, ", threshold_C,
"]. ", "But now input.size(1) is ", input_C, ".");
// return if zero-element
if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {
// return if zero-element
return;
}
@ -158,8 +149,8 @@ void SigmoidFocalLossForwardMLUKernelLauncher(Tensor input, Tensor target,
<< k_dim.z << ">>>";
// launch kernel
KernelFocalLossSigmoidForward(k_dim, k_type, queue, d_type, input_ptr,
target_ptr, weight_ptr, input_N, input_C, alpha,
gamma, output_ptr);
target_ptr, weight_ptr, input.size(0),
input.size(1), alpha, gamma, output_ptr);
}
void getDealNAndThresholdC(const int compute_data_bytes,


@ -19,6 +19,16 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const float iou_threshold, const float offset,
void *workspace_ptr, void *output_size_ptr, void *output_ptr);
int selectUnionType(uint32_t use_job, int box_num_per_core) {
// box_num_per_core should be at least 256; otherwise the effective IO
// bandwidth would be very low
while (box_num_per_core < 256 && use_job >= 4) {
box_num_per_core *= 2;
use_job /= 2;
}
return use_job;
}
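The launcher uses selectUnionType to shrink the launch until every core sees at least 256 boxes, so small inputs fall back from a multi-cluster UnionX launch to Union1 or even Block instead of starving each core of IO. A quick worked example in plain C++ under an assumed 4-cluster, 4-core-per-cluster device (the 16-core figure is an assumption, not a queried attribute):

#include <cstdint>
#include <cstdio>

// Same policy as selectUnionType above.
uint32_t select_union_type(uint32_t use_job, int box_num_per_core) {
  while (box_num_per_core < 256 && use_job >= 4) {
    box_num_per_core *= 2;
    use_job /= 2;
  }
  return use_job;
}

int main() {
  const int core_number = 16;  // assumed: 4 clusters * 4 cores
  const int tests[] = {500, 3000, 20000};
  for (int boxes : tests) {
    const int per_core = (boxes + core_number - 1) / core_number;
    // with the dispatch below: 500 boxes fall back to Block,
    // while 3000 and 20000 keep a multi-core launch (use_job 8 and 16)
    std::printf("%5d boxes -> use_job %u\n", boxes,
                select_union_type(core_number, per_core));
  }
}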
Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset) {
// dimension parameters check
@ -42,32 +52,57 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
}
int input_num_boxes = boxes.size(0);
int input_stride = boxes.size(1);
int input_stride = boxes.size(0);
int max_output_boxes = boxes.size(0);
cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
int core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t dim_x = core_dim;
cnrtDim3_t k_dim = {dim_x, 1, 1};
cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
cnrtDim3_t k_dim;
cnrtJobType_t k_type;
uint32_t union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t job_limit = union_number * core_dim;
uint32_t core_number = union_number * core_dim;
int box_num_per_core = (input_num_boxes + core_number - 1) / core_number;
// initialize k_type as Union1
k_dim.x = core_dim;
k_dim.y = 1;
k_dim.z = 1;
k_type = CNRT_FUNC_TYPE_UNION1;
int use_job = selectUnionType(job_limit, box_num_per_core);
if (use_job < 4) {
k_dim.x = 1;
k_type = CNRT_FUNC_TYPE_BLOCK;
} else if (use_job == 4) {
k_dim.x = core_dim;
k_type = CNRT_FUNC_TYPE_UNION1;
} else {
k_dim.x = use_job;
k_type = (cnrtFunctionType_t)use_job;
}
// transpose boxes (n, 4) to (4, n) for better performance
auto boxes_t = boxes.transpose(0, 1);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);
auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kLong));
auto output_size = at::empty({1}, scores.options().dtype(at::kInt));
// workspace
const int info_num = 5; // x1, x2, y1, y2 and score
size_t space_size = 0;
if (boxes.scalar_type() == at::kHalf) {
space_size = input_num_boxes * sizeof(int16_t);
space_size = input_num_boxes * sizeof(int16_t) * info_num + sizeof(float);
} else {
space_size = input_num_boxes * sizeof(float);
space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float);
}
auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
// get compute queue
auto queue = torch_mlu::getCurQueue();
auto boxes_impl = torch_mlu::getMluTensorImpl(boxes);
auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
auto boxes_ptr = boxes_impl->cnnlMalloc();
auto scores_impl = torch_mlu::getMluTensorImpl(scores);
auto scores_impl = torch_mlu::getMluTensorImpl(scores_);
auto scores_ptr = scores_impl->cnnlMalloc();
auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
auto workspace_ptr = workspace_impl->cnnlMalloc();
@ -76,20 +111,11 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
auto output_size_ptr = output_size_impl->cnnlMalloc();
switch (k_type) {
default: {
TORCH_CHECK(false, "[nms_mlu]:Failed to choose kernel to launch");
}
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1: {
CNLOG(INFO) << "Launch Kernel MLUUnion1 or Block NMS<<<Union"
<< k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y
<< ", " << k_dim.z << ">>>";
KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
input_num_boxes, input_stride, max_output_boxes, iou_threshold,
offset, workspace_ptr, output_size_ptr, output_ptr);
}; break;
}
CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<<Union" << k_type / core_dim
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
input_num_boxes, input_stride, max_output_boxes, iou_threshold,
offset, workspace_ptr, output_size_ptr, output_ptr);
int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
return output.slice(0, 0, output_num);
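Two launcher-side details worth noting: boxes are transposed to (4, n) so the kernel can read each coordinate column contiguously with stride input_num_boxes, and the workspace now reserves five columns of input_num_boxes elements (x1, y1, x2, y2, score) plus a trailing 32-bit loop-end flag that the UnionX kernel places right after them. A small sketch of that size in plain C++ (nms_workspace_bytes is an illustrative helper, not an existing API):

#include <cstddef>
#include <cstdint>

// Workspace bytes reserved by the launcher: info_num (5) columns of
// input_num_boxes elements plus one 32-bit loop-end flag after them.
size_t nms_workspace_bytes(int input_num_boxes, bool is_half) {
  const size_t elem = is_half ? sizeof(int16_t) : sizeof(float);
  const int info_num = 5;  // x1, x2, y1, y2 and score
  return static_cast<size_t>(input_num_boxes) * elem * info_num + sizeof(float);
}
// e.g. nms_workspace_bytes(1000, /*is_half=*/false) == 20004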


@ -122,11 +122,11 @@ void roi_align_forward(Tensor input, Tensor rois, Tensor output,
#endif
#ifdef MMCV_WITH_MLU
} else if (input.device().type() == at::kMLU) {
CHECK_MLU(input);
CHECK_MLU(rois);
CHECK_MLU(output);
CHECK_MLU(argmax_y);
CHECK_MLU(argmax_x);
CHECK_MLU_INPUT(input);
CHECK_MLU_INPUT(rois);
CHECK_MLU_INPUT(output);
CHECK_MLU_INPUT(argmax_y);
CHECK_MLU_INPUT(argmax_x);
roi_align_forward_mlu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
@ -164,11 +164,11 @@ void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
#endif
#ifdef MMCV_WITH_MLU
} else if (grad_output.device().type() == at::kMLU) {
CHECK_MLU(grad_output);
CHECK_MLU(rois);
CHECK_MLU(argmax_y);
CHECK_MLU(argmax_x);
CHECK_MLU(grad_input);
CHECK_MLU_INPUT(grad_output);
CHECK_MLU_INPUT(rois);
CHECK_MLU_INPUT(argmax_y);
CHECK_MLU_INPUT(argmax_x);
CHECK_MLU_INPUT(grad_input);
roi_align_backward_mlu(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,