mirror of
https://github.com/open-mmlab/mmcv.git
synced 2025-06-03 21:54:52 +08:00
[Feature] Support Iou3d with cambricon MLU backend (#2339)
* [Feature] Support Iou3d with cambricon MLU backend * [Feature] add double line * [Feature] add line
This commit is contained in:
parent
6d9e4a9b78
commit
7fd7058a9c
@ -26,7 +26,7 @@ We implement common ops used in detection, segmentation, etc.
|
||||
| FusedBiasLeakyrelu | | √ | | |
|
||||
| GatherPoints | | √ | | |
|
||||
| GroupPoints | | √ | | |
|
||||
| Iou3d | | √ | | |
|
||||
| Iou3d | | √ | √ | |
|
||||
| KNN | | √ | | |
|
||||
| MaskedConv | | √ | √ | |
|
||||
| MergeCells | | √ | | |
|
||||
|
@ -26,7 +26,7 @@ MMCV 提供了检测、分割等任务中常用的算子
|
||||
| FusedBiasLeakyrelu | | √ | | |
|
||||
| GatherPoints | | √ | | |
|
||||
| GroupPoints | | √ | | |
|
||||
| Iou3d | | √ | | |
|
||||
| Iou3d | | √ | √ | |
|
||||
| KNN | | √ | | |
|
||||
| MaskedConv | | √ | √ | |
|
||||
| MergeCells | | √ | | |
|
||||
|
431
mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu
Normal file
431
mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu
Normal file
@ -0,0 +1,431 @@
|
||||
/*************************************************************************
|
||||
* Copyright (C) 2022 Cambricon.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*************************************************************************/
|
||||
|
||||
#include "common_mlu_helper.hpp"
|
||||
#include "iou3d_utils.hpp"
|
||||
|
||||
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
|
||||
|
||||
/* NRAM buffer
|
||||
* Suppose deal N boxes once time.
|
||||
----------------------------------------------------------------
|
||||
| Basic |score (1N)+ |intersect_pts(48N)| |
|
||||
| |valid_box(1N) |+ ordered_pts(48N)| temp_long(72N) |
|
||||
| |+ temp_buffer(10N)| | |
|
||||
|--------------------------|------------------|----------------|
|
||||
| Reuse | null | null |rotated_pts(16N)|
|
||||
|-------|------------------|------------------|----------------|
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
| Basic | dist_ram(24N) | valid_pts(24N) |box1(5N) |box1_buffer(5KB) |
|
||||
| | |+ nums_in_ram(1N)|+ box2(5N)|+nram_save(5KB) |
|
||||
|--------------------------|-----------------|----------|-----------------|
|
||||
| Reuse | vec_buffer(5N) | null | null | null |
|
||||
|-------|------------------|-----------------|----------|-----------------|
|
||||
Total Basic Memory Size = 239N * sizeof(float) + 10KB
|
||||
*/
|
||||
|
||||
__nram__ char nram_buffer[MAX_NRAM_SIZE];
|
||||
__mlu_shared__ char sram_buffer[SIZE_SRAM_BUF];
|
||||
|
||||
template <typename T>
|
||||
__mlu_func__ void iou3D_detection(int32_t &result_box_num, int32_t *output_data,
|
||||
const T *boxes_data, float *scores_data,
|
||||
const int core_limit, const int input_box_num,
|
||||
const float iou_threshold,
|
||||
mluMemcpyDirection_t scores_load_dir,
|
||||
mluMemcpyDirection_t scores_store_dir,
|
||||
mluMemcpyDirection_t boxes_load_dir) {
|
||||
// NRAM divide by (2+4*COMPUTE_COUNT_ALIGN) copies of NRAM, counted by bytes
|
||||
const int nram_save_limit_count = 256;
|
||||
int box_read_limit_count = 256;
|
||||
float div_thresh_iou = 1.0 / iou_threshold;
|
||||
// every box require 239 * sizeof(float) space in nram;
|
||||
const int32_t copies_of_nram = 239 * sizeof(float);
|
||||
const int32_t limit = (MAX_NRAM_SIZE - 5 * box_read_limit_count * sizeof(T) -
|
||||
nram_save_limit_count * sizeof(int32_t)) /
|
||||
copies_of_nram;
|
||||
|
||||
// x,y,z,dx,dy,dz,angle
|
||||
const T *input_x_ptr = boxes_data;
|
||||
const T *input_y_ptr = input_x_ptr + input_box_num;
|
||||
const T *input_dx_ptr = input_y_ptr + 2 * input_box_num;
|
||||
const T *input_dy_ptr = input_dx_ptr + input_box_num;
|
||||
const T *input_angle_ptr = input_dy_ptr + 2 * input_box_num;
|
||||
float *input_score_ptr = scores_data;
|
||||
|
||||
// data split
|
||||
int avg_cluster = 0;
|
||||
int rem_cluster = 0;
|
||||
int len_cluster = 0;
|
||||
int cluster_offset = 0;
|
||||
if (clusterDim > 0) {
|
||||
// union
|
||||
avg_cluster = input_box_num / clusterDim;
|
||||
rem_cluster = input_box_num % clusterDim;
|
||||
len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
|
||||
cluster_offset = avg_cluster * clusterId +
|
||||
(clusterId <= rem_cluster ? clusterId : rem_cluster);
|
||||
} else {
|
||||
// block
|
||||
len_cluster = input_box_num;
|
||||
cluster_offset = 0;
|
||||
}
|
||||
int len_core = input_box_num;
|
||||
int input_offset = 0;
|
||||
if (core_limit > 1) {
|
||||
int avg_core = len_cluster / coreDim;
|
||||
int rem_core = len_cluster % coreDim;
|
||||
len_core = avg_core + (coreId < rem_core ? 1 : 0);
|
||||
int core_offset =
|
||||
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
|
||||
input_offset = cluster_offset + core_offset;
|
||||
}
|
||||
|
||||
int32_t max_seg_pad = IOU3D_DOWN(limit, IOU3D_SIZE);
|
||||
int repeat_iou_compute = len_core / max_seg_pad;
|
||||
int remain_iou_compute = len_core % max_seg_pad;
|
||||
|
||||
// basic consistent memory layout
|
||||
void *score = ((char *)nram_buffer);
|
||||
void *valid_box = ((char *)score) + 1 * max_seg_pad * sizeof(float);
|
||||
void *temp_buffer = ((char *)valid_box) + 1 * max_seg_pad * sizeof(float);
|
||||
void *intersect_pts_x =
|
||||
((char *)temp_buffer) + 10 * max_seg_pad * sizeof(float);
|
||||
void *intersect_pts_y =
|
||||
((char *)intersect_pts_x) + 24 * max_seg_pad * sizeof(float);
|
||||
void *ordered_pts_x =
|
||||
((char *)intersect_pts_y) + 24 * max_seg_pad * sizeof(float);
|
||||
void *ordered_pts_y =
|
||||
((char *)ordered_pts_x) + 24 * max_seg_pad * sizeof(float);
|
||||
void *temp_long_1 =
|
||||
((char *)ordered_pts_y) + 24 * max_seg_pad * sizeof(float);
|
||||
void *temp_long_2 = ((char *)temp_long_1) + 24 * max_seg_pad * sizeof(float);
|
||||
void *temp_long_3 = ((char *)temp_long_2) + 24 * max_seg_pad * sizeof(float);
|
||||
void *dist_ram = ((char *)temp_long_3) + 24 * max_seg_pad * sizeof(float);
|
||||
void *valid_pts = ((char *)dist_ram) + 24 * max_seg_pad * sizeof(float);
|
||||
void *nums_in_ram = ((char *)valid_pts) + 24 * max_seg_pad * sizeof(float);
|
||||
T *box1 = (T *)(((char *)nums_in_ram) + 1 * max_seg_pad * sizeof(float));
|
||||
T *box2 = (T *)(((char *)box1) + 5 * max_seg_pad * sizeof(float));
|
||||
void *box1_buffer = ((char *)box2) + 5 * max_seg_pad * sizeof(float);
|
||||
int32_t *nram_save =
|
||||
(int32_t *)(((char *)box1_buffer) + 5 * box_read_limit_count * sizeof(T));
|
||||
// nram_save ~ nram_save_limit_count * sizeof(int32_t)
|
||||
int nram_save_count = 0;
|
||||
|
||||
// reuse memory
|
||||
void *rotated_pts1_x = ((char *)dist_ram);
|
||||
void *rotated_pts1_y =
|
||||
((char *)rotated_pts1_x) + 4 * max_seg_pad * sizeof(float);
|
||||
void *rotated_pts2_x =
|
||||
((char *)rotated_pts1_y) + 4 * max_seg_pad * sizeof(float);
|
||||
void *rotated_pts2_y =
|
||||
((char *)rotated_pts2_x) + 4 * max_seg_pad * sizeof(float);
|
||||
void *vec_buffer = ((char *)temp_long_1) + 5 * max_seg_pad * sizeof(float);
|
||||
// vec_buffer ~ 16 * max_seg_pad * sizeof(float)
|
||||
|
||||
// First, initialize ram with all 0, or could cause nan/inf unexcepted results
|
||||
__bang_write_zero((unsigned char *)nram_buffer, copies_of_nram * max_seg_pad);
|
||||
// number 8 and 0xff relay on box_read_limit_count initial as 256
|
||||
const int max_box_seg_id = (input_box_num - 1) >> 8;
|
||||
const int last_rem_box_number = ((input_box_num - 1) & 0xff) + 1;
|
||||
for (int32_t cur_box = 0; cur_box < input_box_num; ++cur_box) {
|
||||
__sync_all();
|
||||
int box_seg_id = cur_box >> 8, box_id = cur_box & 0xff;
|
||||
box_read_limit_count = box_seg_id == max_box_seg_id ? last_rem_box_number
|
||||
: box_read_limit_count;
|
||||
if (box_id == 0) {
|
||||
// x,y,z,dx,dy,dz,angle
|
||||
int offset_num = box_seg_id << 8;
|
||||
// x
|
||||
__memcpy((char *)box1_buffer, input_x_ptr + offset_num,
|
||||
box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
|
||||
box_read_limit_count * 1 * sizeof(T),
|
||||
box_read_limit_count * 1 * sizeof(T), 0);
|
||||
// y
|
||||
__memcpy((char *)box1_buffer + box_read_limit_count * 1 * sizeof(T),
|
||||
input_y_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
|
||||
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
|
||||
box_read_limit_count * 1 * sizeof(T), 0);
|
||||
// dx
|
||||
__memcpy((char *)box1_buffer + box_read_limit_count * 2 * sizeof(T),
|
||||
input_dx_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
|
||||
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
|
||||
box_read_limit_count * 1 * sizeof(T), 0);
|
||||
// dy
|
||||
__memcpy((char *)box1_buffer + box_read_limit_count * 3 * sizeof(T),
|
||||
input_dy_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
|
||||
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
|
||||
box_read_limit_count * 1 * sizeof(T), 0);
|
||||
// angle
|
||||
__memcpy((char *)box1_buffer + box_read_limit_count * 4 * sizeof(T),
|
||||
input_angle_ptr + offset_num,
|
||||
box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
|
||||
box_read_limit_count * 1 * sizeof(T),
|
||||
box_read_limit_count * 1 * sizeof(T), 0);
|
||||
}
|
||||
if (((float *)input_score_ptr)[cur_box] == 0) {
|
||||
continue;
|
||||
}
|
||||
// save result
|
||||
nram_save[nram_save_count] = cur_box;
|
||||
result_box_num++;
|
||||
nram_save_count++;
|
||||
if (clusterId == 0 && coreId == 0 &&
|
||||
nram_save_count == nram_save_limit_count) {
|
||||
pvLock();
|
||||
__memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
|
||||
NRAM2GDRAM);
|
||||
pvUnlock();
|
||||
output_data += nram_save_count;
|
||||
nram_save_count = 0;
|
||||
}
|
||||
// prepare box1
|
||||
// x
|
||||
__bang_write_value((float *)box1, max_seg_pad,
|
||||
float(((T *)box1_buffer)[box_id]));
|
||||
// y
|
||||
__bang_write_value(
|
||||
(float *)box1 + max_seg_pad, max_seg_pad,
|
||||
float(((T *)box1_buffer)[box_id + 1 * box_read_limit_count]));
|
||||
// dx
|
||||
__bang_write_value(
|
||||
(float *)box1 + max_seg_pad * 2, max_seg_pad,
|
||||
float(((T *)box1_buffer)[box_id + 2 * box_read_limit_count]));
|
||||
// dy
|
||||
__bang_write_value(
|
||||
(float *)box1 + max_seg_pad * 3, max_seg_pad,
|
||||
float(((T *)box1_buffer)[box_id + 3 * box_read_limit_count]));
|
||||
// angle
|
||||
__bang_write_value(
|
||||
(float *)box1 + max_seg_pad * 4, max_seg_pad,
|
||||
float(((T *)box1_buffer)[box_id + 4 * box_read_limit_count]));
|
||||
|
||||
float max_area = 1.0f *
|
||||
((T *)box1_buffer)[box_id + 2 * box_read_limit_count] *
|
||||
((T *)box1_buffer)[box_id + 3 * box_read_limit_count];
|
||||
// update score
|
||||
|
||||
for (int i = 0; i <= repeat_iou_compute; i++) {
|
||||
if (i == repeat_iou_compute && remain_iou_compute == 0) {
|
||||
break;
|
||||
}
|
||||
int seg_len = max_seg_pad;
|
||||
int cpy_len =
|
||||
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_pad;
|
||||
// int half_offset = std::is_same<T, half>::value ? max_seg_pad * 5 : 0;
|
||||
int half_offset = (sizeof(T) == sizeof(half)) ? max_seg_pad * 5 : 0;
|
||||
// score
|
||||
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
|
||||
cpy_len * sizeof(float), scores_load_dir,
|
||||
cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
|
||||
// x
|
||||
__memcpy(box2 + half_offset, input_x_ptr + input_offset + i * max_seg_pad,
|
||||
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
|
||||
cpy_len * 1 * sizeof(T), 0);
|
||||
// y
|
||||
__memcpy(box2 + half_offset + seg_len * 1,
|
||||
input_y_ptr + input_offset + i * max_seg_pad,
|
||||
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
|
||||
cpy_len * 1 * sizeof(T), 0);
|
||||
// dx
|
||||
__memcpy(box2 + half_offset + seg_len * 2,
|
||||
input_dx_ptr + input_offset + i * max_seg_pad,
|
||||
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
|
||||
cpy_len * 1 * sizeof(T), 0);
|
||||
// dy
|
||||
__memcpy(box2 + half_offset + seg_len * 3,
|
||||
input_dy_ptr + input_offset + i * max_seg_pad,
|
||||
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
|
||||
cpy_len * 1 * sizeof(T), 0);
|
||||
// angle
|
||||
__memcpy(box2 + half_offset + seg_len * 4,
|
||||
input_angle_ptr + input_offset + i * max_seg_pad,
|
||||
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
|
||||
cpy_len * 1 * sizeof(T), 0);
|
||||
// if (std::is_same<T, half>::value) {
|
||||
if (sizeof(T) == sizeof(half)) {
|
||||
__bang_half2float((float *)box2, (half *)(box2 + half_offset),
|
||||
seg_len * 5);
|
||||
}
|
||||
|
||||
// Calculate rotated vertices
|
||||
void *temp1_ram = ((char *)temp_buffer);
|
||||
void *temp2_ram = ((char *)temp_buffer) + seg_len * sizeof(float);
|
||||
void *temp3_ram = ((char *)temp_buffer) + 2 * seg_len * sizeof(float);
|
||||
void *temp4_ram = ((char *)temp_buffer) + 3 * seg_len * sizeof(float);
|
||||
getRotatedVertices((float *)rotated_pts1_x, (float *)rotated_pts1_y,
|
||||
(float *)box1, (float *)temp1_ram, (float *)temp2_ram,
|
||||
(float *)temp3_ram, (float *)temp4_ram, seg_len);
|
||||
getRotatedVertices((float *)rotated_pts2_x, (float *)rotated_pts2_y,
|
||||
(float *)box2, (float *)temp1_ram, (float *)temp2_ram,
|
||||
(float *)temp3_ram, (float *)temp4_ram, seg_len);
|
||||
|
||||
__bang_write_zero((float *)valid_pts, 24 * seg_len);
|
||||
__bang_write_zero((float *)nums_in_ram, seg_len);
|
||||
__bang_write_value(((float *)valid_box), seg_len, 1.0f);
|
||||
void *vec1_x = ((char *)vec_buffer);
|
||||
void *vec1_y = ((char *)vec1_x) + 4 * seg_len * sizeof(float);
|
||||
void *vec2_x = ((char *)vec1_y) + 4 * seg_len * sizeof(float);
|
||||
void *vec2_y = ((char *)vec2_x) + 4 * seg_len * sizeof(float);
|
||||
void *temp5_ram = ((char *)temp_buffer) + 4 * seg_len * sizeof(float);
|
||||
void *temp6_ram = ((char *)temp_buffer) + 5 * seg_len * sizeof(float);
|
||||
void *temp7_ram = ((char *)temp_buffer) + 6 * seg_len * sizeof(float);
|
||||
void *temp8_ram = ((char *)temp_buffer) + 7 * seg_len * sizeof(float);
|
||||
void *temp9_ram = ((char *)temp_buffer) + 8 * seg_len * sizeof(float);
|
||||
void *temp10_ram = ((char *)temp_buffer) + 9 * seg_len * sizeof(float);
|
||||
|
||||
// Get all intersection points
|
||||
getIntersectPts(
|
||||
(float *)rotated_pts1_x, (float *)rotated_pts1_y,
|
||||
(float *)rotated_pts2_x, (float *)rotated_pts2_y, (float *)vec1_x,
|
||||
(float *)vec1_y, (float *)vec2_x, (float *)vec2_y,
|
||||
(float *)intersect_pts_x, (float *)intersect_pts_y,
|
||||
(float *)valid_pts, (float *)nums_in_ram, (float *)temp1_ram,
|
||||
(float *)temp2_ram, (float *)temp3_ram, (float *)temp4_ram,
|
||||
(float *)temp5_ram, (float *)temp6_ram, (float *)temp7_ram,
|
||||
(float *)temp8_ram, (float *)temp9_ram, (float *)temp10_ram, seg_len);
|
||||
|
||||
// Where nums_in <= 2, set valid_box to false
|
||||
__bang_write_value((float *)temp9_ram, COMPUTE_COUNT_ALIGN, (float)2);
|
||||
__bang_cycle_gt((float *)temp1_ram, (float *)nums_in_ram,
|
||||
(float *)temp9_ram, seg_len, COMPUTE_COUNT_ALIGN);
|
||||
__bang_and((float *)valid_box, (float *)valid_box, (float *)temp1_ram,
|
||||
seg_len);
|
||||
__bang_cycle_and((float *)valid_pts, (float *)valid_pts,
|
||||
(float *)valid_box, 24 * seg_len, seg_len);
|
||||
|
||||
// Convex-hull-graham to order the intersection points in clockwise order
|
||||
// and find the contour area
|
||||
|
||||
convexHullGraham(
|
||||
(float *)intersect_pts_x, (float *)intersect_pts_y,
|
||||
(float *)ordered_pts_x, (float *)ordered_pts_y, (float *)dist_ram,
|
||||
(float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
|
||||
(float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
|
||||
(float *)temp_long_1, (float *)temp_long_2, (float *)temp_long_3,
|
||||
seg_len, seg_len);
|
||||
// Calculate polygon area
|
||||
// set temp1 = intersection part area
|
||||
polygonArea((float *)ordered_pts_x, (float *)ordered_pts_y,
|
||||
(float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
|
||||
(float *)temp1_ram, (float *)temp2_ram, (float *)temp3_ram,
|
||||
(float *)temp4_ram, (float *)temp5_ram, (float *)temp6_ram,
|
||||
(float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
|
||||
seg_len);
|
||||
// area
|
||||
__bang_mul((float *)temp2_ram, (float *)box2 + seg_len * 2,
|
||||
(float *)box2 + seg_len * 3, seg_len);
|
||||
// get the area_U: area + max_area - area_I
|
||||
__bang_add_scalar((float *)temp2_ram, (float *)temp2_ram, float(max_area),
|
||||
seg_len);
|
||||
__bang_sub((float *)temp2_ram, (float *)temp2_ram, (float *)temp1_ram,
|
||||
seg_len); // area_U
|
||||
if (iou_threshold > 0.0) {
|
||||
__bang_mul_scalar((float *)temp1_ram, (float *)temp1_ram,
|
||||
div_thresh_iou, seg_len);
|
||||
} else {
|
||||
__bang_mul_scalar((float *)temp2_ram, (float *)temp2_ram, iou_threshold,
|
||||
seg_len);
|
||||
}
|
||||
__bang_ge((float *)temp1_ram, (float *)temp2_ram, (float *)temp1_ram,
|
||||
seg_len);
|
||||
__bang_mul((float *)score, (float *)score, (float *)temp1_ram, seg_len);
|
||||
|
||||
pvLock();
|
||||
__memcpy(input_score_ptr + input_offset + i * max_seg_pad, score,
|
||||
cpy_len * sizeof(float), scores_store_dir,
|
||||
cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
|
||||
pvUnlock();
|
||||
}
|
||||
}
|
||||
if (clusterId == 0 && coreId == 0 && nram_save_count) {
|
||||
pvLock();
|
||||
__memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
|
||||
NRAM2GDRAM);
|
||||
pvUnlock();
|
||||
}
|
||||
}
|
||||
__mlu_global__ void MLUBlockorUnionIKernelOU3D(
|
||||
const void *input_boxes, const int input_box_num, const float iou_threshold,
|
||||
const cnrtDataType_t data_type_input, void *workspace, void *result_num,
|
||||
void *output) {
|
||||
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
|
||||
mluMemcpyDirection_t scores_load_dir = GDRAM2NRAM;
|
||||
mluMemcpyDirection_t scores_store_dir = NRAM2GDRAM;
|
||||
mluMemcpyDirection_t boxes_load_dir = GDRAM2NRAM;
|
||||
float *scores_data = (float *)workspace;
|
||||
float *boxes_data = (float *)input_boxes;
|
||||
const int cluster_score_size = input_box_num * sizeof(float);
|
||||
const int cluster_boxes_size = input_box_num * 7 * input_dwidth;
|
||||
char *sram_score = (char *)sram_buffer;
|
||||
char *sram_boxes = (char *)sram_buffer + cluster_score_size;
|
||||
if (clusterDim == 1 && SIZE_SRAM_BUF > cluster_score_size) {
|
||||
scores_data = (float *)sram_score;
|
||||
scores_load_dir = SRAM2NRAM;
|
||||
scores_store_dir = NRAM2SRAM;
|
||||
if (coreId == 0x80) {
|
||||
__sramset((void *)sram_buffer, input_box_num, 1.0f);
|
||||
}
|
||||
} else {
|
||||
if (coreId == 0) {
|
||||
__gdramset(scores_data, input_box_num, 1.0f);
|
||||
}
|
||||
}
|
||||
if (clusterDim == 1 &&
|
||||
SIZE_SRAM_BUF - cluster_score_size >= cluster_boxes_size) {
|
||||
boxes_load_dir = SRAM2NRAM;
|
||||
boxes_data = (float *)sram_boxes;
|
||||
if (coreId == 0x80) {
|
||||
__memcpy((char *)boxes_data, (char *)input_boxes, cluster_boxes_size,
|
||||
GDRAM2SRAM);
|
||||
}
|
||||
}
|
||||
__sync_cluster();
|
||||
|
||||
int32_t result_box_num = 0;
|
||||
int32_t *out_data = (int32_t *)output;
|
||||
|
||||
switch (data_type_input) {
|
||||
default: { return; }
|
||||
case CNRT_FLOAT16: {
|
||||
iou3D_detection(result_box_num, out_data, (half *)boxes_data, scores_data,
|
||||
taskDim, input_box_num, iou_threshold, scores_load_dir,
|
||||
scores_store_dir, boxes_load_dir);
|
||||
}; break;
|
||||
case CNRT_FLOAT32: {
|
||||
iou3D_detection(result_box_num, out_data, boxes_data, scores_data,
|
||||
taskDim, input_box_num, iou_threshold, scores_load_dir,
|
||||
scores_store_dir, boxes_load_dir);
|
||||
}; break;
|
||||
}
|
||||
((int32_t *)result_num)[0] = result_box_num;
|
||||
}
|
||||
|
||||
void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
|
||||
const cnrtDataType_t data_type_input, const void *boxes_dram,
|
||||
const int input_box_num, const float iou_threshold,
|
||||
void *workspace, void *output_size, void *output) {
|
||||
switch (k_type) {
|
||||
default: { return; }
|
||||
case CNRT_FUNC_TYPE_BLOCK:
|
||||
case CNRT_FUNC_TYPE_UNION1:
|
||||
case CNRT_FUNC_TYPE_UNION2:
|
||||
case CNRT_FUNC_TYPE_UNION4:
|
||||
case CNRT_FUNC_TYPE_UNION8:
|
||||
case CNRT_FUNC_TYPE_UNION16: {
|
||||
MLUBlockorUnionIKernelOU3D<<<k_dim, k_type, queue>>>(
|
||||
(void *)boxes_dram, input_box_num, iou_threshold, data_type_input,
|
||||
workspace, output_size, output);
|
||||
}; break;
|
||||
}
|
||||
}
|
695
mmcv/ops/csrc/common/mlu/iou3d_utils.hpp
Normal file
695
mmcv/ops/csrc/common/mlu/iou3d_utils.hpp
Normal file
@ -0,0 +1,695 @@
|
||||
/*************************************************************************
|
||||
* Copyright (C) 2022 Cambricon.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*************************************************************************/
|
||||
|
||||
#ifndef IOU3D_UTILS_HPP_
|
||||
#define IOU3D_UTILS_HPP_
|
||||
#include "common_mlu_helper.hpp"
|
||||
|
||||
#define IOU3D_SIZE 64
|
||||
#define IOU3D_UP(x, y) (x / y + (int)(x % y > 0)) * y
|
||||
#define IOU3D_DOWN(x, y) (x / y) * y
|
||||
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE)
|
||||
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
|
||||
#define COMPUTE_COUNT_ALIGN 64
|
||||
#define INFO_NUM (5) // score, x1, y1, x2, y2
|
||||
#define REDUCE_NUM \
|
||||
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
|
||||
#define SINGLE_BOX_DIM 5
|
||||
#define MEMORY_CORE (0x80)
|
||||
__mlu_func__ void pvLock() {
|
||||
#if __BANG_ARCH__ == 270
|
||||
if (coreId != MEMORY_CORE) {
|
||||
__bang_lock(0, 0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__mlu_func__ void pvUnlock() {
|
||||
#if __BANG_ARCH__ == 270
|
||||
if (coreId != MEMORY_CORE) {
|
||||
__bang_unlock(0, 0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// cross2d<T>(A, B) = A.x * B.y - A.y * B.x;
|
||||
template <typename T>
|
||||
inline __mlu_func__ void cross2d(T *result, const T *p1_x, const T *p1_y,
|
||||
const T *p2_x, const T *p2_y,
|
||||
const int &length, T *temp_ram) {
|
||||
__bang_mul((T *)temp_ram, (T *)p1_x, (T *)p2_y, length);
|
||||
__bang_mul((T *)result, (T *)p1_y, (T *)p2_x, length);
|
||||
__bang_sub((T *)result, (T *)temp_ram, (T *)result, length);
|
||||
}
|
||||
|
||||
// dot2d<T>(A, B) = A.x * B.x + A.y * B.y
|
||||
template <typename T>
|
||||
inline __mlu_func__ void dot2d(T *result, const T *p1_x, const T *p1_y,
|
||||
const T *p2_x, const T *p2_y, const int &length,
|
||||
T *temp_ram) {
|
||||
__bang_mul((T *)temp_ram, (T *)p1_x, (T *)p2_x, length);
|
||||
__bang_mul((T *)result, (T *)p1_y, (T *)p2_y, length);
|
||||
__bang_add((T *)result, (T *)temp_ram, (T *)result, length);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__mlu_func__ void getRotatedVertices(T *pts_x, T *pts_y, T *box, T *temp1,
|
||||
T *temp2, T *temp3, T *temp4,
|
||||
const uint32_t &actual_compute_box_num) {
|
||||
// T cosTheta2 = (T)cos(theta) * 0.5f; -- temp1
|
||||
// T sinTheta2 = (T)sin(theta) * 0.5f; -- temp2
|
||||
// theta is the box's 5th data: a, rotated radian;
|
||||
#if __BANG_ARCH__ >= 300
|
||||
__bang_cos((float *)temp1, ((float *)box) + 4 * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
__bang_sin((float *)temp2, ((float *)box) + 4 * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
#else
|
||||
__bang_taylor4_cos((T *)temp1, ((T *)box) + 4 * actual_compute_box_num,
|
||||
(T *)temp3, (T *)temp4, actual_compute_box_num);
|
||||
__bang_taylor4_sin((T *)temp2, ((T *)box) + 4 * actual_compute_box_num,
|
||||
(T *)temp3, (T *)temp4, actual_compute_box_num);
|
||||
#endif
|
||||
__bang_mul_scalar((T *)temp1, (T *)temp1, (T)0.5, actual_compute_box_num);
|
||||
__bang_mul_scalar((T *)temp2, (T *)temp2, (T)0.5, actual_compute_box_num);
|
||||
|
||||
// Temp3 = sinTheta2 * box.h;
|
||||
// Temp4 = cosTheta2 * box.w;
|
||||
__bang_mul((T *)temp3, (T *)temp2, ((T *)box) + 3 * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
__bang_mul((T *)temp4, (T *)temp1, ((T *)box) + 2 * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
// pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
|
||||
// pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
|
||||
__bang_sub((T *)pts_x, (T *)box, (T *)temp3, actual_compute_box_num);
|
||||
__bang_sub((T *)pts_x, (T *)pts_x, (T *)temp4, actual_compute_box_num);
|
||||
__bang_add((T *)pts_x + 1 * actual_compute_box_num, (T *)box, (T *)temp3,
|
||||
actual_compute_box_num);
|
||||
__bang_sub((T *)pts_x + 1 * actual_compute_box_num,
|
||||
(T *)pts_x + 1 * actual_compute_box_num, (T *)temp4,
|
||||
actual_compute_box_num);
|
||||
// Temp3 = cosTheta2 * box.h;
|
||||
// Temp4 = sinTheta2 * box.w;
|
||||
__bang_mul((T *)temp3, (T *)temp1, box + 3 * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
__bang_mul((T *)temp4, (T *)temp2, box + 2 * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
// pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
|
||||
// pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
|
||||
__bang_add((T *)pts_y, (T *)box + 1 * actual_compute_box_num, (T *)temp3,
|
||||
actual_compute_box_num);
|
||||
__bang_sub((T *)pts_y, (T *)pts_y, (T *)temp4, actual_compute_box_num);
|
||||
__bang_sub((T *)pts_y + 1 * actual_compute_box_num,
|
||||
(T *)box + 1 * actual_compute_box_num, (T *)temp3,
|
||||
actual_compute_box_num);
|
||||
__bang_sub((T *)pts_y + 1 * actual_compute_box_num,
|
||||
(T *)pts_y + 1 * actual_compute_box_num, (T *)temp4,
|
||||
actual_compute_box_num);
|
||||
// pts[2].x = 2 * box.x_ctr - pts[0].x;
|
||||
// pts[3].x = 2 * box.x_ctr - pts[1].x;
|
||||
__bang_add((T *)pts_x + 2 * actual_compute_box_num, (T *)box, (T *)box,
|
||||
actual_compute_box_num);
|
||||
__bang_sub((T *)pts_x + 2 * actual_compute_box_num,
|
||||
(T *)pts_x + 2 * actual_compute_box_num, (T *)pts_x,
|
||||
actual_compute_box_num);
|
||||
__bang_add((T *)pts_x + 3 * actual_compute_box_num, (T *)box, (T *)box,
|
||||
actual_compute_box_num);
|
||||
__bang_sub((T *)pts_x + 3 * actual_compute_box_num,
|
||||
(T *)pts_x + 3 * actual_compute_box_num,
|
||||
(T *)pts_x + 1 * actual_compute_box_num, actual_compute_box_num);
|
||||
// pts[2].y = 2 * box.y_ctr - pts[0].y;
|
||||
// pts[3].y = 2 * box.y_ctr - pts[1].y;
|
||||
__bang_add((T *)pts_y + 2 * actual_compute_box_num,
|
||||
(T *)box + 1 * actual_compute_box_num,
|
||||
(T *)box + 1 * actual_compute_box_num, actual_compute_box_num);
|
||||
__bang_sub((T *)pts_y + 2 * actual_compute_box_num,
|
||||
(T *)pts_y + 2 * actual_compute_box_num, (T *)pts_y,
|
||||
actual_compute_box_num);
|
||||
__bang_add((T *)pts_y + 3 * actual_compute_box_num,
|
||||
(T *)box + 1 * actual_compute_box_num,
|
||||
(T *)box + 1 * actual_compute_box_num, actual_compute_box_num);
|
||||
__bang_sub((T *)pts_y + 3 * actual_compute_box_num,
|
||||
(T *)pts_y + 3 * actual_compute_box_num,
|
||||
(T *)pts_y + 1 * actual_compute_box_num, actual_compute_box_num);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__mlu_func__ void getIntersectPts(T *rotated_pts1_x, T *rotated_pts1_y,
|
||||
T *rotated_pts2_x, T *rotated_pts2_y,
|
||||
T *vec1_x, T *vec1_y, T *vec2_x, T *vec2_y,
|
||||
T *intersect_pts_x, T *intersect_pts_y,
|
||||
T *valid_pts, T *nums_in_ram, T *temp1_ram,
|
||||
T *temp2_ram, T *temp3_ram, T *temp4_ram,
|
||||
T *temp5_ram, T *temp6_ram, T *temp7_ram,
|
||||
T *temp8_ram, T *temp9_ram, T *temp10_ram,
|
||||
const uint32_t &actual_compute_box_num) {
|
||||
// Initialize const data to ram
|
||||
// temp3 = const 1e-14(@float), length = COMPUTE_COUNT_ALIGN
|
||||
#if __BANG_ARCH__ >= 300
|
||||
__bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (T)1e-14);
|
||||
#else
|
||||
// NOTE: Since active_reciphp function has strict value range,
|
||||
// [2.2205e-16, 2e6]@float, [0.00391, 65504]@half
|
||||
__bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (float)1e-14);
|
||||
#endif
|
||||
// temp4 = const T(0), length = COMPUTE_COUNT_ALIGN
|
||||
__bang_write_value((T *)temp4_ram, COMPUTE_COUNT_ALIGN, (T)0);
|
||||
// temp5 = const T(1), length = COMPUTE_COUNT_ALIGN
|
||||
__bang_write_value((T *)temp5_ram, COMPUTE_COUNT_ALIGN, (T)1);
|
||||
|
||||
// Line vector, from p1 to p2 is: p1+(p2-p1)*t, t=[0,1]
|
||||
// for i = 0~3, vec[i] = pts[(i+1)%4] - pts[i]
|
||||
__bang_sub((T *)vec1_x, (T *)rotated_pts1_x + actual_compute_box_num,
|
||||
(T *)rotated_pts1_x, 3 * actual_compute_box_num);
|
||||
__bang_sub((T *)vec1_x + 3 * actual_compute_box_num, (T *)rotated_pts1_x,
|
||||
(T *)rotated_pts1_x + 3 * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
__bang_sub((T *)vec1_y, (T *)rotated_pts1_y + actual_compute_box_num,
|
||||
(T *)rotated_pts1_y, 3 * actual_compute_box_num);
|
||||
__bang_sub((T *)vec1_y + 3 * actual_compute_box_num, (T *)rotated_pts1_y,
|
||||
(T *)rotated_pts1_y + 3 * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
|
||||
__bang_sub((T *)vec2_x, (T *)rotated_pts2_x + actual_compute_box_num,
|
||||
(T *)rotated_pts2_x, 3 * actual_compute_box_num);
|
||||
__bang_sub((T *)vec2_x + 3 * actual_compute_box_num, (T *)rotated_pts2_x,
|
||||
(T *)rotated_pts2_x + 3 * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
__bang_sub((T *)vec2_y, (T *)rotated_pts2_y + actual_compute_box_num,
|
||||
(T *)rotated_pts2_y, 3 * actual_compute_box_num);
|
||||
__bang_sub((T *)vec2_y + 3 * actual_compute_box_num, (T *)rotated_pts2_y,
|
||||
(T *)rotated_pts2_y + 3 * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
|
||||
// First, line test - test all line combos for intersection, 4x4 possible
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int j = 0; j < 4; j++) {
|
||||
// T det = cross2d<T>(vec2[j], vec1[i]) -- temp2
|
||||
cross2d<T>((T *)temp2_ram, (T *)vec2_x + j * actual_compute_box_num,
|
||||
(T *)vec2_y + j * actual_compute_box_num,
|
||||
(T *)vec1_x + i * actual_compute_box_num,
|
||||
(T *)vec1_y + i * actual_compute_box_num,
|
||||
actual_compute_box_num, (T *)temp1_ram);
|
||||
// temp8 = sign(det), since active_reciphp only receive positive values
|
||||
__bang_active_sign((T *)temp8_ram, (T *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
// deal with parallel lines, temp2 = fabs(det), temp1 = temp2 > 1e-14
|
||||
__bang_active_abs((T *)temp2_ram, (T *)temp2_ram, actual_compute_box_num);
|
||||
__bang_cycle_gt((T *)temp1_ram, (T *)temp2_ram, (T *)temp3_ram,
|
||||
actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
// Where temp1 = false, set recip input to 1, avoiding recip(0), cause inf
|
||||
__bang_not((T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
|
||||
__bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_add((T *)temp2_ram, (T *)temp2_ram, (T *)temp9_ram,
|
||||
actual_compute_box_num);
|
||||
// temp2 = 1/temp2, use mult (1/temp2) instead of div temp2
|
||||
#if __BANG_ARCH__ >= 300
|
||||
__bang_recip((float *)temp2_ram, (float *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
#else
|
||||
// NOTE: active_reciphp function has strict value range:
|
||||
// [2.2205e-16, 2e6]@float, [0.00391, 65504]@half
|
||||
__bang_active_reciphp((T *)temp2_ram, (T *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
#endif
|
||||
// Restore temp2 invalid box value 1 and sign-bit
|
||||
__bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp8_ram,
|
||||
actual_compute_box_num);
|
||||
|
||||
// auto vec12 = pts2[j] - pts1[i], (temp6, temp7) = (x, y)
|
||||
__bang_sub((T *)temp6_ram,
|
||||
(T *)rotated_pts2_x + j * actual_compute_box_num,
|
||||
(T *)rotated_pts1_x + i * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
__bang_sub((T *)temp7_ram,
|
||||
(T *)rotated_pts2_y + j * actual_compute_box_num,
|
||||
(T *)rotated_pts1_y + i * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
|
||||
// T t1 = cross2d<T>(vec2[j], vec12) mult (1/det) -- temp8
|
||||
cross2d<T>((T *)temp8_ram, (T *)vec2_x + j * actual_compute_box_num,
|
||||
(T *)vec2_y + j * actual_compute_box_num, (T *)temp6_ram,
|
||||
(T *)temp7_ram, actual_compute_box_num, (T *)temp9_ram);
|
||||
__bang_mul((T *)temp8_ram, (T *)temp8_ram, (T *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
|
||||
// temp1 &= (t1 >= 0.0f && t1 <= 1.0f) -- temp9
|
||||
__bang_cycle_ge((T *)temp9_ram, (T *)temp8_ram, (T *)temp4_ram,
|
||||
actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_cycle_le((T *)temp9_ram, (T *)temp8_ram, (T *)temp5_ram,
|
||||
actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram,
|
||||
actual_compute_box_num);
|
||||
|
||||
// T t2 = cross2d<T>(vec1[i], vec12) mult temp2 -- temp9
|
||||
// NOTE: temp8(t1) is used after, reuse temp7(p2_y) as cross2d temp ram
|
||||
cross2d<T>((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num,
|
||||
(T *)vec1_y + i * actual_compute_box_num, (T *)temp6_ram,
|
||||
(T *)temp7_ram, actual_compute_box_num, (T *)temp7_ram);
|
||||
__bang_mul((T *)temp9_ram, (T *)temp9_ram, (T *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
|
||||
// temp1 &= (t2 >= 0.0f && t2 <= 1.0f) -- temp9
|
||||
__bang_cycle_ge((T *)temp7_ram, (T *)temp9_ram, (T *)temp4_ram,
|
||||
actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_cycle_le((T *)temp7_ram, (T *)temp9_ram, (T *)temp5_ram,
|
||||
actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram,
|
||||
actual_compute_box_num);
|
||||
|
||||
// intersections = (pts1[i] + vec1[i] * t1) * temp1
|
||||
__bang_mul((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num,
|
||||
(T *)temp8_ram, actual_compute_box_num);
|
||||
__bang_add((T *)temp9_ram,
|
||||
(T *)rotated_pts1_x + i * actual_compute_box_num,
|
||||
(T *)temp9_ram, actual_compute_box_num);
|
||||
__bang_mul((T *)intersect_pts_x + (4 * i + j) * actual_compute_box_num,
|
||||
(T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
|
||||
__bang_mul((T *)temp9_ram, (T *)vec1_y + i * actual_compute_box_num,
|
||||
(T *)temp8_ram, actual_compute_box_num);
|
||||
__bang_add((T *)temp9_ram,
|
||||
(T *)rotated_pts1_y + i * actual_compute_box_num,
|
||||
(T *)temp9_ram, actual_compute_box_num);
|
||||
__bang_mul((T *)intersect_pts_y + (4 * i + j) * actual_compute_box_num,
|
||||
(T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
|
||||
|
||||
// Assign `valid_pts` bit and accumulate `nums_in` of valid points of each
|
||||
// box pair
|
||||
__bang_or((T *)valid_pts + (4 * i + j) * actual_compute_box_num,
|
||||
(T *)valid_pts + (4 * i + j) * actual_compute_box_num,
|
||||
(T *)temp1_ram, actual_compute_box_num);
|
||||
__bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
|
||||
actual_compute_box_num);
|
||||
}
|
||||
}
|
||||
|
||||
// Check for vertices of rect1 inside rect2
|
||||
// temp5 = ABdotAB
|
||||
dot2d<T>((T *)temp5_ram, (T *)vec2_x, (T *)vec2_y, (T *)vec2_x, (T *)vec2_y,
|
||||
actual_compute_box_num, (T *)temp9_ram);
|
||||
// temp6 = ADdotAD
|
||||
dot2d<T>((T *)temp6_ram, (T *)vec2_x + 3 * actual_compute_box_num,
|
||||
(T *)vec2_y + 3 * actual_compute_box_num,
|
||||
(T *)vec2_x + 3 * actual_compute_box_num,
|
||||
(T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num,
|
||||
(T *)temp9_ram);
|
||||
// assume ABCD is the rectangle, and P is the point to be judged
|
||||
// P is inside ABCD iff. P's projection on AB lines within AB
|
||||
// and P's projection on AD lies within AD
|
||||
for (int i = 0; i < 4; i++) {
|
||||
// AP = pts1[i] - pts2[0] = (temp7, temp8)
|
||||
__bang_sub((T *)temp7_ram, (T *)rotated_pts1_x + i * actual_compute_box_num,
|
||||
(T *)rotated_pts2_x, actual_compute_box_num);
|
||||
__bang_sub((T *)temp8_ram, (T *)rotated_pts1_y + i * actual_compute_box_num,
|
||||
(T *)rotated_pts2_y, actual_compute_box_num);
|
||||
|
||||
// temp9 = APdotAB = dot2d<T>(AP, AB)
|
||||
dot2d<T>((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec2_x,
|
||||
(T *)vec2_y, actual_compute_box_num, (T *)temp2_ram);
|
||||
// temp10 = APdotAD = -dot2d<T>(AP, DA)
|
||||
dot2d<T>((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram,
|
||||
(T *)vec2_x + 3 * actual_compute_box_num,
|
||||
(T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num,
|
||||
(T *)temp2_ram);
|
||||
__bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1,
|
||||
actual_compute_box_num);
|
||||
|
||||
// ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <=
|
||||
// ADdotAD))
|
||||
__bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram,
|
||||
actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram,
|
||||
actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
|
||||
// 16 means the 4x4 possible intersection points above
|
||||
__bang_mul((T *)intersect_pts_x + (16 + i) * actual_compute_box_num,
|
||||
(T *)temp1_ram, (T *)rotated_pts1_x + i * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
__bang_mul((T *)intersect_pts_y + (16 + i) * actual_compute_box_num,
|
||||
(T *)temp1_ram, (T *)rotated_pts1_y + i * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
|
||||
// assign valid_pts bit and accumulate nums of valid points of each box pair
|
||||
__bang_or((T *)valid_pts + (16 + i) * actual_compute_box_num,
|
||||
(T *)valid_pts + (16 + i) * actual_compute_box_num,
|
||||
(T *)temp1_ram, actual_compute_box_num);
|
||||
__bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
|
||||
actual_compute_box_num);
|
||||
}
|
||||
|
||||
// Reverse the check - check for vertices of rect2 inside rect1
|
||||
// temp5 = ABdotAB
|
||||
dot2d<T>((T *)temp5_ram, (T *)vec1_x, (T *)vec1_y, (T *)vec1_x, (T *)vec1_y,
|
||||
actual_compute_box_num, (T *)temp9_ram);
|
||||
// temp6 = ADdotAD
|
||||
dot2d<T>((T *)temp6_ram, (T *)vec1_x + 3 * actual_compute_box_num,
|
||||
(T *)vec1_y + 3 * actual_compute_box_num,
|
||||
(T *)vec1_x + 3 * actual_compute_box_num,
|
||||
(T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num,
|
||||
(T *)temp9_ram);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
// AP = pts2[i] - pts1[0] = (temp7, temp8)
|
||||
__bang_sub((T *)temp7_ram, (T *)rotated_pts2_x + i * actual_compute_box_num,
|
||||
(T *)rotated_pts1_x, actual_compute_box_num);
|
||||
__bang_sub((T *)temp8_ram, (T *)rotated_pts2_y + i * actual_compute_box_num,
|
||||
(T *)rotated_pts1_y, actual_compute_box_num);
|
||||
|
||||
// temp9 = APdotAB = dot2d<T>(AP, AB)
|
||||
dot2d<T>((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec1_x,
|
||||
(T *)vec1_y, actual_compute_box_num, (T *)temp2_ram);
|
||||
// temp10 = APdotAD = -dot2d<T>(AP, DA)
|
||||
dot2d<T>((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram,
|
||||
(T *)vec1_x + 3 * actual_compute_box_num,
|
||||
(T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num,
|
||||
(T *)temp2_ram);
|
||||
__bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1,
|
||||
actual_compute_box_num);
|
||||
|
||||
// ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <=
|
||||
// ADdotAD))
|
||||
__bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram,
|
||||
actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram,
|
||||
actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
|
||||
actual_compute_box_num);
|
||||
|
||||
// 20 means the (4x4+4) possible intersection points above
|
||||
__bang_mul((T *)intersect_pts_x + (20 + i) * actual_compute_box_num,
|
||||
(T *)temp1_ram, (T *)rotated_pts2_x + i * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
__bang_mul((T *)intersect_pts_y + (20 + i) * actual_compute_box_num,
|
||||
(T *)temp1_ram, (T *)rotated_pts2_y + i * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
|
||||
// assign valid_pts bit and accumulate nums of valid points of each box pair
|
||||
__bang_or((T *)valid_pts + (20 + i) * actual_compute_box_num,
|
||||
(T *)valid_pts + (20 + i) * actual_compute_box_num,
|
||||
(T *)temp1_ram, actual_compute_box_num);
|
||||
__bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
|
||||
actual_compute_box_num);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__mlu_func__ void convexHullGraham(
|
||||
T *intersect_pts_x, T *intersect_pts_y, T *ordered_pts_x, T *ordered_pts_y,
|
||||
T *dist_ram, T *valid_box, T *valid_pts, T *nums_in_ram, T *temp1_ram,
|
||||
T *temp2_ram, T *temp3_ram, T *temp_long_1, T *temp_long_2, T *temp_long_3,
|
||||
const uint32_t &actual_box_num, const uint32_t &actual_compute_box_num) {
|
||||
// Step1. Find the point with minimum y, if more than 1 points have the same
|
||||
// minimum y,
|
||||
// pick the one with the minimum x.
|
||||
// set p[i].y to max_y_value if not valid_pts, to avoid invalid result
|
||||
// 24 means all possible intersection points
|
||||
__bang_max((T *)temp2_ram, (T *)intersect_pts_y, 24 * actual_compute_box_num);
|
||||
__bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, ((T *)temp2_ram)[0]);
|
||||
__bang_not((T *)temp_long_1, (T *)valid_pts, 24 * actual_compute_box_num);
|
||||
__bang_cycle_mul((T *)temp_long_1, (T *)temp_long_1, (T *)temp3_ram,
|
||||
24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_mul((T *)temp_long_2, (T *)intersect_pts_y, (T *)valid_pts,
|
||||
24 * actual_compute_box_num);
|
||||
__bang_add((T *)temp_long_2, (T *)temp_long_2, (T *)temp_long_1,
|
||||
24 * actual_compute_box_num);
|
||||
// temp2 = min_y_value(temp_long_2), use min_pool, channel=box_num, h=1, w=24
|
||||
__bang_minpool((T *)temp2_ram, (T *)temp_long_2, actual_compute_box_num, 1,
|
||||
24, 1, 24, 1, 24);
|
||||
__bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)valid_box,
|
||||
actual_compute_box_num);
|
||||
|
||||
// set p[i].x to max_x_value if not min_y point
|
||||
__bang_max((T *)temp1_ram, (T *)intersect_pts_x, 24 * actual_compute_box_num);
|
||||
__bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, ((T *)temp1_ram)[0]);
|
||||
__bang_cycle_eq((T *)temp_long_1, (T *)temp_long_2, (T *)temp2_ram,
|
||||
24 * actual_compute_box_num, actual_compute_box_num);
|
||||
__bang_and((T *)temp_long_1, (T *)temp_long_1, (T *)valid_pts,
|
||||
24 * actual_compute_box_num);
|
||||
__bang_not((T *)temp_long_3, (T *)temp_long_1, 24 * actual_compute_box_num);
|
||||
__bang_cycle_mul((T *)temp_long_3, (T *)temp_long_3, (T *)temp3_ram,
|
||||
24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_mul((T *)temp_long_1, (T *)intersect_pts_x, (T *)temp_long_1,
|
||||
24 * actual_compute_box_num);
|
||||
__bang_add((T *)temp_long_1, (T *)temp_long_1, (T *)temp_long_3,
|
||||
24 * actual_compute_box_num);
|
||||
// temp3 = min_x_value(temp_long_1), use min_pool, channel=box_num, h=1, w=24
|
||||
__bang_minpool((T *)temp3_ram, (T *)temp_long_1, actual_compute_box_num, 1,
|
||||
24, 1, 24, 1, 24);
|
||||
__bang_mul((T *)temp3_ram, (T *)temp3_ram, (T *)valid_box,
|
||||
actual_compute_box_num);
|
||||
|
||||
// Step2. All points subtract starting-point (for sorting in the next step)
|
||||
__bang_cycle_sub((T *)ordered_pts_x, (T *)intersect_pts_x, (T *)temp3_ram,
|
||||
24 * actual_compute_box_num, actual_compute_box_num);
|
||||
__bang_cycle_sub((T *)ordered_pts_y, (T *)intersect_pts_y, (T *)temp2_ram,
|
||||
24 * actual_compute_box_num, actual_compute_box_num);
|
||||
__bang_mul((T *)ordered_pts_x, (T *)ordered_pts_x, (T *)valid_pts,
|
||||
24 * actual_compute_box_num);
|
||||
__bang_mul((T *)ordered_pts_y, (T *)ordered_pts_y, (T *)valid_pts,
|
||||
24 * actual_compute_box_num);
|
||||
|
||||
// Step3. Sort every intersection point according to their relative
|
||||
// cross-product values (essentially sorting according to angles)
|
||||
// If the angles are the same, sort according to distance to origin
|
||||
dot2d<T>((T *)dist_ram, (T *)ordered_pts_x, (T *)ordered_pts_y,
|
||||
(T *)ordered_pts_x, (T *)ordered_pts_y, 24 * actual_compute_box_num,
|
||||
(T *)temp_long_3);
|
||||
|
||||
T temp, temp_nums_in, temp_dist_1, temp_dist_2;
|
||||
T temp1_x, temp1_y;
|
||||
T temp2_x, temp2_y;
|
||||
for (int i = 0; i < actual_box_num; i++) {
|
||||
if (((T *)valid_box)[i]) {
|
||||
// make sure all nums_in[i] points are at the front
|
||||
for (int ii = 0; ii < 23; ii++) {
|
||||
for (int jj = ii + 1; jj < 24; jj++) {
|
||||
int ii_index = ii * actual_compute_box_num + i;
|
||||
int jj_index = jj * actual_compute_box_num + i;
|
||||
// ii point is not valid and jj point is valid, swap jj for ii
|
||||
if ((!((T *)valid_pts)[ii_index]) && ((T *)valid_pts)[jj_index]) {
|
||||
((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[jj_index];
|
||||
((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[jj_index];
|
||||
((T *)dist_ram)[ii_index] = ((T *)dist_ram)[jj_index];
|
||||
((T *)valid_pts)[ii_index] = true;
|
||||
((T *)ordered_pts_x)[jj_index] = 0;
|
||||
((T *)ordered_pts_y)[jj_index] = 0;
|
||||
((T *)dist_ram)[jj_index] = 0;
|
||||
((T *)valid_pts)[jj_index] = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
temp_nums_in = ((T *)nums_in_ram)[i];
|
||||
// make original q[0] = min_x, min_y before sort
|
||||
for (int ii = 1; ii < temp_nums_in; ii++) {
|
||||
int ii_index = ii * actual_compute_box_num + i;
|
||||
if (((T *)dist_ram)[ii_index] == 0) {
|
||||
// swap q[ii_index] and q[0]
|
||||
((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[i];
|
||||
((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[i];
|
||||
((T *)dist_ram)[ii_index] = ((T *)dist_ram)[i];
|
||||
((T *)ordered_pts_x)[i] = 0;
|
||||
((T *)ordered_pts_y)[i] = 0;
|
||||
((T *)dist_ram)[i] = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (int ii = 1; ii < temp_nums_in - 1; ii++) {
|
||||
for (int jj = ii + 1; jj < temp_nums_in; jj++) {
|
||||
int ii_index = ii * actual_compute_box_num + i;
|
||||
int jj_index = jj * actual_compute_box_num + i;
|
||||
temp1_x = ((T *)ordered_pts_x)[ii_index];
|
||||
temp1_y = ((T *)ordered_pts_y)[ii_index];
|
||||
temp2_x = ((T *)ordered_pts_x)[jj_index];
|
||||
temp2_y = ((T *)ordered_pts_y)[jj_index];
|
||||
// calculate cross product and sort q (ordered_pts)
|
||||
temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
|
||||
temp_dist_1 = ((T *)dist_ram)[ii_index];
|
||||
temp_dist_2 = ((T *)dist_ram)[jj_index];
|
||||
if ((temp < (T)-1e-6) ||
|
||||
((fabs(temp) < (T)1e-6) && (temp_dist_1 > temp_dist_2))) {
|
||||
((T *)ordered_pts_x)[ii_index] = temp2_x;
|
||||
((T *)ordered_pts_y)[ii_index] = temp2_y;
|
||||
((T *)ordered_pts_x)[jj_index] = temp1_x;
|
||||
((T *)ordered_pts_y)[jj_index] = temp1_y;
|
||||
((T *)dist_ram)[ii_index] = temp_dist_2;
|
||||
((T *)dist_ram)[jj_index] = temp_dist_1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Step4:
|
||||
// Make sure there are at least 2 points(that don't overlap with each
|
||||
// other) in the stack
|
||||
int k; // index of the non-overlapped second point
|
||||
for (k = 1; k < temp_nums_in; k++) {
|
||||
if (((T *)dist_ram)[k * actual_compute_box_num + i] > (T)1e-8) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (k == temp_nums_in) {
|
||||
// We reach the end, which means the convex hull is just one point
|
||||
// set valid_box = 0, to get ious = 0
|
||||
((T *)valid_box)[i] = 0;
|
||||
continue;
|
||||
}
|
||||
// q[1] = q[k];
|
||||
((T *)ordered_pts_x)[actual_compute_box_num + i] =
|
||||
((T *)ordered_pts_x)[k * actual_compute_box_num + i];
|
||||
((T *)ordered_pts_y)[actual_compute_box_num + i] =
|
||||
((T *)ordered_pts_y)[k * actual_compute_box_num + i];
|
||||
|
||||
// Step 5:
|
||||
// Finally we can start the scanning process.
|
||||
// When a non-convex relationship between the 3 points is found
|
||||
// (either concave shape or duplicated points),
|
||||
// we pop the previous point from the stack
|
||||
// until the 3-point relationship is convex again, or
|
||||
// until the stack only contains two points
|
||||
int m = 2; // 2 points in the stack
|
||||
for (int j = k + 1; j < temp_nums_in; j++) {
|
||||
// while (m > 1 && cross2d<T>(q[j] - q[m - 2], q[m - 1] - q[m - 2]) >=
|
||||
// 0) {
|
||||
// m--;
|
||||
// }
|
||||
temp1_x = ((T *)ordered_pts_x)[j * actual_compute_box_num + i] -
|
||||
((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
|
||||
temp1_y = ((T *)ordered_pts_y)[j * actual_compute_box_num + i] -
|
||||
((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
|
||||
temp2_x = ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] -
|
||||
((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
|
||||
temp2_y = ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] -
|
||||
((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
|
||||
temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
|
||||
while ((m > 1) && (temp >= 0)) {
|
||||
m--;
|
||||
if (m > 1) {
|
||||
temp1_x =
|
||||
((T *)ordered_pts_x)[j * actual_compute_box_num + i] -
|
||||
((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
|
||||
temp1_y =
|
||||
((T *)ordered_pts_y)[j * actual_compute_box_num + i] -
|
||||
((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
|
||||
temp2_x =
|
||||
((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] -
|
||||
((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
|
||||
temp2_y =
|
||||
((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] -
|
||||
((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
|
||||
temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
|
||||
}
|
||||
}
|
||||
// q[m++] = q[j];
|
||||
((T *)ordered_pts_x)[m * actual_compute_box_num + i] =
|
||||
((T *)ordered_pts_x)[j * actual_compute_box_num + i];
|
||||
((T *)ordered_pts_y)[m * actual_compute_box_num + i] =
|
||||
((T *)ordered_pts_y)[j * actual_compute_box_num + i];
|
||||
m++;
|
||||
}
|
||||
// set last(24-m) valid_pts to false, to erase invalid q in polygon area
|
||||
for (int j = m; j < temp_nums_in; j++) {
|
||||
((T *)valid_pts)[j * actual_compute_box_num + i] = 0;
|
||||
}
|
||||
((T *)nums_in_ram)[i] = m;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y, T *valid_box,
|
||||
T *valid_pts, T *nums_in_ram, T *temp1_ram,
|
||||
T *temp2_ram, T *temp3_ram, T *temp4_ram,
|
||||
T *temp5_ram, T *temp6_ram, T *temp7_ram,
|
||||
T *temp8_ram, T *temp9_ram,
|
||||
const uint32_t &actual_compute_box_num) {
|
||||
// Set where nums_in <= 2, valid_box = false
|
||||
__bang_write_value((T *)temp9_ram, COMPUTE_COUNT_ALIGN, (T)2);
|
||||
__bang_cycle_gt((T *)temp1_ram, (T *)nums_in_ram, (T *)temp9_ram,
|
||||
actual_compute_box_num, COMPUTE_COUNT_ALIGN);
|
||||
__bang_and((T *)valid_box, (T *)valid_box, (T *)temp1_ram,
|
||||
actual_compute_box_num);
|
||||
|
||||
// temp1 = area, initialize with all 0
|
||||
__bang_write_zero((T *)temp1_ram, actual_compute_box_num);
|
||||
__bang_max((T *)temp7_ram, (T *)nums_in_ram, actual_compute_box_num);
|
||||
|
||||
// temp_nums_in = max(nums_in)
|
||||
T temp_nums_in = ((T *)temp7_ram)[0];
|
||||
for (int i = 1; i < temp_nums_in - 1; i++) {
|
||||
// q[i] - q[0]: (temp6, temp7)
|
||||
__bang_sub((T *)temp6_ram, (T *)ordered_pts_x + i * actual_compute_box_num,
|
||||
(T *)ordered_pts_x, actual_compute_box_num);
|
||||
__bang_sub((T *)temp7_ram, (T *)ordered_pts_y + i * actual_compute_box_num,
|
||||
(T *)ordered_pts_y, actual_compute_box_num);
|
||||
__bang_mul((T *)temp6_ram, (T *)temp6_ram,
|
||||
(T *)valid_pts + (i + 1) * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
__bang_mul((T *)temp7_ram, (T *)temp7_ram,
|
||||
(T *)valid_pts + (i + 1) * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
// q[i + 1] - q[0]: (temp8, temp9)
|
||||
__bang_sub((T *)temp8_ram,
|
||||
(T *)ordered_pts_x + (i + 1) * actual_compute_box_num,
|
||||
(T *)ordered_pts_x, actual_compute_box_num);
|
||||
__bang_sub((T *)temp9_ram,
|
||||
(T *)ordered_pts_y + (i + 1) * actual_compute_box_num,
|
||||
(T *)ordered_pts_y, actual_compute_box_num);
|
||||
__bang_mul((T *)temp8_ram, (T *)temp8_ram,
|
||||
(T *)valid_pts + (i + 1) * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
__bang_mul((T *)temp9_ram, (T *)temp9_ram,
|
||||
(T *)valid_pts + (i + 1) * actual_compute_box_num,
|
||||
actual_compute_box_num);
|
||||
// area += fabs(cross2d<T>(q[i] - q[0], q[i + 1] - q[0]));
|
||||
__bang_mul((T *)temp4_ram, (T *)temp6_ram, (T *)temp9_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_mul((T *)temp5_ram, (T *)temp7_ram, (T *)temp8_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_sub((T *)temp3_ram, (T *)temp4_ram, (T *)temp5_ram,
|
||||
actual_compute_box_num);
|
||||
__bang_active_abs((T *)temp3_ram, (T *)temp3_ram, actual_compute_box_num);
|
||||
__bang_add((T *)temp1_ram, (T *)temp1_ram, (T *)temp3_ram,
|
||||
actual_compute_box_num);
|
||||
}
|
||||
// Set where valid_box = false, intersection = 0
|
||||
__bang_mul((T *)temp1_ram, (T *)temp1_ram, (T *)valid_box,
|
||||
actual_compute_box_num);
|
||||
// area = area / 2.0
|
||||
__bang_mul_scalar((T *)temp1_ram, (T *)temp1_ram, (T)0.5,
|
||||
actual_compute_box_num);
|
||||
}
|
||||
|
||||
#endif // IOU3D_UTILS_HPP_
|
144
mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp
Normal file
144
mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp
Normal file
@ -0,0 +1,144 @@
|
||||
/*************************************************************************
|
||||
* Copyright (C) 2022 Cambricon.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*************************************************************************/
|
||||
|
||||
#include "pytorch_device_registry.hpp"
|
||||
#include "pytorch_mlu_helper.hpp"
|
||||
|
||||
void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
|
||||
const cnrtDataType_t data_type_input, const void *boxes_dram,
|
||||
const int input_box_num, const float iou_threshold,
|
||||
void *workspace, void *output_size, void *output);
|
||||
|
||||
int selectType(uint32_t use_job, int box_num_per_core) {
|
||||
// the box_num_per_core should be at least 256, otherwise the real IO
|
||||
// bandwidth would be very low
|
||||
while (box_num_per_core < 256 && use_job >= 4) {
|
||||
box_num_per_core *= 2;
|
||||
use_job /= 2;
|
||||
}
|
||||
return use_job;
|
||||
}
|
||||
static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
|
||||
int &core_num_per_class,
|
||||
const int input_box_num) {
|
||||
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
|
||||
uint32_t job_limit = getJobLimitCapability();
|
||||
uint32_t core_number = job_limit;
|
||||
|
||||
int box_num_per_core = (input_box_num + core_number - 1) / core_number;
|
||||
int use_job = selectType(job_limit, box_num_per_core);
|
||||
// initiate k_type as Union1
|
||||
k_dim->x = core_dim;
|
||||
k_dim->y = 1;
|
||||
k_dim->z = 1;
|
||||
*k_type = CNRT_FUNC_TYPE_UNION1;
|
||||
switch (job_limit) {
|
||||
case CN_KERNEL_CLASS_BLOCK:
|
||||
case CN_KERNEL_CLASS_UNION:
|
||||
case CN_KERNEL_CLASS_UNION2:
|
||||
case CN_KERNEL_CLASS_UNION4:
|
||||
case CN_KERNEL_CLASS_UNION8:
|
||||
case CN_KERNEL_CLASS_UNION16: {
|
||||
if (use_job < 4) {
|
||||
k_dim->x = 1;
|
||||
*k_type = CNRT_FUNC_TYPE_BLOCK;
|
||||
} else if (use_job == 4) {
|
||||
k_dim->x = core_dim;
|
||||
*k_type = CNRT_FUNC_TYPE_UNION1;
|
||||
} else {
|
||||
k_dim->x = use_job;
|
||||
*k_type = (cnrtFunctionType_t)use_job;
|
||||
}
|
||||
}; break;
|
||||
default:
|
||||
LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number."
|
||||
<< " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task.";
|
||||
}
|
||||
return CNNL_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
|
||||
float iou_threshold) {
|
||||
// dimension parameters check
|
||||
TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
|
||||
boxes.dim(), "D");
|
||||
TORCH_CHECK(boxes.size(1) == 7,
|
||||
"boxes should have 7 elements in dimension 1, got ",
|
||||
boxes.size(1));
|
||||
|
||||
// data type check
|
||||
TORCH_CHECK(
|
||||
boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
|
||||
"data type of boxes should be Float or Half, got ", boxes.scalar_type());
|
||||
|
||||
if (boxes.numel() == 0) {
|
||||
return;
|
||||
}
|
||||
const size_t max_input_num = 2147483648; // 2^31, 2G num
|
||||
TORCH_CHECK(boxes.numel() < max_input_num,
|
||||
"boxes.numel() should be less than 2147483648, got ",
|
||||
boxes.numel());
|
||||
int input_box_num = boxes.size(0);
|
||||
|
||||
cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
|
||||
cnrtDim3_t k_dim;
|
||||
cnrtJobType_t k_type;
|
||||
|
||||
int core_num_per_class;
|
||||
policyFunc(&k_dim, &k_type, core_num_per_class, input_box_num);
|
||||
|
||||
// transpose boxes (n, 7) to (7, n) for better performance
|
||||
auto boxes_t = boxes.transpose(0, 1);
|
||||
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
|
||||
|
||||
auto output = at::empty({input_box_num}, boxes.options().dtype(at::kLong));
|
||||
auto output_size = at::empty({1}, boxes.options().dtype(at::kInt));
|
||||
|
||||
// workspace
|
||||
const int info_num = 7; // x, y,z, dx, dy, dz,angle
|
||||
size_t space_size = 0;
|
||||
if (boxes.scalar_type() == at::kHalf) {
|
||||
space_size = input_box_num * sizeof(int16_t) * info_num +
|
||||
input_box_num * sizeof(float) + sizeof(float);
|
||||
} else {
|
||||
space_size = input_box_num * sizeof(float) * (info_num + 1) + sizeof(float);
|
||||
}
|
||||
|
||||
auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
|
||||
|
||||
// get compute queue
|
||||
auto queue = torch_mlu::getCurQueue();
|
||||
|
||||
auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
|
||||
auto boxes_ptr = boxes_impl->cnnlMalloc();
|
||||
auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
|
||||
auto workspace_ptr = workspace_impl->cnnlMalloc();
|
||||
auto output_impl = torch_mlu::getMluTensorImpl(keep);
|
||||
auto output_ptr = output_impl->cnnlMalloc();
|
||||
auto output_size_impl = torch_mlu::getMluTensorImpl(keep_num);
|
||||
auto output_size_ptr = output_size_impl->cnnlMalloc();
|
||||
|
||||
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
|
||||
CNLOG(INFO) << "Launch Kernel KernelIou3d<<<Union" << k_type / core_dim
|
||||
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
|
||||
KernelIou3d(k_dim, k_type, queue, data_type_input, boxes_ptr, input_box_num,
|
||||
iou_threshold, workspace_ptr, output_size_ptr, output_ptr);
|
||||
}
|
||||
|
||||
void iou3d_nms3d_forward_mlu(const Tensor boxes, Tensor &keep, Tensor &keep_num,
|
||||
float nms_overlap_thresh) {
|
||||
IoU3DNMS3DMLUKernelLauncher(boxes, keep, keep_num, nms_overlap_thresh);
|
||||
}
|
||||
|
||||
void iou3d_nms3d_forward_impl(const Tensor boxes, Tensor &keep,
|
||||
Tensor &keep_num, float nms_overlap_thresh);
|
||||
REGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, MLU, iou3d_nms3d_forward_mlu);
|
@ -4,7 +4,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
from mmcv.ops import boxes_iou3d, boxes_overlap_bev, nms3d, nms3d_normal
|
||||
from mmcv.utils import IS_CUDA_AVAILABLE
|
||||
from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
|
||||
|
||||
|
||||
@pytest.mark.parametrize('device', [
|
||||
@ -73,7 +73,11 @@ def test_boxes_iou3d(device):
|
||||
pytest.param(
|
||||
'cuda',
|
||||
marks=pytest.mark.skipif(
|
||||
not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
|
||||
not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
|
||||
pytest.param(
|
||||
'mlu',
|
||||
marks=pytest.mark.skipif(
|
||||
not IS_MLU_AVAILABLE, reason='requires MLU support'))
|
||||
])
|
||||
def test_nms3d(device):
|
||||
# test for 5 boxes
|
||||
@ -92,14 +96,20 @@ def test_nms3d(device):
|
||||
assert np.allclose(inds.cpu().numpy(), np_inds)
|
||||
|
||||
# test for many boxes
|
||||
np.random.seed(42)
|
||||
np_boxes = np.random.rand(555, 7).astype(np.float32)
|
||||
np_scores = np.random.rand(555).astype(np.float32)
|
||||
boxes = torch.from_numpy(np_boxes)
|
||||
scores = torch.from_numpy(np_scores)
|
||||
inds = nms3d(boxes.to(device), scores.to(device), iou_threshold=0.3)
|
||||
# In the float data type calculation process, float will be converted to
|
||||
# double in CUDA kernel (https://github.com/open-mmlab/mmcv/blob
|
||||
# /master/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp#L61),
|
||||
# always use float in MLU kernel. The difference between the mentioned
|
||||
# above leads to different results.
|
||||
if device != 'mlu':
|
||||
np.random.seed(42)
|
||||
np_boxes = np.random.rand(555, 7).astype(np.float32)
|
||||
np_scores = np.random.rand(555).astype(np.float32)
|
||||
boxes = torch.from_numpy(np_boxes)
|
||||
scores = torch.from_numpy(np_scores)
|
||||
inds = nms3d(boxes.to(device), scores.to(device), iou_threshold=0.3)
|
||||
|
||||
assert len(inds.cpu().numpy()) == 176
|
||||
assert len(inds.cpu().numpy()) == 176
|
||||
|
||||
|
||||
@pytest.mark.parametrize('device', [
|
||||
|
Loading…
x
Reference in New Issue
Block a user