mirror of https://github.com/open-mmlab/mmcv.git
[Refactor] Replace the MLU ops implementation with mlu-ops (#2750)
parent e197effeca
commit 8725e683b4
@@ -1,431 +0,0 @@
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/

#include "common_mlu_helper.hpp"
#include "iou3d_utils.hpp"

#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)

/* NRAM buffer
 * Suppose we process N boxes at a time.
 * ------------------------------------------------------------------------
 * | Basic | score (1N) +       | intersect_pts (48N) |                   |
 * |       | valid_box (1N) +   | + ordered_pts (48N) | temp_long (72N)   |
 * |       | temp_buffer (10N)  |                     |                   |
 * |-------|--------------------|---------------------|-------------------|
 * | Reuse | null               | null                | rotated_pts (16N) |
 * |-------|--------------------|---------------------|-------------------|
 *
 * ---------------------------------------------------------------------------
 * | Basic | dist_ram (24N)  | valid_pts (24N)    | box1 (5N)   | box1_buffer (5KB) |
 * |       |                 | + nums_in_ram (1N) | + box2 (5N) | + nram_save (5KB) |
 * |-------|-----------------|--------------------|-------------|-------------------|
 * | Reuse | vec_buffer (5N) | null               | null        | null              |
 * |-------|-----------------|--------------------|-------------|-------------------|
 * Total Basic Memory Size = 239N * sizeof(float) + 10KB
 */

__nram__ char nram_buffer[MAX_NRAM_SIZE];
__mlu_shared__ char sram_buffer[SIZE_SRAM_BUF];

template <typename T>
__mlu_func__ void iou3D_detection(int32_t &result_box_num, int32_t *output_data,
                                  const T *boxes_data, float *scores_data,
                                  const int core_limit, const int input_box_num,
                                  const float iou_threshold,
                                  mluMemcpyDirection_t scores_load_dir,
                                  mluMemcpyDirection_t scores_store_dir,
                                  mluMemcpyDirection_t boxes_load_dir) {
  // NRAM is split into (2 + 4 * COMPUTE_COUNT_ALIGN) copies, counted in bytes
  const int nram_save_limit_count = 256;
  int box_read_limit_count = 256;
  float div_thresh_iou = 1.0 / iou_threshold;
  // every box requires 239 * sizeof(float) bytes of space in NRAM
  const int32_t copies_of_nram = 239 * sizeof(float);
  const int32_t limit = (MAX_NRAM_SIZE - 5 * box_read_limit_count * sizeof(T) -
                         nram_save_limit_count * sizeof(int32_t)) /
                        copies_of_nram;
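  // A worked example of the NRAM budget above (illustrative only; the
  // MAX_NRAM_SIZE value here is an assumption, not taken from this file):
  //   MAX_NRAM_SIZE = 384 * 1024 = 393216 bytes, T = float
  //   reserved      = 5 * 256 * 4 + 256 * 4 = 6144 bytes
  //   limit         = (393216 - 6144) / (239 * 4) = 387072 / 956 ≈ 404
  // i.e. roughly 404 boxes fit in one NRAM pass under these assumptions.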

  // x, y, z, dx, dy, dz, angle
  const T *input_x_ptr = boxes_data;
  const T *input_y_ptr = input_x_ptr + input_box_num;
  const T *input_dx_ptr = input_y_ptr + 2 * input_box_num;
  const T *input_dy_ptr = input_dx_ptr + input_box_num;
  const T *input_angle_ptr = input_dy_ptr + 2 * input_box_num;
  float *input_score_ptr = scores_data;

  // data split
  int avg_cluster = 0;
  int rem_cluster = 0;
  int len_cluster = 0;
  int cluster_offset = 0;
  if (clusterDim > 0) {
    // union
    avg_cluster = input_box_num / clusterDim;
    rem_cluster = input_box_num % clusterDim;
    len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
    cluster_offset = avg_cluster * clusterId +
                     (clusterId <= rem_cluster ? clusterId : rem_cluster);
  } else {
    // block
    len_cluster = input_box_num;
    cluster_offset = 0;
  }
  int len_core = input_box_num;
  int input_offset = 0;
  if (core_limit > 1) {
    int avg_core = len_cluster / coreDim;
    int rem_core = len_cluster % coreDim;
    len_core = avg_core + (coreId < rem_core ? 1 : 0);
    int core_offset =
        avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
    input_offset = cluster_offset + core_offset;
  }
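  // The split above hands each of `parts` workers either ceil(total/parts)
  // or floor(total/parts) items: the first (total % parts) workers each get
  // one extra. Host-side sketch of the same scheme (illustrative names only):
  //   void split_task(int total, int parts, int id, int *len, int *offset) {
  //     int avg = total / parts, rem = total % parts;
  //     *len = avg + (id < rem ? 1 : 0);
  //     *offset = avg * id + (id < rem ? id : rem);
  //   }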

  int32_t max_seg_pad = IOU3D_DOWN(limit, IOU3D_SIZE);
  int repeat_iou_compute = len_core / max_seg_pad;
  int remain_iou_compute = len_core % max_seg_pad;

  // basic consistent memory layout
  void *score = ((char *)nram_buffer);
  void *valid_box = ((char *)score) + 1 * max_seg_pad * sizeof(float);
  void *temp_buffer = ((char *)valid_box) + 1 * max_seg_pad * sizeof(float);
  void *intersect_pts_x =
      ((char *)temp_buffer) + 10 * max_seg_pad * sizeof(float);
  void *intersect_pts_y =
      ((char *)intersect_pts_x) + 24 * max_seg_pad * sizeof(float);
  void *ordered_pts_x =
      ((char *)intersect_pts_y) + 24 * max_seg_pad * sizeof(float);
  void *ordered_pts_y =
      ((char *)ordered_pts_x) + 24 * max_seg_pad * sizeof(float);
  void *temp_long_1 =
      ((char *)ordered_pts_y) + 24 * max_seg_pad * sizeof(float);
  void *temp_long_2 = ((char *)temp_long_1) + 24 * max_seg_pad * sizeof(float);
  void *temp_long_3 = ((char *)temp_long_2) + 24 * max_seg_pad * sizeof(float);
  void *dist_ram = ((char *)temp_long_3) + 24 * max_seg_pad * sizeof(float);
  void *valid_pts = ((char *)dist_ram) + 24 * max_seg_pad * sizeof(float);
  void *nums_in_ram = ((char *)valid_pts) + 24 * max_seg_pad * sizeof(float);
  T *box1 = (T *)(((char *)nums_in_ram) + 1 * max_seg_pad * sizeof(float));
  T *box2 = (T *)(((char *)box1) + 5 * max_seg_pad * sizeof(float));
  void *box1_buffer = ((char *)box2) + 5 * max_seg_pad * sizeof(float);
  int32_t *nram_save =
      (int32_t *)(((char *)box1_buffer) + 5 * box_read_limit_count * sizeof(T));
  // nram_save ~ nram_save_limit_count * sizeof(int32_t)
  int nram_save_count = 0;

  // reuse memory
  void *rotated_pts1_x = ((char *)dist_ram);
  void *rotated_pts1_y =
      ((char *)rotated_pts1_x) + 4 * max_seg_pad * sizeof(float);
  void *rotated_pts2_x =
      ((char *)rotated_pts1_y) + 4 * max_seg_pad * sizeof(float);
  void *rotated_pts2_y =
      ((char *)rotated_pts2_x) + 4 * max_seg_pad * sizeof(float);
  void *vec_buffer = ((char *)temp_long_1) + 5 * max_seg_pad * sizeof(float);
  // vec_buffer ~ 16 * max_seg_pad * sizeof(float)
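  // The pointer chain above carves one flat NRAM byte buffer into typed
  // regions, each starting where the previous region ends. A minimal sketch
  // of the pattern (hypothetical sizes, not this kernel's actual layout):
  //   char *base = nram_buffer;
  //   float *a = (float *)base;                            // a: N floats
  //   float *b = (float *)(base + N * sizeof(float));      // b follows a
  //   float *c = (float *)(base + 2 * N * sizeof(float));  // c follows b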

  // First, initialize the RAM with zeros, otherwise nan/inf may produce
  // unexpected results
  __bang_write_zero((unsigned char *)nram_buffer, copies_of_nram * max_seg_pad);
  // the shift by 8 and the mask 0xff rely on box_read_limit_count being
  // initialized to 256
  const int max_box_seg_id = (input_box_num - 1) >> 8;
  const int last_rem_box_number = ((input_box_num - 1) & 0xff) + 1;
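  // Shift/mask equivalences used above, valid for a power-of-two chunk of 256:
  //   cur_box >> 8   == cur_box / 256   (segment id)
  //   cur_box & 0xff == cur_box % 256   (index within the segment)
  // e.g. input_box_num = 700: max_box_seg_id = 699 >> 8 = 2 and
  //      last_rem_box_number = (699 & 0xff) + 1 = 188, since 2 * 256 + 188 = 700.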
  for (int32_t cur_box = 0; cur_box < input_box_num; ++cur_box) {
    __sync_all();
    int box_seg_id = cur_box >> 8, box_id = cur_box & 0xff;
    box_read_limit_count = box_seg_id == max_box_seg_id ? last_rem_box_number
                                                        : box_read_limit_count;
    if (box_id == 0) {
      // x, y, z, dx, dy, dz, angle
      int offset_num = box_seg_id << 8;
      // x
      __memcpy((char *)box1_buffer, input_x_ptr + offset_num,
               box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
               box_read_limit_count * 1 * sizeof(T),
               box_read_limit_count * 1 * sizeof(T), 0);
      // y
      __memcpy((char *)box1_buffer + box_read_limit_count * 1 * sizeof(T),
               input_y_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
               boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
               box_read_limit_count * 1 * sizeof(T), 0);
      // dx
      __memcpy((char *)box1_buffer + box_read_limit_count * 2 * sizeof(T),
               input_dx_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
               boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
               box_read_limit_count * 1 * sizeof(T), 0);
      // dy
      __memcpy((char *)box1_buffer + box_read_limit_count * 3 * sizeof(T),
               input_dy_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
               boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
               box_read_limit_count * 1 * sizeof(T), 0);
      // angle
      __memcpy((char *)box1_buffer + box_read_limit_count * 4 * sizeof(T),
               input_angle_ptr + offset_num,
               box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
               box_read_limit_count * 1 * sizeof(T),
               box_read_limit_count * 1 * sizeof(T), 0);
    }
    if (((float *)input_score_ptr)[cur_box] == 0) {
      continue;
    }
    // save result
    nram_save[nram_save_count] = cur_box;
    result_box_num++;
    nram_save_count++;
    if (clusterId == 0 && coreId == 0 &&
        nram_save_count == nram_save_limit_count) {
      pvLock();
      __memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
               NRAM2GDRAM);
      pvUnlock();
      output_data += nram_save_count;
      nram_save_count = 0;
    }
    // prepare box1
    // x
    __bang_write_value((float *)box1, max_seg_pad,
                       float(((T *)box1_buffer)[box_id]));
    // y
    __bang_write_value(
        (float *)box1 + max_seg_pad, max_seg_pad,
        float(((T *)box1_buffer)[box_id + 1 * box_read_limit_count]));
    // dx
    __bang_write_value(
        (float *)box1 + max_seg_pad * 2, max_seg_pad,
        float(((T *)box1_buffer)[box_id + 2 * box_read_limit_count]));
    // dy
    __bang_write_value(
        (float *)box1 + max_seg_pad * 3, max_seg_pad,
        float(((T *)box1_buffer)[box_id + 3 * box_read_limit_count]));
    // angle
    __bang_write_value(
        (float *)box1 + max_seg_pad * 4, max_seg_pad,
        float(((T *)box1_buffer)[box_id + 4 * box_read_limit_count]));

    float max_area = 1.0f *
                     ((T *)box1_buffer)[box_id + 2 * box_read_limit_count] *
                     ((T *)box1_buffer)[box_id + 3 * box_read_limit_count];
    // update score

    for (int i = 0; i <= repeat_iou_compute; i++) {
      if (i == repeat_iou_compute && remain_iou_compute == 0) {
        break;
      }
      int seg_len = max_seg_pad;
      int cpy_len =
          (i == repeat_iou_compute) ? remain_iou_compute : max_seg_pad;
      // int half_offset = std::is_same<T, half>::value ? max_seg_pad * 5 : 0;
      int half_offset = (sizeof(T) == sizeof(half)) ? max_seg_pad * 5 : 0;
      // score
      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
               cpy_len * sizeof(float), scores_load_dir,
               cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
      // x
      __memcpy(box2 + half_offset, input_x_ptr + input_offset + i * max_seg_pad,
               cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
               cpy_len * 1 * sizeof(T), 0);
      // y
      __memcpy(box2 + half_offset + seg_len * 1,
               input_y_ptr + input_offset + i * max_seg_pad,
               cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
               cpy_len * 1 * sizeof(T), 0);
      // dx
      __memcpy(box2 + half_offset + seg_len * 2,
               input_dx_ptr + input_offset + i * max_seg_pad,
               cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
               cpy_len * 1 * sizeof(T), 0);
      // dy
      __memcpy(box2 + half_offset + seg_len * 3,
               input_dy_ptr + input_offset + i * max_seg_pad,
               cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
               cpy_len * 1 * sizeof(T), 0);
      // angle
      __memcpy(box2 + half_offset + seg_len * 4,
               input_angle_ptr + input_offset + i * max_seg_pad,
               cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
               cpy_len * 1 * sizeof(T), 0);
      // if (std::is_same<T, half>::value) {
      if (sizeof(T) == sizeof(half)) {
        __bang_half2float((float *)box2, (half *)(box2 + half_offset),
                          seg_len * 5);
      }

      // Calculate rotated vertices
      void *temp1_ram = ((char *)temp_buffer);
      void *temp2_ram = ((char *)temp_buffer) + seg_len * sizeof(float);
      void *temp3_ram = ((char *)temp_buffer) + 2 * seg_len * sizeof(float);
      void *temp4_ram = ((char *)temp_buffer) + 3 * seg_len * sizeof(float);
      getRotatedVertices((float *)rotated_pts1_x, (float *)rotated_pts1_y,
                         (float *)box1, (float *)temp1_ram, (float *)temp2_ram,
                         (float *)temp3_ram, (float *)temp4_ram, seg_len);
      getRotatedVertices((float *)rotated_pts2_x, (float *)rotated_pts2_y,
                         (float *)box2, (float *)temp1_ram, (float *)temp2_ram,
                         (float *)temp3_ram, (float *)temp4_ram, seg_len);

      __bang_write_zero((float *)valid_pts, 24 * seg_len);
      __bang_write_zero((float *)nums_in_ram, seg_len);
      __bang_write_value(((float *)valid_box), seg_len, 1.0f);
      void *vec1_x = ((char *)vec_buffer);
      void *vec1_y = ((char *)vec1_x) + 4 * seg_len * sizeof(float);
      void *vec2_x = ((char *)vec1_y) + 4 * seg_len * sizeof(float);
      void *vec2_y = ((char *)vec2_x) + 4 * seg_len * sizeof(float);
      void *temp5_ram = ((char *)temp_buffer) + 4 * seg_len * sizeof(float);
      void *temp6_ram = ((char *)temp_buffer) + 5 * seg_len * sizeof(float);
      void *temp7_ram = ((char *)temp_buffer) + 6 * seg_len * sizeof(float);
      void *temp8_ram = ((char *)temp_buffer) + 7 * seg_len * sizeof(float);
      void *temp9_ram = ((char *)temp_buffer) + 8 * seg_len * sizeof(float);
      void *temp10_ram = ((char *)temp_buffer) + 9 * seg_len * sizeof(float);

      // Get all intersection points
      getIntersectPts(
          (float *)rotated_pts1_x, (float *)rotated_pts1_y,
          (float *)rotated_pts2_x, (float *)rotated_pts2_y, (float *)vec1_x,
          (float *)vec1_y, (float *)vec2_x, (float *)vec2_y,
          (float *)intersect_pts_x, (float *)intersect_pts_y,
          (float *)valid_pts, (float *)nums_in_ram, (float *)temp1_ram,
          (float *)temp2_ram, (float *)temp3_ram, (float *)temp4_ram,
          (float *)temp5_ram, (float *)temp6_ram, (float *)temp7_ram,
          (float *)temp8_ram, (float *)temp9_ram, (float *)temp10_ram, seg_len);

      // Where nums_in <= 2, set valid_box to false
      __bang_write_value((float *)temp9_ram, COMPUTE_COUNT_ALIGN, (float)2);
      __bang_cycle_gt((float *)temp1_ram, (float *)nums_in_ram,
                      (float *)temp9_ram, seg_len, COMPUTE_COUNT_ALIGN);
      __bang_and((float *)valid_box, (float *)valid_box, (float *)temp1_ram,
                 seg_len);
      __bang_cycle_and((float *)valid_pts, (float *)valid_pts,
                       (float *)valid_box, 24 * seg_len, seg_len);

      // Convex-hull-graham to order the intersection points in clockwise order
      // and find the contour area

      convexHullGraham(
          (float *)intersect_pts_x, (float *)intersect_pts_y,
          (float *)ordered_pts_x, (float *)ordered_pts_y, (float *)dist_ram,
          (float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
          (float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
          (float *)temp_long_1, (float *)temp_long_2, (float *)temp_long_3,
          seg_len, seg_len);
      // Calculate polygon area
      // set temp1 = intersection part area
      polygonArea((float *)ordered_pts_x, (float *)ordered_pts_y,
                  (float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
                  (float *)temp1_ram, (float *)temp2_ram, (float *)temp3_ram,
                  (float *)temp4_ram, (float *)temp5_ram, (float *)temp6_ram,
                  (float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
                  seg_len);
      // area
      __bang_mul((float *)temp2_ram, (float *)box2 + seg_len * 2,
                 (float *)box2 + seg_len * 3, seg_len);
      // get the area_U: area + max_area - area_I
      __bang_add_scalar((float *)temp2_ram, (float *)temp2_ram, float(max_area),
                        seg_len);
      __bang_sub((float *)temp2_ram, (float *)temp2_ram, (float *)temp1_ram,
                 seg_len);  // area_U
      if (iou_threshold > 0.0) {
        __bang_mul_scalar((float *)temp1_ram, (float *)temp1_ram,
                          div_thresh_iou, seg_len);
      } else {
        __bang_mul_scalar((float *)temp2_ram, (float *)temp2_ram, iou_threshold,
                          seg_len);
      }
      __bang_ge((float *)temp1_ram, (float *)temp2_ram, (float *)temp1_ram,
                seg_len);
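      // The branch above avoids a vector division: for iou_threshold > 0,
      //   area_I / area_U <= thresh  <=>  area_U >= area_I * (1 / thresh),
      // so the keep-mask (temp1 = 1) marks exactly the boxes whose IoU with
      // the current box does not exceed the threshold. Worked example:
      //   area_I = 2, area_U = 10, thresh = 0.25:
      //   10 >= 2 * 4 holds, IoU = 0.2 <= 0.25, so the box survives.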
      __bang_mul((float *)score, (float *)score, (float *)temp1_ram, seg_len);

      pvLock();
      __memcpy(input_score_ptr + input_offset + i * max_seg_pad, score,
               cpy_len * sizeof(float), scores_store_dir,
               cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
      pvUnlock();
    }
  }
  if (clusterId == 0 && coreId == 0 && nram_save_count) {
    pvLock();
    __memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
             NRAM2GDRAM);
    pvUnlock();
  }
}

__mlu_global__ void MLUBlockorUnionIKernelOU3D(
    const void *input_boxes, const int input_box_num, const float iou_threshold,
    const cnrtDataType_t data_type_input, void *workspace, void *result_num,
    void *output) {
  int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
  mluMemcpyDirection_t scores_load_dir = GDRAM2NRAM;
  mluMemcpyDirection_t scores_store_dir = NRAM2GDRAM;
  mluMemcpyDirection_t boxes_load_dir = GDRAM2NRAM;
  float *scores_data = (float *)workspace;
  float *boxes_data = (float *)input_boxes;
  const int cluster_score_size = input_box_num * sizeof(float);
  const int cluster_boxes_size = input_box_num * 7 * input_dwidth;
  char *sram_score = (char *)sram_buffer;
  char *sram_boxes = (char *)sram_buffer + cluster_score_size;
  if (clusterDim == 1 && SIZE_SRAM_BUF > cluster_score_size) {
    scores_data = (float *)sram_score;
    scores_load_dir = SRAM2NRAM;
    scores_store_dir = NRAM2SRAM;
    if (coreId == 0x80) {
      __sramset((void *)sram_buffer, input_box_num, 1.0f);
    }
  } else {
    if (coreId == 0) {
      __gdramset(scores_data, input_box_num, 1.0f);
    }
  }
  if (clusterDim == 1 &&
      SIZE_SRAM_BUF - cluster_score_size >= cluster_boxes_size) {
    boxes_load_dir = SRAM2NRAM;
    boxes_data = (float *)sram_boxes;
    if (coreId == 0x80) {
      __memcpy((char *)boxes_data, (char *)input_boxes, cluster_boxes_size,
               GDRAM2SRAM);
    }
  }
  __sync_cluster();

  int32_t result_box_num = 0;
  int32_t *out_data = (int32_t *)output;

  switch (data_type_input) {
    default: { return; }
    case CNRT_FLOAT16: {
      iou3D_detection(result_box_num, out_data, (half *)boxes_data, scores_data,
                      taskDim, input_box_num, iou_threshold, scores_load_dir,
                      scores_store_dir, boxes_load_dir);
    }; break;
    case CNRT_FLOAT32: {
      iou3D_detection(result_box_num, out_data, boxes_data, scores_data,
                      taskDim, input_box_num, iou_threshold, scores_load_dir,
                      scores_store_dir, boxes_load_dir);
    }; break;
  }
  ((int32_t *)result_num)[0] = result_box_num;
}

void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
                 const cnrtDataType_t data_type_input, const void *boxes_dram,
                 const int input_box_num, const float iou_threshold,
                 void *workspace, void *output_size, void *output) {
  switch (k_type) {
    default: { return; }
    case CNRT_FUNC_TYPE_BLOCK:
    case CNRT_FUNC_TYPE_UNION1:
    case CNRT_FUNC_TYPE_UNION2:
    case CNRT_FUNC_TYPE_UNION4:
    case CNRT_FUNC_TYPE_UNION8:
    case CNRT_FUNC_TYPE_UNION16: {
      MLUBlockorUnionIKernelOU3D<<<k_dim, k_type, queue>>>(
          (void *)boxes_dram, input_box_num, iou_threshold, data_type_input,
          workspace, output_size, output);
    }; break;
  }
}
@@ -1,695 +0,0 @@
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/

#ifndef IOU3D_UTILS_HPP_
#define IOU3D_UTILS_HPP_
#include "common_mlu_helper.hpp"

#define IOU3D_SIZE 64
#define IOU3D_UP(x, y) (x / y + (int)(x % y > 0)) * y
#define IOU3D_DOWN(x, y) (x / y) * y
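// Example: IOU3D_UP rounds up to a multiple of y, IOU3D_DOWN rounds down:
//   IOU3D_UP(100, 64)   -> (100/64 + 1) * 64 = 128
//   IOU3D_DOWN(100, 64) -> (100/64) * 64     = 64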
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
#define COMPUTE_COUNT_ALIGN 64
#define INFO_NUM (5)  // score, x1, y1, x2, y2
#define REDUCE_NUM \
  (7)  // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
#define SINGLE_BOX_DIM 5
#define MEMORY_CORE (0x80)
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
  if (coreId != MEMORY_CORE) {
    __bang_lock(0, 0);
  }
#endif
}

__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
  if (coreId != MEMORY_CORE) {
    __bang_unlock(0, 0);
  }
#endif
}

// cross2d<T>(A, B) = A.x * B.y - A.y * B.x
template <typename T>
inline __mlu_func__ void cross2d(T *result, const T *p1_x, const T *p1_y,
                                 const T *p2_x, const T *p2_y,
                                 const int &length, T *temp_ram) {
  __bang_mul((T *)temp_ram, (T *)p1_x, (T *)p2_y, length);
  __bang_mul((T *)result, (T *)p1_y, (T *)p2_x, length);
  __bang_sub((T *)result, (T *)temp_ram, (T *)result, length);
}

// dot2d<T>(A, B) = A.x * B.x + A.y * B.y
template <typename T>
inline __mlu_func__ void dot2d(T *result, const T *p1_x, const T *p1_y,
                               const T *p2_x, const T *p2_y, const int &length,
                               T *temp_ram) {
  __bang_mul((T *)temp_ram, (T *)p1_x, (T *)p2_x, length);
  __bang_mul((T *)result, (T *)p1_y, (T *)p2_y, length);
  __bang_add((T *)result, (T *)temp_ram, (T *)result, length);
}
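// Scalar reference for the two vectorized helpers above (a host-side sketch;
// cross2d_scalar/dot2d_scalar are illustrative names, not kernel API). The
// sign of the cross product tells on which side of the first vector the
// second one lies, which is exactly how the convex-hull and intersection
// code below uses it.
template <typename T>
inline T cross2d_scalar(T p1_x, T p1_y, T p2_x, T p2_y) {
  return p1_x * p2_y - p1_y * p2_x;
}

template <typename T>
inline T dot2d_scalar(T p1_x, T p1_y, T p2_x, T p2_y) {
  return p1_x * p2_x + p1_y * p2_y;
}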

template <typename T>
__mlu_func__ void getRotatedVertices(T *pts_x, T *pts_y, T *box, T *temp1,
                                     T *temp2, T *temp3, T *temp4,
                                     const uint32_t &actual_compute_box_num) {
  // T cosTheta2 = (T)cos(theta) * 0.5f; -- temp1
  // T sinTheta2 = (T)sin(theta) * 0.5f; -- temp2
  // theta is the box's 5th field `a`, the rotation angle in radians
#if __BANG_ARCH__ >= 300
  __bang_cos((float *)temp1, ((float *)box) + 4 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sin((float *)temp2, ((float *)box) + 4 * actual_compute_box_num,
             actual_compute_box_num);
#else
  __bang_taylor4_cos((T *)temp1, ((T *)box) + 4 * actual_compute_box_num,
                     (T *)temp3, (T *)temp4, actual_compute_box_num);
  __bang_taylor4_sin((T *)temp2, ((T *)box) + 4 * actual_compute_box_num,
                     (T *)temp3, (T *)temp4, actual_compute_box_num);
#endif
  __bang_mul_scalar((T *)temp1, (T *)temp1, (T)0.5, actual_compute_box_num);
  __bang_mul_scalar((T *)temp2, (T *)temp2, (T)0.5, actual_compute_box_num);

  // temp3 = sinTheta2 * box.h;
  // temp4 = cosTheta2 * box.w;
  __bang_mul((T *)temp3, (T *)temp2, ((T *)box) + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_mul((T *)temp4, (T *)temp1, ((T *)box) + 2 * actual_compute_box_num,
             actual_compute_box_num);
  // pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
  // pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
  __bang_sub((T *)pts_x, (T *)box, (T *)temp3, actual_compute_box_num);
  __bang_sub((T *)pts_x, (T *)pts_x, (T *)temp4, actual_compute_box_num);
  __bang_add((T *)pts_x + 1 * actual_compute_box_num, (T *)box, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 1 * actual_compute_box_num,
             (T *)pts_x + 1 * actual_compute_box_num, (T *)temp4,
             actual_compute_box_num);
  // temp3 = cosTheta2 * box.h;
  // temp4 = sinTheta2 * box.w;
  __bang_mul((T *)temp3, (T *)temp1, box + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_mul((T *)temp4, (T *)temp2, box + 2 * actual_compute_box_num,
             actual_compute_box_num);
  // pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
  // pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
  __bang_add((T *)pts_y, (T *)box + 1 * actual_compute_box_num, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_y, (T *)pts_y, (T *)temp4, actual_compute_box_num);
  __bang_sub((T *)pts_y + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_y + 1 * actual_compute_box_num,
             (T *)pts_y + 1 * actual_compute_box_num, (T *)temp4,
             actual_compute_box_num);
  // pts[2].x = 2 * box.x_ctr - pts[0].x;
  // pts[3].x = 2 * box.x_ctr - pts[1].x;
  __bang_add((T *)pts_x + 2 * actual_compute_box_num, (T *)box, (T *)box,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 2 * actual_compute_box_num,
             (T *)pts_x + 2 * actual_compute_box_num, (T *)pts_x,
             actual_compute_box_num);
  __bang_add((T *)pts_x + 3 * actual_compute_box_num, (T *)box, (T *)box,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 3 * actual_compute_box_num,
             (T *)pts_x + 3 * actual_compute_box_num,
             (T *)pts_x + 1 * actual_compute_box_num, actual_compute_box_num);
  // pts[2].y = 2 * box.y_ctr - pts[0].y;
  // pts[3].y = 2 * box.y_ctr - pts[1].y;
  __bang_add((T *)pts_y + 2 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, actual_compute_box_num);
  __bang_sub((T *)pts_y + 2 * actual_compute_box_num,
             (T *)pts_y + 2 * actual_compute_box_num, (T *)pts_y,
             actual_compute_box_num);
  __bang_add((T *)pts_y + 3 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, actual_compute_box_num);
  __bang_sub((T *)pts_y + 3 * actual_compute_box_num,
             (T *)pts_y + 3 * actual_compute_box_num,
             (T *)pts_y + 1 * actual_compute_box_num, actual_compute_box_num);
}
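// A scalar sketch of the same corner math for a single box (illustrative
// only; RotatedBox/rotatedBoxCorners are hypothetical names and cosf/sinf
// assume a C math library is available). Corners 2 and 3 are obtained by
// reflecting corners 0 and 1 through the box center, exactly as the
// vectorized code above does with 2 * ctr - pts.
struct RotatedBox {
  float x_ctr, y_ctr, w, h, a;  // center, size, rotation angle (radians)
};

inline void rotatedBoxCorners(const RotatedBox &b, float px[4], float py[4]) {
  const float cos2 = cosf(b.a) * 0.5f, sin2 = sinf(b.a) * 0.5f;
  px[0] = b.x_ctr - sin2 * b.h - cos2 * b.w;
  py[0] = b.y_ctr + cos2 * b.h - sin2 * b.w;
  px[1] = b.x_ctr + sin2 * b.h - cos2 * b.w;
  py[1] = b.y_ctr - cos2 * b.h - sin2 * b.w;
  px[2] = 2 * b.x_ctr - px[0];  // reflection of corner 0
  py[2] = 2 * b.y_ctr - py[0];
  px[3] = 2 * b.x_ctr - px[1];  // reflection of corner 1
  py[3] = 2 * b.y_ctr - py[1];
}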

template <typename T>
__mlu_func__ void getIntersectPts(T *rotated_pts1_x, T *rotated_pts1_y,
                                  T *rotated_pts2_x, T *rotated_pts2_y,
                                  T *vec1_x, T *vec1_y, T *vec2_x, T *vec2_y,
                                  T *intersect_pts_x, T *intersect_pts_y,
                                  T *valid_pts, T *nums_in_ram, T *temp1_ram,
                                  T *temp2_ram, T *temp3_ram, T *temp4_ram,
                                  T *temp5_ram, T *temp6_ram, T *temp7_ram,
                                  T *temp8_ram, T *temp9_ram, T *temp10_ram,
                                  const uint32_t &actual_compute_box_num) {
  // Initialize const data to ram
  // temp3 = const 1e-14(@float), length = COMPUTE_COUNT_ALIGN
#if __BANG_ARCH__ >= 300
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (T)1e-14);
#else
  // NOTE: active_reciphp has a strict value range:
  // [2.2205e-16, 2e6]@float, [0.00391, 65504]@half
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (float)1e-14);
#endif
  // temp4 = const T(0), length = COMPUTE_COUNT_ALIGN
  __bang_write_value((T *)temp4_ram, COMPUTE_COUNT_ALIGN, (T)0);
  // temp5 = const T(1), length = COMPUTE_COUNT_ALIGN
  __bang_write_value((T *)temp5_ram, COMPUTE_COUNT_ALIGN, (T)1);

  // Line vector: the segment from p1 to p2 is p1 + (p2 - p1) * t, t in [0, 1]
  // for i = 0~3, vec[i] = pts[(i+1)%4] - pts[i]
  __bang_sub((T *)vec1_x, (T *)rotated_pts1_x + actual_compute_box_num,
             (T *)rotated_pts1_x, 3 * actual_compute_box_num);
  __bang_sub((T *)vec1_x + 3 * actual_compute_box_num, (T *)rotated_pts1_x,
             (T *)rotated_pts1_x + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec1_y, (T *)rotated_pts1_y + actual_compute_box_num,
             (T *)rotated_pts1_y, 3 * actual_compute_box_num);
  __bang_sub((T *)vec1_y + 3 * actual_compute_box_num, (T *)rotated_pts1_y,
             (T *)rotated_pts1_y + 3 * actual_compute_box_num,
             actual_compute_box_num);

  __bang_sub((T *)vec2_x, (T *)rotated_pts2_x + actual_compute_box_num,
             (T *)rotated_pts2_x, 3 * actual_compute_box_num);
  __bang_sub((T *)vec2_x + 3 * actual_compute_box_num, (T *)rotated_pts2_x,
             (T *)rotated_pts2_x + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec2_y, (T *)rotated_pts2_y + actual_compute_box_num,
             (T *)rotated_pts2_y, 3 * actual_compute_box_num);
  __bang_sub((T *)vec2_y + 3 * actual_compute_box_num, (T *)rotated_pts2_y,
             (T *)rotated_pts2_y + 3 * actual_compute_box_num,
             actual_compute_box_num);

  // First, line test - test all line combos for intersection, 4x4 possible
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // T det = cross2d<T>(vec2[j], vec1[i]) -- temp2
      cross2d<T>((T *)temp2_ram, (T *)vec2_x + j * actual_compute_box_num,
                 (T *)vec2_y + j * actual_compute_box_num,
                 (T *)vec1_x + i * actual_compute_box_num,
                 (T *)vec1_y + i * actual_compute_box_num,
                 actual_compute_box_num, (T *)temp1_ram);
      // temp8 = sign(det), since active_reciphp only accepts positive values
      __bang_active_sign((T *)temp8_ram, (T *)temp2_ram,
                         actual_compute_box_num);
      // deal with parallel lines: temp2 = fabs(det), temp1 = temp2 > 1e-14
      __bang_active_abs((T *)temp2_ram, (T *)temp2_ram, actual_compute_box_num);
      __bang_cycle_gt((T *)temp1_ram, (T *)temp2_ram, (T *)temp3_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      // Where temp1 = false, set the recip input to 1 to avoid recip(0),
      // which would produce inf
      __bang_not((T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram,
                 actual_compute_box_num);
      __bang_add((T *)temp2_ram, (T *)temp2_ram, (T *)temp9_ram,
                 actual_compute_box_num);
      // temp2 = 1/temp2, use mult (1/temp2) instead of div temp2
#if __BANG_ARCH__ >= 300
      __bang_recip((float *)temp2_ram, (float *)temp2_ram,
                   actual_compute_box_num);
#else
      // NOTE: active_reciphp has a strict value range:
      // [2.2205e-16, 2e6]@float, [0.00391, 65504]@half
      __bang_active_reciphp((T *)temp2_ram, (T *)temp2_ram,
                            actual_compute_box_num);
#endif
      // Restore temp2's invalid-box value 1 and the sign bit
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram,
                 actual_compute_box_num);
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp8_ram,
                 actual_compute_box_num);

      // auto vec12 = pts2[j] - pts1[i], (temp6, temp7) = (x, y)
      __bang_sub((T *)temp6_ram,
                 (T *)rotated_pts2_x + j * actual_compute_box_num,
                 (T *)rotated_pts1_x + i * actual_compute_box_num,
                 actual_compute_box_num);
      __bang_sub((T *)temp7_ram,
                 (T *)rotated_pts2_y + j * actual_compute_box_num,
                 (T *)rotated_pts1_y + i * actual_compute_box_num,
                 actual_compute_box_num);

      // T t1 = cross2d<T>(vec2[j], vec12) mult (1/det) -- temp8
      cross2d<T>((T *)temp8_ram, (T *)vec2_x + j * actual_compute_box_num,
                 (T *)vec2_y + j * actual_compute_box_num, (T *)temp6_ram,
                 (T *)temp7_ram, actual_compute_box_num, (T *)temp9_ram);
      __bang_mul((T *)temp8_ram, (T *)temp8_ram, (T *)temp2_ram,
                 actual_compute_box_num);

      // temp1 &= (t1 >= 0.0f && t1 <= 1.0f) -- temp9
      __bang_cycle_ge((T *)temp9_ram, (T *)temp8_ram, (T *)temp4_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram,
                 actual_compute_box_num);
      __bang_cycle_le((T *)temp9_ram, (T *)temp8_ram, (T *)temp5_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram,
                 actual_compute_box_num);

      // T t2 = cross2d<T>(vec1[i], vec12) mult temp2 -- temp9
      // NOTE: temp8(t1) is used later, so reuse temp7(p2_y) as cross2d temp ram
      cross2d<T>((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num,
                 (T *)vec1_y + i * actual_compute_box_num, (T *)temp6_ram,
                 (T *)temp7_ram, actual_compute_box_num, (T *)temp7_ram);
      __bang_mul((T *)temp9_ram, (T *)temp9_ram, (T *)temp2_ram,
                 actual_compute_box_num);

      // temp1 &= (t2 >= 0.0f && t2 <= 1.0f) -- temp9
      __bang_cycle_ge((T *)temp7_ram, (T *)temp9_ram, (T *)temp4_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram,
                 actual_compute_box_num);
      __bang_cycle_le((T *)temp7_ram, (T *)temp9_ram, (T *)temp5_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram,
                 actual_compute_box_num);

      // intersections = (pts1[i] + vec1[i] * t1) * temp1
      __bang_mul((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num,
                 (T *)temp8_ram, actual_compute_box_num);
      __bang_add((T *)temp9_ram,
                 (T *)rotated_pts1_x + i * actual_compute_box_num,
                 (T *)temp9_ram, actual_compute_box_num);
      __bang_mul((T *)intersect_pts_x + (4 * i + j) * actual_compute_box_num,
                 (T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
      __bang_mul((T *)temp9_ram, (T *)vec1_y + i * actual_compute_box_num,
                 (T *)temp8_ram, actual_compute_box_num);
      __bang_add((T *)temp9_ram,
                 (T *)rotated_pts1_y + i * actual_compute_box_num,
                 (T *)temp9_ram, actual_compute_box_num);
      __bang_mul((T *)intersect_pts_y + (4 * i + j) * actual_compute_box_num,
                 (T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);

      // Assign the `valid_pts` bit and accumulate `nums_in` of valid points
      // for each box pair
      __bang_or((T *)valid_pts + (4 * i + j) * actual_compute_box_num,
                (T *)valid_pts + (4 * i + j) * actual_compute_box_num,
                (T *)temp1_ram, actual_compute_box_num);
      __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
                 actual_compute_box_num);
    }
  }

  // Check for vertices of rect1 inside rect2
  // temp5 = ABdotAB
  dot2d<T>((T *)temp5_ram, (T *)vec2_x, (T *)vec2_y, (T *)vec2_x, (T *)vec2_y,
           actual_compute_box_num, (T *)temp9_ram);
  // temp6 = ADdotAD
  dot2d<T>((T *)temp6_ram, (T *)vec2_x + 3 * actual_compute_box_num,
           (T *)vec2_y + 3 * actual_compute_box_num,
           (T *)vec2_x + 3 * actual_compute_box_num,
           (T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num,
           (T *)temp9_ram);
  // Assume ABCD is the rectangle and P is the point to be judged:
  // P is inside ABCD iff P's projection on AB lies within AB
  // and P's projection on AD lies within AD
  for (int i = 0; i < 4; i++) {
    // AP = pts1[i] - pts2[0] = (temp7, temp8)
    __bang_sub((T *)temp7_ram, (T *)rotated_pts1_x + i * actual_compute_box_num,
               (T *)rotated_pts2_x, actual_compute_box_num);
    __bang_sub((T *)temp8_ram, (T *)rotated_pts1_y + i * actual_compute_box_num,
               (T *)rotated_pts2_y, actual_compute_box_num);

    // temp9 = APdotAB = dot2d<T>(AP, AB)
    dot2d<T>((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec2_x,
             (T *)vec2_y, actual_compute_box_num, (T *)temp2_ram);
    // temp10 = APdotAD = -dot2d<T>(AP, DA)
    dot2d<T>((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram,
             (T *)vec2_x + 3 * actual_compute_box_num,
             (T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num,
             (T *)temp2_ram);
    __bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1,
                      actual_compute_box_num);

    // ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
    //  (APdotAD <= ADdotAD))
    __bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);

    // 16 means the 4x4 possible intersection points above
    __bang_mul((T *)intersect_pts_x + (16 + i) * actual_compute_box_num,
               (T *)temp1_ram, (T *)rotated_pts1_x + i * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)intersect_pts_y + (16 + i) * actual_compute_box_num,
               (T *)temp1_ram, (T *)rotated_pts1_y + i * actual_compute_box_num,
               actual_compute_box_num);

    // assign the valid_pts bit and accumulate nums of valid points of each
    // box pair
    __bang_or((T *)valid_pts + (16 + i) * actual_compute_box_num,
              (T *)valid_pts + (16 + i) * actual_compute_box_num,
              (T *)temp1_ram, actual_compute_box_num);
    __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
               actual_compute_box_num);
  }

  // Reverse the check - check for vertices of rect2 inside rect1
  // temp5 = ABdotAB
  dot2d<T>((T *)temp5_ram, (T *)vec1_x, (T *)vec1_y, (T *)vec1_x, (T *)vec1_y,
           actual_compute_box_num, (T *)temp9_ram);
  // temp6 = ADdotAD
  dot2d<T>((T *)temp6_ram, (T *)vec1_x + 3 * actual_compute_box_num,
           (T *)vec1_y + 3 * actual_compute_box_num,
           (T *)vec1_x + 3 * actual_compute_box_num,
           (T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num,
           (T *)temp9_ram);
  for (int i = 0; i < 4; i++) {
    // AP = pts2[i] - pts1[0] = (temp7, temp8)
    __bang_sub((T *)temp7_ram, (T *)rotated_pts2_x + i * actual_compute_box_num,
               (T *)rotated_pts1_x, actual_compute_box_num);
    __bang_sub((T *)temp8_ram, (T *)rotated_pts2_y + i * actual_compute_box_num,
               (T *)rotated_pts1_y, actual_compute_box_num);

    // temp9 = APdotAB = dot2d<T>(AP, AB)
    dot2d<T>((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec1_x,
             (T *)vec1_y, actual_compute_box_num, (T *)temp2_ram);
    // temp10 = APdotAD = -dot2d<T>(AP, DA)
    dot2d<T>((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram,
             (T *)vec1_x + 3 * actual_compute_box_num,
             (T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num,
             (T *)temp2_ram);
    __bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1,
                      actual_compute_box_num);

    // ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
    //  (APdotAD <= ADdotAD))
    __bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);

    // 20 means the (4x4+4) possible intersection points above
    __bang_mul((T *)intersect_pts_x + (20 + i) * actual_compute_box_num,
               (T *)temp1_ram, (T *)rotated_pts2_x + i * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)intersect_pts_y + (20 + i) * actual_compute_box_num,
               (T *)temp1_ram, (T *)rotated_pts2_y + i * actual_compute_box_num,
               actual_compute_box_num);

    // assign the valid_pts bit and accumulate nums of valid points of each
    // box pair
    __bang_or((T *)valid_pts + (20 + i) * actual_compute_box_num,
              (T *)valid_pts + (20 + i) * actual_compute_box_num,
              (T *)temp1_ram, actual_compute_box_num);
    __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
               actual_compute_box_num);
  }
}
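// Scalar reference for the parametric segment test above (a host-side
// sketch; segmentsIntersect and its parameter names are illustrative).
// Segments p + s*t1 and q + r*t2 intersect iff det = cross(r, s) is
// non-negligible and both parameters fall in [0, 1], mirroring the
// det / t1 / t2 masks built vectorially in the 4x4 loop.
inline bool segmentsIntersect(float px, float py, float sx, float sy,
                              float qx, float qy, float rx, float ry,
                              float *ix, float *iy) {
  const float det = rx * sy - ry * sx;              // cross2d(r, s)
  if (det > -1e-14f && det < 1e-14f) return false;  // parallel lines
  const float dx = qx - px, dy = qy - py;           // q - p
  const float t1 = (rx * dy - ry * dx) / det;       // position along s
  const float t2 = (sx * dy - sy * dx) / det;       // position along r
  if (t1 < 0.f || t1 > 1.f || t2 < 0.f || t2 > 1.f) return false;
  *ix = px + sx * t1;
  *iy = py + sy * t1;
  return true;
}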

template <typename T>
__mlu_func__ void convexHullGraham(
    T *intersect_pts_x, T *intersect_pts_y, T *ordered_pts_x, T *ordered_pts_y,
    T *dist_ram, T *valid_box, T *valid_pts, T *nums_in_ram, T *temp1_ram,
    T *temp2_ram, T *temp3_ram, T *temp_long_1, T *temp_long_2, T *temp_long_3,
    const uint32_t &actual_box_num, const uint32_t &actual_compute_box_num) {
  // Step 1. Find the point with the minimum y; if more than one point shares
  // the minimum y, pick the one with the minimum x.
  // Set p[i].y to max_y_value if not valid_pts, to avoid invalid results.
  // 24 means all possible intersection points.
  __bang_max((T *)temp2_ram, (T *)intersect_pts_y, 24 * actual_compute_box_num);
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, ((T *)temp2_ram)[0]);
  __bang_not((T *)temp_long_1, (T *)valid_pts, 24 * actual_compute_box_num);
  __bang_cycle_mul((T *)temp_long_1, (T *)temp_long_1, (T *)temp3_ram,
                   24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_mul((T *)temp_long_2, (T *)intersect_pts_y, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_add((T *)temp_long_2, (T *)temp_long_2, (T *)temp_long_1,
             24 * actual_compute_box_num);
  // temp2 = min_y_value(temp_long_2); use min_pool, channel=box_num, h=1, w=24
  __bang_minpool((T *)temp2_ram, (T *)temp_long_2, actual_compute_box_num, 1,
                 24, 1, 24, 1, 24);
  __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)valid_box,
             actual_compute_box_num);

  // set p[i].x to max_x_value if not the min_y point
  __bang_max((T *)temp1_ram, (T *)intersect_pts_x, 24 * actual_compute_box_num);
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, ((T *)temp1_ram)[0]);
  __bang_cycle_eq((T *)temp_long_1, (T *)temp_long_2, (T *)temp2_ram,
                  24 * actual_compute_box_num, actual_compute_box_num);
  __bang_and((T *)temp_long_1, (T *)temp_long_1, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_not((T *)temp_long_3, (T *)temp_long_1, 24 * actual_compute_box_num);
  __bang_cycle_mul((T *)temp_long_3, (T *)temp_long_3, (T *)temp3_ram,
                   24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_mul((T *)temp_long_1, (T *)intersect_pts_x, (T *)temp_long_1,
             24 * actual_compute_box_num);
  __bang_add((T *)temp_long_1, (T *)temp_long_1, (T *)temp_long_3,
             24 * actual_compute_box_num);
  // temp3 = min_x_value(temp_long_1); use min_pool, channel=box_num, h=1, w=24
  __bang_minpool((T *)temp3_ram, (T *)temp_long_1, actual_compute_box_num, 1,
                 24, 1, 24, 1, 24);
  __bang_mul((T *)temp3_ram, (T *)temp3_ram, (T *)valid_box,
             actual_compute_box_num);

  // Step 2. Subtract the starting point from all points (for sorting in the
  // next step)
  __bang_cycle_sub((T *)ordered_pts_x, (T *)intersect_pts_x, (T *)temp3_ram,
                   24 * actual_compute_box_num, actual_compute_box_num);
  __bang_cycle_sub((T *)ordered_pts_y, (T *)intersect_pts_y, (T *)temp2_ram,
                   24 * actual_compute_box_num, actual_compute_box_num);
  __bang_mul((T *)ordered_pts_x, (T *)ordered_pts_x, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_mul((T *)ordered_pts_y, (T *)ordered_pts_y, (T *)valid_pts,
             24 * actual_compute_box_num);

  // Step 3. Sort every intersection point by its relative cross-product value
  // (essentially sorting by angle); if the angles are equal, sort by the
  // distance to the origin
  dot2d<T>((T *)dist_ram, (T *)ordered_pts_x, (T *)ordered_pts_y,
           (T *)ordered_pts_x, (T *)ordered_pts_y, 24 * actual_compute_box_num,
           (T *)temp_long_3);

  T temp, temp_nums_in, temp_dist_1, temp_dist_2;
  T temp1_x, temp1_y;
  T temp2_x, temp2_y;
  for (int i = 0; i < actual_box_num; i++) {
    if (((T *)valid_box)[i]) {
      // make sure all nums_in[i] points are at the front
      for (int ii = 0; ii < 23; ii++) {
        for (int jj = ii + 1; jj < 24; jj++) {
          int ii_index = ii * actual_compute_box_num + i;
          int jj_index = jj * actual_compute_box_num + i;
          // the ii point is invalid and the jj point is valid: swap jj for ii
          if ((!((T *)valid_pts)[ii_index]) && ((T *)valid_pts)[jj_index]) {
            ((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[jj_index];
            ((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[jj_index];
            ((T *)dist_ram)[ii_index] = ((T *)dist_ram)[jj_index];
            ((T *)valid_pts)[ii_index] = true;
            ((T *)ordered_pts_x)[jj_index] = 0;
            ((T *)ordered_pts_y)[jj_index] = 0;
            ((T *)dist_ram)[jj_index] = 0;
            ((T *)valid_pts)[jj_index] = false;
            break;
          }
        }
      }
      temp_nums_in = ((T *)nums_in_ram)[i];
      // make the original q[0] = (min_x, min_y) before sorting
      for (int ii = 1; ii < temp_nums_in; ii++) {
        int ii_index = ii * actual_compute_box_num + i;
        if (((T *)dist_ram)[ii_index] == 0) {
          // swap q[ii_index] and q[0]
          ((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[i];
          ((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[i];
          ((T *)dist_ram)[ii_index] = ((T *)dist_ram)[i];
          ((T *)ordered_pts_x)[i] = 0;
          ((T *)ordered_pts_y)[i] = 0;
          ((T *)dist_ram)[i] = 0;
          break;
        }
      }
      for (int ii = 1; ii < temp_nums_in - 1; ii++) {
        for (int jj = ii + 1; jj < temp_nums_in; jj++) {
          int ii_index = ii * actual_compute_box_num + i;
          int jj_index = jj * actual_compute_box_num + i;
          temp1_x = ((T *)ordered_pts_x)[ii_index];
          temp1_y = ((T *)ordered_pts_y)[ii_index];
          temp2_x = ((T *)ordered_pts_x)[jj_index];
          temp2_y = ((T *)ordered_pts_y)[jj_index];
          // calculate the cross product and sort q (ordered_pts)
          temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
          temp_dist_1 = ((T *)dist_ram)[ii_index];
          temp_dist_2 = ((T *)dist_ram)[jj_index];
          if ((temp < (T)-1e-6) ||
              ((fabs(temp) < (T)1e-6) && (temp_dist_1 > temp_dist_2))) {
            ((T *)ordered_pts_x)[ii_index] = temp2_x;
            ((T *)ordered_pts_y)[ii_index] = temp2_y;
            ((T *)ordered_pts_x)[jj_index] = temp1_x;
            ((T *)ordered_pts_y)[jj_index] = temp1_y;
            ((T *)dist_ram)[ii_index] = temp_dist_2;
            ((T *)dist_ram)[jj_index] = temp_dist_1;
          }
        }
      }

      // Step 4:
      // Make sure there are at least 2 points (that don't overlap with each
      // other) in the stack
      int k;  // index of the non-overlapping second point
      for (k = 1; k < temp_nums_in; k++) {
        if (((T *)dist_ram)[k * actual_compute_box_num + i] > (T)1e-8) {
          break;
        }
      }
      if (k == temp_nums_in) {
        // We reached the end, which means the convex hull is just one point;
        // set valid_box = 0 to get ious = 0
        ((T *)valid_box)[i] = 0;
        continue;
      }
      // q[1] = q[k];
      ((T *)ordered_pts_x)[actual_compute_box_num + i] =
          ((T *)ordered_pts_x)[k * actual_compute_box_num + i];
      ((T *)ordered_pts_y)[actual_compute_box_num + i] =
          ((T *)ordered_pts_y)[k * actual_compute_box_num + i];

      // Step 5:
      // Finally we can start the scanning process.
      // When a non-convex relationship between the 3 points is found
      // (either a concave shape or duplicated points),
      // we pop the previous point from the stack
      // until the 3-point relationship is convex again, or
      // until the stack only contains two points
      int m = 2;  // 2 points in the stack
      for (int j = k + 1; j < temp_nums_in; j++) {
        // while (m > 1 && cross2d<T>(q[j] - q[m - 2], q[m - 1] - q[m - 2]) >=
        //        0) {
        //   m--;
        // }
        temp1_x = ((T *)ordered_pts_x)[j * actual_compute_box_num + i] -
                  ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
        temp1_y = ((T *)ordered_pts_y)[j * actual_compute_box_num + i] -
                  ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
        temp2_x = ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] -
                  ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
        temp2_y = ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] -
                  ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
        temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
        while ((m > 1) && (temp >= 0)) {
          m--;
          if (m > 1) {
            temp1_x =
                ((T *)ordered_pts_x)[j * actual_compute_box_num + i] -
                ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
            temp1_y =
                ((T *)ordered_pts_y)[j * actual_compute_box_num + i] -
                ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
            temp2_x =
                ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] -
                ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
            temp2_y =
                ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] -
                ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
            temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
          }
        }
        // q[m++] = q[j];
        ((T *)ordered_pts_x)[m * actual_compute_box_num + i] =
            ((T *)ordered_pts_x)[j * actual_compute_box_num + i];
        ((T *)ordered_pts_y)[m * actual_compute_box_num + i] =
            ((T *)ordered_pts_y)[j * actual_compute_box_num + i];
        m++;
      }
      // set the last (24 - m) valid_pts to false, to erase invalid q in the
      // polygon area
      for (int j = m; j < temp_nums_in; j++) {
        ((T *)valid_pts)[j * actual_compute_box_num + i] = 0;
      }
      ((T *)nums_in_ram)[i] = m;
    }
  }
}
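// A compact scalar Graham scan over plain arrays, mirroring steps 4 and 5
// above (host-side sketch; grahamScan and its parameters are illustrative,
// and it simplifies away the strided per-box indexing). Points must already
// be sorted by angle around q[0], as step 3 does. Returns the number of hull
// points kept in-place at the front of qx/qy.
inline int grahamScan(float *qx, float *qy, int n) {
  if (n < 3) return n;
  int m = 2;  // q[0], q[1] start on the stack
  for (int j = 2; j < n; j++) {
    // pop while q[m-2], q[m-1], q[j] do not make a right turn:
    // cross2d(q[j] - q[m-2], q[m-1] - q[m-2]) >= 0
    while (m > 1 && (qx[j] - qx[m - 2]) * (qy[m - 1] - qy[m - 2]) -
                            (qy[j] - qy[m - 2]) * (qx[m - 1] - qx[m - 2]) >=
                        0) {
      m--;
    }
    qx[m] = qx[j];
    qy[m] = qy[j];
    m++;
  }
  return m;
}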

template <typename T>
__mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y, T *valid_box,
                              T *valid_pts, T *nums_in_ram, T *temp1_ram,
                              T *temp2_ram, T *temp3_ram, T *temp4_ram,
                              T *temp5_ram, T *temp6_ram, T *temp7_ram,
                              T *temp8_ram, T *temp9_ram,
                              const uint32_t &actual_compute_box_num) {
  // Where nums_in <= 2, set valid_box = false
  __bang_write_value((T *)temp9_ram, COMPUTE_COUNT_ALIGN, (T)2);
  __bang_cycle_gt((T *)temp1_ram, (T *)nums_in_ram, (T *)temp9_ram,
                  actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_and((T *)valid_box, (T *)valid_box, (T *)temp1_ram,
             actual_compute_box_num);

  // temp1 = area, initialized with all 0
  __bang_write_zero((T *)temp1_ram, actual_compute_box_num);
  __bang_max((T *)temp7_ram, (T *)nums_in_ram, actual_compute_box_num);

  // temp_nums_in = max(nums_in)
  T temp_nums_in = ((T *)temp7_ram)[0];
  for (int i = 1; i < temp_nums_in - 1; i++) {
    // q[i] - q[0]: (temp6, temp7)
    __bang_sub((T *)temp6_ram, (T *)ordered_pts_x + i * actual_compute_box_num,
               (T *)ordered_pts_x, actual_compute_box_num);
    __bang_sub((T *)temp7_ram, (T *)ordered_pts_y + i * actual_compute_box_num,
               (T *)ordered_pts_y, actual_compute_box_num);
    __bang_mul((T *)temp6_ram, (T *)temp6_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)temp7_ram, (T *)temp7_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    // q[i + 1] - q[0]: (temp8, temp9)
    __bang_sub((T *)temp8_ram,
               (T *)ordered_pts_x + (i + 1) * actual_compute_box_num,
               (T *)ordered_pts_x, actual_compute_box_num);
    __bang_sub((T *)temp9_ram,
               (T *)ordered_pts_y + (i + 1) * actual_compute_box_num,
               (T *)ordered_pts_y, actual_compute_box_num);
    __bang_mul((T *)temp8_ram, (T *)temp8_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)temp9_ram, (T *)temp9_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    // area += fabs(cross2d<T>(q[i] - q[0], q[i + 1] - q[0]));
    __bang_mul((T *)temp4_ram, (T *)temp6_ram, (T *)temp9_ram,
               actual_compute_box_num);
    __bang_mul((T *)temp5_ram, (T *)temp7_ram, (T *)temp8_ram,
               actual_compute_box_num);
    __bang_sub((T *)temp3_ram, (T *)temp4_ram, (T *)temp5_ram,
               actual_compute_box_num);
    __bang_active_abs((T *)temp3_ram, (T *)temp3_ram, actual_compute_box_num);
    __bang_add((T *)temp1_ram, (T *)temp1_ram, (T *)temp3_ram,
               actual_compute_box_num);
  }
  // Where valid_box = false, set the intersection to 0
  __bang_mul((T *)temp1_ram, (T *)temp1_ram, (T *)valid_box,
             actual_compute_box_num);
  // area = area / 2.0
  __bang_mul_scalar((T *)temp1_ram, (T *)temp1_ram, (T)0.5,
                    actual_compute_box_num);
}
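// Scalar reference for the fan triangulation above (host-side sketch;
// convexPolygonArea is an illustrative name). For a convex polygon with
// vertices q[0..n-1] in angular order, the area is the sum of the triangle
// areas |cross(q[i] - q[0], q[i+1] - q[0])| / 2.
inline float convexPolygonArea(const float *qx, const float *qy, int n) {
  float area = 0.f;
  for (int i = 1; i < n - 1; i++) {
    const float ax = qx[i] - qx[0], ay = qy[i] - qy[0];
    const float bx = qx[i + 1] - qx[0], by = qy[i + 1] - qy[0];
    float c = ax * by - ay * bx;
    area += (c < 0.f) ? -c : c;  // fabs without a math-library dependency
  }
  return area * 0.5f;
}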

#endif  // IOU3D_UTILS_HPP_
File diff suppressed because it is too large
@@ -1,483 +0,0 @@
/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "nms_utils.hpp"

#define COORD_DIM (4)

#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)

__nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];

enum Addr { SRAM, GDRAM };

template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection(
    uint32_t &output_box_num, const int output_mode, OUT_DT *output_dram,
    IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram,
    IN_DT *sram, const int core_limit, const int input_num_boxes,
    const int max_output_size, const float thresh_iou, const float thresh_score,
    const float offset, const int algo) {
  // global value
  int32_t *exit_flag = (int32_t *)(sram + 28);
  exit_flag[0] = 0;
  // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
  int nms_buffer_count1 = 9;
  // temp nram buffer to store selected target.
  int nram_save_limit_count = 256;
  float div_thresh_iou = 1.0 / thresh_iou;

  // input data ptr
  const IN_DT *input_x1_ptr = input_data_box;
  const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
  const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
  const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;

  int limit = 0;        // find limit when GDRAM or SRAM
  int max_seg_pad = 0;  // the max length every repeat
  int repeat = 0;
  int remain = 0;
  int remain_pad = 0;
  int input_offset = 0;  // offset of input_data for current core
  int nram_save_count = 0;

  if (output_mode == 0) {
    limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
             nram_save_limit_count * sizeof(OUT_DT)) /
            (nms_buffer_count1 * sizeof(IN_DT));
  } else {
    // 5 means: score, x1, y1, x2, y2
    limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
             nram_save_limit_count * 5 * sizeof(OUT_DT)) /
            (nms_buffer_count1 * sizeof(IN_DT));
  }
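  // `limit` is the number of boxes one NRAM pass can hold: total NRAM minus
  // the max_box scratch line and the nram_save staging buffer, divided by the
  // nine per-box working buffers listed above.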

  int max_seg_iou_compute = 0;
  int repeat_iou_compute = 0;
  int remain_iou_compute = 0;
  int remain_pad_iou_compute = 0;

  getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit,
                            input_offset, max_seg_pad, repeat, remain,
                            remain_pad, max_seg_iou_compute, repeat_iou_compute,
                            remain_iou_compute, remain_pad_iou_compute);

  // init the data ptr
  IN_DT *score = (IN_DT *)nram_buffer;
  IN_DT *x1 = score + max_seg_pad;
  IN_DT *y1 = x1 + max_seg_pad;
  IN_DT *x2 = y1 + max_seg_pad;
  IN_DT *y2 = x2 + max_seg_pad;
  IN_DT *inter_x1 = y2 + max_seg_pad;
  IN_DT *inter_y1 = inter_x1 + max_seg_pad;
  IN_DT *inter_x2 = inter_y1 + max_seg_pad;
  IN_DT *inter_y2 = inter_x2 + max_seg_pad;
  IN_DT *max_box = inter_y2 + max_seg_pad;  // the max score, x1, y1, x2, y2
  OUT_DT *nram_save =
      (OUT_DT *)((char *)max_box +
                 NFU_ALIGN_SIZE);  // offset two lines from max_box

#if __BANG_ARCH__ >= 300
  float max_box_x1 = 0;
  float max_box_y1 = 0;
  float max_box_x2 = 0;
  float max_box_y2 = 0;
#endif
  mluMemcpyDirection_t load_dir = SRAM2NRAM;
  mluMemcpyDirection_t store_dir = NRAM2SRAM;
  load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
  store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;

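  // Greedy NMS: each iteration finds the current highest-scoring box, emits
  // it, then zeroes the scores of all boxes whose IoU with it exceeds
  // thresh_iou so they can never be selected in a later iteration.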
  for (int keep = 0; keep < max_output_size;
       keep++) {  // loop until the max_score <= 0
    if (core_limit != 1) {
      __sync_cluster();  // sync before current loop
    }

    /******FIND MAX START******/
    int max_index = 0;         // the max score index
    int global_max_index = 0;  // for U1
    float max_area = 0;        // the max score area
    max_box[0] = 0;            // init 0
    findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr,
                   input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
                   input_offset, repeat, remain, remain_pad, max_seg_pad,
                   max_index);

    if (core_limit == 1) {
#if __BANG_ARCH__ >= 300
      calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
                 max_box_x2, max_box_y2);
#else
      calMaxArea(max_box, algo, offset, max_area);
#endif
      input_data_score[max_index] = 0;
      global_max_index = max_index;
    } else if (core_limit == 4) {
      __sync_cluster();
      findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit);

#if __BANG_ARCH__ >= 300
      calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
                 max_box_x2, max_box_y2);
#else
      calMaxArea(max_box, algo, offset, max_area);
#endif
      global_max_index = ((uint32_t *)(max_box + 5))[0];
      input_data_score[global_max_index] = 0;
    }
    // by now, we get: max_score|max_index|max_box|max_area
    /******FIND MAX END******/

    storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
                max_output_size, thresh_score, output_mode, nram_save_count,
                output_box_num);

    // if the max score <= 0, end
    if (core_limit == 1) {
      if (float(max_box[0]) <= thresh_score) {
        break;
      }
    } else {
      if (float(max_box[0]) <= thresh_score) {
        if (coreId == 0) {
          exit_flag[0] = 1;
        }
      }
      __sync_cluster();
      if (exit_flag[0] == 1) {
        break;
      }
    }
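    // With several cores, the exit decision is published through the shared
    // exit_flag in SRAM so that every core leaves the loop on the same
    // iteration.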
    /******NMS STORE END******/
#if __BANG_ARCH__ >= 300
    scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
                input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
                inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1,
                max_box_y1, max_box_x2, max_box_y2, nram_save,
                repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
                max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
                input_offset, offset, max_area, input_num_boxes, algo);
#else
    scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
                input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
                inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1],
                max_box[2], max_box[3], max_box[4], nram_save,
                repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
                max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
                input_offset, offset, max_area, input_num_boxes, algo);
#endif
  }  // for max_output_size
}

__mlu_global__ void MLUUnion1KernelNMS(
    const void *input_boxes, const void *input_confidence,
    const int input_num_boxes, const int max_output_size,
    const float iou_threshold, const float confidence_threshold,
    const int output_mode, void *workspace, void *result_num, void *output,
    const cnrtDataType_t data_type_input, const float offset, const int algo) {
  if (data_type_input == CNRT_FLOAT16) {
    __memcpy(workspace, input_confidence, input_num_boxes * sizeof(half),
             GDRAM2GDRAM);
  } else if (data_type_input == CNRT_FLOAT32) {
    __memcpy(workspace, input_confidence, input_num_boxes * sizeof(float),
             GDRAM2GDRAM);
  } else {
  }
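  // Scores live in the GDRAM workspace copy because nms_detection zeroes the
  // scores of suppressed boxes in place; the caller's score tensor is never
  // modified.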

  uint32_t output_box_num = 0;
  float *score_data = (float *)workspace;
  float *boxes_data = (float *)input_boxes;
  float *sram = (float *)sram_buffer;

  if (output_mode == 0) {
    if (data_type_input == CNRT_FLOAT32) {
      nms_detection(output_box_num, output_mode, (uint32_t *)output, score_data,
                    boxes_data, GDRAM, sram, taskDim, input_num_boxes,
                    max_output_size, iou_threshold, confidence_threshold,
                    offset, algo);
    } else {
      nms_detection(output_box_num, output_mode, (uint32_t *)output,
                    (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
                    taskDim, input_num_boxes, max_output_size, iou_threshold,
                    confidence_threshold, offset, algo);
    }
  } else {
    if (data_type_input == CNRT_FLOAT32) {
      nms_detection(output_box_num, output_mode, (float *)output, score_data,
                    boxes_data, GDRAM, sram, taskDim, input_num_boxes,
                    max_output_size, iou_threshold, confidence_threshold,
                    offset, algo);
    } else {
      nms_detection(output_box_num, output_mode, (half *)output,
                    (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
                    taskDim, input_num_boxes, max_output_size, iou_threshold,
                    confidence_threshold, offset, algo);
    }
  }
  ((uint32_t *)result_num)[0] = output_box_num;
}

template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection_ux(
    int32_t *exit_flag, uint32_t &output_box_num, OUT_DT *output_dram,
    IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
    const int input_num_boxes, const int max_output_size,
    const float thresh_iou, const float thresh_score, const float offset,
    const int output_mode, const int algo, char *cdma_gdram) {
  exit_flag[0] = 0;

  IN_DT *sram = (IN_DT *)sram_buffer;

  // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
  int nms_buffer_count1 = 9;
  // temp nram buffer to store selected target.
  int nram_save_limit_count = 256;
  float div_thresh_iou = 1.0 / thresh_iou;

  // input data ptr
  const IN_DT *input_x1_ptr = boxes_data;
  const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
  const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
  const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;

  int limit = 0;        // find limit when GDRAM or SRAM
  int max_seg_pad = 0;  // the max length every repeat
  int repeat = 0;
  int remain = 0;
  int remain_pad = 0;
  int nram_save_count = 0;

  if (output_mode == 0) {
    limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
             nram_save_limit_count * sizeof(OUT_DT)) /
            (nms_buffer_count1 * sizeof(IN_DT));
  } else {
    limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
             nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) /
            (nms_buffer_count1 * sizeof(IN_DT));
  }

  int input_offset = 0;
  int max_seg_iou_compute = 0;
  int repeat_iou_compute = 0;
  int remain_iou_compute = 0;
  int remain_pad_iou_compute = 0;

  getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset,
                     max_seg_pad, repeat, remain, remain_pad,
                     max_seg_iou_compute, repeat_iou_compute,
                     remain_iou_compute, remain_pad_iou_compute);
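  // Ux task split: boxes are divided across clusters first, then across the
  // compute cores inside each cluster; input_offset is this core's starting
  // box index.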
  // init the nram ptr
  IN_DT *score = (IN_DT *)nram_buffer;
  IN_DT *x1 = score + max_seg_pad;
  IN_DT *y1 = x1 + max_seg_pad;
  IN_DT *x2 = y1 + max_seg_pad;
  IN_DT *y2 = x2 + max_seg_pad;
  IN_DT *inter_x1 = y2 + max_seg_pad;
  IN_DT *inter_y1 = inter_x1 + max_seg_pad;
  IN_DT *inter_x2 = inter_y1 + max_seg_pad;
  IN_DT *inter_y2 = inter_x2 + max_seg_pad;
  IN_DT *max_box = inter_y2 + max_seg_pad;  // the max score, x1, y1, x2, y2
  OUT_DT *nram_save =
      (OUT_DT *)((char *)max_box +
                 NFU_ALIGN_SIZE);  // offset two lines from max_box
#if __BANG_ARCH__ >= 300
  float max_box_x1 = 0;
  float max_box_y1 = 0;
  float max_box_x2 = 0;
  float max_box_y2 = 0;
#endif
  mluMemcpyDirection_t load_dir = SRAM2NRAM;
  mluMemcpyDirection_t store_dir = NRAM2SRAM;
  load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
  store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;

  for (int keep = 0; keep < max_output_size;
       keep++) {  // loop until the max_score <= 0
    __sync_all();

    int max_index = 0;
    int global_max_index = 0;  // for Ux
    float max_area = 0;        // the max score area
    max_box[0] = 0;            // init 0

    if (coreId == 0) {
      findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr,
                     input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
                     input_offset, repeat, remain, remain_pad, max_seg_pad,
                     max_index);
      // copy max box info to sram
      __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
    }
    __sync_all();
#if __BANG_ARCH__ >= 590
    __memcpy((char *)cdma_gdram + REDUCE_NUM * clusterId * sizeof(IN_DT), sram,
             REDUCE_NUM * sizeof(IN_DT), SRAM2GDRAM);
    __sync_all();
    if (clusterId == 0 && coreId == 0) {
      __bang_write_zero(inter_x1, NMS_SIZE);
      __memcpy((char *)inter_x1, (char *)cdma_gdram, sizeof(IN_DT), GDRAM2NRAM,
               sizeof(IN_DT), REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
      __bang_max(max_box, inter_x1, NMS_SIZE);
      int max_cluster = (sizeof(IN_DT) == sizeof(half))
                            ? ((uint16_t *)max_box)[1]
                            : ((uint32_t *)max_box)[1];
      __memcpy((char *)cdma_gdram,
               (char *)cdma_gdram + max_cluster * REDUCE_NUM * sizeof(IN_DT),
               REDUCE_NUM * sizeof(IN_DT), GDRAM2GDRAM);
    }
    __sync_all();
    __memcpy(max_box, cdma_gdram, REDUCE_NUM * sizeof(IN_DT), GDRAM2NRAM);
#else
    findGlobalMaxBox(max_box, sram, inter_x1);
#endif
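    // At this point every cluster observes the same global max box: gathered
    // through GDRAM (CDMA) on >= 590 above, or through inter-cluster SRAM
    // copies in findGlobalMaxBox otherwise.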

#if __BANG_ARCH__ >= 300
    calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
               max_box_x2, max_box_y2);
#else
    calMaxArea(max_box, algo, offset, max_area);
#endif
    global_max_index = ((uint32_t *)(max_box + 5))[0];
    if (coreId != MEMORY_CORE) {
      score_data[global_max_index] = 0;
    }

    storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
                max_output_size, thresh_score, output_mode, nram_save_count,
                output_box_num);

    if (float(max_box[0]) <= thresh_score) {
      if (clusterId == 0 && coreId == 0) {
        exit_flag[0] = 1;  // dram
      }
    }
    __sync_all();
    if (exit_flag[0] == 1) {
      break;
    }
    /******NMS STORE END******/
#if __BANG_ARCH__ >= 300
    scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
                input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
                inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1,
                max_box_x2, max_box_y2, nram_save, repeat_iou_compute,
                remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
                max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
                max_area, input_num_boxes, algo);
#else
    scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
                input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
                inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2],
                max_box[3], max_box[4], nram_save, repeat_iou_compute,
                remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
                max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
                max_area, input_num_boxes, algo);
#endif
  }  // for max_output_size
}

__mlu_global__ void MLUUionXKernelNMS(
    const void *input_boxes, const void *input_confidence,
    const int input_num_boxes, const int max_output_size,
    const float iou_threshold, const float confidence_threshold,
    const float offset, const cnrtDataType_t data_type_input,
    const int output_mode, const int algo, void *workspace, void *result_num,
    void *output) {
  int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
  int32_t *exit_flag = (int32_t *)((char *)workspace +
                                   INFO_NUM * input_num_boxes * input_dwidth);
  char *cdma_addr = (char *)exit_flag + sizeof(int32_t);
  int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
  int available_sram_size = SIZE_SRAM_BUF - reduce_sram_size;

  int cluster_score_size = input_num_boxes * input_dwidth;
  int cluster_boxes_size = input_num_boxes * 4 * input_dwidth;
  char *sram_score = (char *)sram_buffer + reduce_sram_size;
  char *sram_boxes =
      (char *)sram_buffer + reduce_sram_size + cluster_score_size;
  Addr input_ram = GDRAM;
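  // Cache scores and boxes in SRAM when both fit beside the reduce area;
  // otherwise fall back to a GDRAM workspace copy of the scores and read the
  // boxes from GDRAM directly.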
  if ((cluster_score_size + cluster_boxes_size) < available_sram_size) {
    input_ram = SRAM;
    __memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM);
    __memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM);
  } else {
    __memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
  }
  __sync_cluster();

  uint32_t output_box_num = 0;
  float *score_data;
  float *boxes_data;
  score_data = (input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
  boxes_data = (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;

  if (output_mode == 0) {
    if (data_type_input == CNRT_FLOAT32) {
      nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
                       score_data, boxes_data, input_ram, input_num_boxes,
                       max_output_size, iou_threshold, confidence_threshold,
                       offset, output_mode, algo, cdma_addr);
    } else {
      nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
                       (half *)score_data, (half *)boxes_data, input_ram,
                       input_num_boxes, max_output_size, iou_threshold,
                       confidence_threshold, offset, output_mode, algo,
                       cdma_addr);
    }
  } else {
    if (data_type_input == CNRT_FLOAT32) {
      nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data,
                       boxes_data, input_ram, input_num_boxes, max_output_size,
                       iou_threshold, confidence_threshold, offset, output_mode,
                       algo, cdma_addr);
    } else {
      nms_detection_ux(exit_flag, output_box_num, (half *)output,
                       (half *)score_data, (half *)boxes_data, input_ram,
                       input_num_boxes, max_output_size, iou_threshold,
                       confidence_threshold, offset, output_mode, algo,
                       cdma_addr);
    }
  }
  ((uint32_t *)result_num)[0] = output_box_num;
}

void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
               const cnrtDataType_t data_type_input, const void *boxes_ptr,
               const void *scores_ptr, const int input_num_boxes,
               const int max_output_boxes, const float iou_threshold,
               const float offset, void *workspace_ptr, void *output_size_ptr,
               void *output_ptr) {
  switch (k_type) {
    default: { return; }
    case CNRT_FUNC_TYPE_BLOCK:
    case CNRT_FUNC_TYPE_UNION1: {
      MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
          (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
          max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
          /*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr,
          data_type_input, offset, /*algo=*/1);
    }; break;
    case CNRT_FUNC_TYPE_UNION2:
    case CNRT_FUNC_TYPE_UNION4:
    case CNRT_FUNC_TYPE_UNION8:
    case CNRT_FUNC_TYPE_UNION16: {
      MLUUionXKernelNMS<<<k_dim, k_type, queue>>>(
          (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
          max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset,
          data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr,
          output_size_ptr, output_ptr);
    }; break;
  }
}

@ -1,553 +0,0 @@
/*************************************************************************
 * Copyright (C) [2019-2022] by Cambricon, Inc.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#ifndef NMS_UTILS_HPP_
#define NMS_UTILS_HPP_
#include "common_mlu_helper.hpp"

#define NMS_SIZE (64)
#define NMS_UP(x, y) (x / y + (int)(x % y > 0)) * y
#define NMS_DOWN(x, y) (x / y) * y
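// NMS_UP/NMS_DOWN round x up/down to a multiple of y, e.g. NMS_UP(100, 64)
// == 128 and NMS_DOWN(100, 64) == 64; they pad segment lengths to the
// 64-element compute granularity.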
#define INFO_NUM (5)  // 5 means x1, x2, y1, y2 and score
#define MEMORY_CORE (0x80)
#define REDUCE_NUM \
  (7)  // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)

__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
  if (coreId != MEMORY_CORE) {
    __bang_lock(0, 0);
  }
#endif
}

__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
  if (coreId != MEMORY_CORE) {
    __bang_unlock(0, 0);
  }
#endif
}

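// computeReluN clamps nram_src into [0, threshold] (a "ReLU-N"); a zero
// threshold degenerates to a plain ReLU. The pre-300 fallback builds the
// clamp from elementwise compare and multiply primitives.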
template <typename T>
static __mlu_func__ void computeReluN(T *nram_dst, T *nram_src, void *nram_tmp,
                                      const int deal_num,
                                      const T threshold = 0) {
  if (threshold < 0) {
    return;
  }
  if (threshold) {
#if __BANG_ARCH__ >= 300
    __bang_relun(nram_dst, nram_src, deal_num, threshold);
#else
    int align_num = NFU_ALIGN_SIZE / sizeof(T);
    T *nram_aux_a = (T *)nram_tmp;
    T *nram_aux_b = nram_aux_a + deal_num;
    T *nram_zero = nram_aux_b + align_num;
    __bang_write_value(nram_aux_b, align_num, threshold);
    __bang_write_zero(nram_zero, align_num);
    __bang_cycle_lt((T *)nram_aux_a, nram_src, (T *)nram_aux_b, deal_num,
                    align_num);
    __bang_mul(nram_dst, nram_src, (T *)nram_aux_a, deal_num);
    __bang_cycle_eq((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_zero, deal_num,
                    align_num);
    __bang_cycle_mul((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_aux_b,
                     deal_num, align_num);
    __bang_add(nram_dst, nram_dst, (T *)nram_aux_a, deal_num);
    __bang_cycle_gt((T *)nram_aux_a, nram_dst, (T *)nram_zero, deal_num,
                    align_num);
    __bang_mul(nram_dst, nram_dst, (T *)nram_aux_a, deal_num);
#endif
  } else {
#if __BANG_ARCH__ >= 300
    __bang_relu(nram_dst, nram_src, deal_num);
#else
    __bang_active_relu(nram_dst, nram_src, deal_num);
#endif
  }
}

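// Example split (hypothetical numbers): with input_box_num = 1000 and
// core_limit = 4, each core owns len_core = 250 boxes; if limit = 300 then
// max_seg_pad = NMS_DOWN(300, 64) = 256, giving repeat = 0 full segments and
// a remain = 250 tail padded to remain_pad = 256.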
__mlu_func__ void getComputeParamsBlockOrU1(
    const int input_dwidth, const int input_box_num, const int limit,
    const int core_limit, int &input_offset, int &max_seg_pad, int &repeat,
    int &remain, int &remain_pad, int &max_seg_iou_compute,
    int &repeat_iou_compute, int &remain_iou_compute,
    int &remain_pad_iou_compute) {
  int avg_core = input_box_num / core_limit;
  int rem = input_box_num % core_limit;
  int len_core = avg_core + (coreId < rem ? 1 : 0);
  input_offset = avg_core * coreId + (coreId <= rem ? coreId : rem);
  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
  repeat = len_core / max_seg_pad;
  remain = len_core % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);

  // if datatype is fp16, we should convert to fp32 when computing iou
  max_seg_iou_compute = NMS_DOWN(max_seg_pad / (4 / input_dwidth), NMS_SIZE);
  repeat_iou_compute = len_core / max_seg_iou_compute;
  remain_iou_compute = len_core % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}

__mlu_func__ void getComputeParamsUx(
    const int input_dwidth, const int input_num_boxes, const int limit,
    int &input_offset, int &max_seg_pad, int &repeat, int &remain,
    int &remain_pad, int &max_seg_iou_compute, int &repeat_iou_compute,
    int &remain_iou_compute, int &remain_pad_iou_compute) {
  // data split
  int avg_cluster = input_num_boxes / clusterDim;
  int rem_cluster = input_num_boxes % clusterDim;
  int len_cluster = avg_cluster + (clusterId < rem_cluster);
  int cluster_offset = avg_cluster * clusterId +
                       (clusterId <= rem_cluster ? clusterId : rem_cluster);

  int avg_core = len_cluster / coreDim;
  int rem_core = len_cluster % coreDim;
  int len_core = avg_core + (coreId < rem_core);
  int core_offset =
      avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
  input_offset = cluster_offset + core_offset;

  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);

  // core 0 of each cluster calculates the max score index
  int max_index_len_core = avg_cluster + (clusterId < rem_cluster);
  repeat = max_index_len_core / max_seg_pad;
  remain = max_index_len_core % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);
  // if datatype is fp16, we should convert to fp32 when computing iou
  max_seg_iou_compute =
      NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE);
  repeat_iou_compute = len_core / max_seg_iou_compute;
  remain_iou_compute = len_core % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}

template <typename IN_DT>
__mlu_func__ void findGlobalMaxBox(IN_DT *max_box, IN_DT *sram,
                                   IN_DT *inter_x1) {
  // copy all partial max to the sram of cluster 0
  if (clusterId != 0) {
    __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
             SRAM2SRAM, 0);
  }
  __sync_all();

  // reduce between clusters to get the global max box
  if (clusterId == 0) {
    if (coreId == 0) {
      __bang_write_zero(inter_x1, NMS_SIZE);
      __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
               REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
      __bang_max(max_box, inter_x1, NMS_SIZE);
      int max_cluster = (sizeof(IN_DT) == sizeof(half))
                            ? ((uint16_t *)max_box)[1]
                            : ((uint32_t *)max_box)[1];
      __memcpy(max_box, sram + max_cluster * REDUCE_NUM,
               REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
      __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
    }
    __sync_cluster();
    if (coreId == 0x80 && clusterDim > 1) {
      // broadcast global max box to each cluster's sram
      for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
        __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
                 cluster_idx);
      }
    }
    __sync_cluster();
  }
  __sync_all();

  // copy the global max box to max_box
  __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
}

template <typename IN_DT>
__mlu_func__ void findCoreMaxBox(
    IN_DT *input_score_ptr, IN_DT *score, IN_DT *inter_x1, IN_DT *max_box,
    const IN_DT *input_x1_ptr, const IN_DT *input_y1_ptr,
    const IN_DT *input_x2_ptr, const IN_DT *input_y2_ptr,
    const mluMemcpyDirection_t load_dir, const int input_offset,
    const int repeat, const int remain, const int remain_pad,
    const int max_seg_pad, int &max_index) {
  if (coreId != 0x80) {
    for (int i = 0; i <= repeat; i++) {
      if (i == repeat && remain == 0) {
        break;
      }
      int seg_len = 0;  // the length every nms compute
      int cpy_len = 0;  // the length every nms memcpy
      i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad;
      i == repeat ? cpy_len = remain : cpy_len = max_seg_pad;
      /******NMS LOAD START******/
      __bang_write_zero(score, seg_len);
      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);

      /******NMS LOAD END******/

      __bang_max(inter_x1, score, seg_len);
      if (inter_x1[0] > max_box[0]) {
        max_box[0] = inter_x1[0];
        if (sizeof(IN_DT) == sizeof(half)) {
          max_index = ((uint16_t *)inter_x1)[1] + input_offset +
                      i * max_seg_pad;  // offset starts from head of input_data
        } else if (sizeof(IN_DT) == sizeof(float)) {
          max_index = ((uint32_t *)inter_x1)[1] + input_offset +
                      i * max_seg_pad;  // offset starts from head of input_data
        }
      }
    }  // for repeat
    // the max box's x1, y1, x2, y2 on every core
    max_box[1] = input_x1_ptr[max_index];
    max_box[2] = input_y1_ptr[max_index];
    max_box[3] = input_x2_ptr[max_index];
    max_box[4] = input_y2_ptr[max_index];
    ((uint32_t *)(max_box + 5))[0] = max_index;
  }
}

template <typename IN_DT>
__mlu_func__ void findClusterMaxBox(IN_DT *sram, IN_DT *max_box,
                                    IN_DT *inter_x1, IN_DT *input_data_score,
                                    const int core_limit) {
  // find the max with sram
  // copy every core's box info to sram, form: score---x1---y1---x2---y2---
  __memcpy(sram + REDUCE_NUM * coreId, max_box, REDUCE_NUM * sizeof(IN_DT),
           NRAM2SRAM);  // int32_t datatype
  __sync_cluster();

  // copy score from sram to nram and find the max
  __bang_write_zero(inter_x1, 64);
  __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
           REDUCE_NUM * sizeof(IN_DT), coreDim - 1);
  __bang_max(max_box, inter_x1, 64);
  int max_core = sizeof(IN_DT) == sizeof(half) ? ((uint16_t *)max_box)[1]
                                               : ((uint32_t *)max_box)[1];
  // copy the max box to max_box
  __memcpy(max_box, sram + max_core * REDUCE_NUM, REDUCE_NUM * sizeof(IN_DT),
           SRAM2NRAM);
}

/*****************************************************************************/
/*******************************CALCULATE MAX AREA****************************/
/*****************************************************************************/

template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area) {
  if (algo == 0 || offset == 0.0) {
    max_area = ((float)max_box[3] - (float)max_box[1]) *
               ((float)max_box[4] - (float)max_box[2]);
  } else {
    max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
               ((float)max_box[4] - (float)max_box[2] + offset);
  }
}

template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area, float &max_box_x1,
                             float &max_box_y1, float &max_box_x2,
                             float &max_box_y2) {
  // a stray inf can break the requirement of x1 <= x2, y1 <= y2,
  // so exchange the coordinates if that happens.
  max_box_x1 = float(max_box[1]);
  max_box_x2 = float(max_box[3]);
  if (max_box[1] > max_box[3]) {
    max_box_x1 = float(max_box[3]);
    max_box_x2 = float(max_box[1]);
  }
  max_box_y1 = float(max_box[2]);
  max_box_y2 = float(max_box[4]);
  if (max_box[2] > max_box[4]) {
    max_box_y1 = float(max_box[4]);
    max_box_y2 = float(max_box[2]);
  }
  if (algo == 0 || offset == 0.0) {
    max_area = (max_box_x2 - max_box_x1) * (max_box_y2 - max_box_y1);
  } else {
    max_area =
        (max_box_x2 - max_box_x1 + offset) * (max_box_y2 - max_box_y1 + offset);
  }
}

/***********************************************************************/
/*******************************STORE RESULT****************************/
/***********************************************************************/
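// storeResult buffers accepted boxes in NRAM (nram_save) and flushes to
// GDRAM only when the buffer is full, the score falls below thresh_score, or
// the final keep iteration is reached, avoiding many small NRAM2GDRAM copies.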
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save,
                              OUT_DT *&output_dram, const int keep,
                              const int nram_save_limit_count,
                              const int max_output_size,
                              const float thresh_score, const int output_mode,
                              int &nram_save_count, uint32_t &output_box_num) {
  /******NMS STORE START******/
  // store to nram
  if (float(max_box[0]) > thresh_score) {
    OUT_DT *save_ptr;
    int save_offset = 0;
    int save_str_num = 0;
    save_ptr = nram_save;
    save_offset = nram_save_count;
    save_str_num = nram_save_limit_count;
    if (clusterId == 0 && coreId == 0) {
      if (output_mode == 0) {  // index1, index2, ...
        save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
      } else if (output_mode == 1) {  // score, x1, y1, x2, y2
        __memcpy(save_ptr + save_offset * INFO_NUM, max_box,
                 INFO_NUM * sizeof(IN_DT), NRAM2NRAM, INFO_NUM * sizeof(IN_DT),
                 INFO_NUM * sizeof(IN_DT), 0);
      } else if (output_mode == 2) {  // score---, x1---, y1---, x2---, y2---
        __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), NRAM2NRAM,
                 save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), 4);
      }
    }
    nram_save_count++;
    output_box_num++;
  }

  // store to sram/gdram
  if (output_box_num != 0) {
    if ((nram_save_count == nram_save_limit_count) ||
        (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
      if (nram_save_count != 0) {
        if (clusterId == 0 && coreId == 0) {
          if (output_mode == 0) {  // index1, index2, ...
            pvLock();
            __memcpy(output_dram, nram_save, nram_save_count * sizeof(uint32_t),
                     NRAM2GDRAM);
            pvUnlock();
            output_dram += nram_save_count;
          } else if (output_mode == 1) {  // score, x1, y1, x2, y2
            pvLock();
            __memcpy(output_dram, nram_save,
                     nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
            pvUnlock();
            output_dram += nram_save_count * INFO_NUM;
          } else if (output_mode ==
                     2) {  // score---, x1---, y1---, x2---, y2---
            pvLock();
            __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
                     NRAM2GDRAM, max_output_size * sizeof(IN_DT),
                     nram_save_limit_count * sizeof(IN_DT), 4);
            pvUnlock();
            output_dram += nram_save_count;
          }
          nram_save_count = 0;
        }
      }
    }  // if move data nram->sram/gdram
  }    // if dst
}

template <typename IN_DT, typename OUT_DT>
__mlu_func__ void scoreUpdate(
    IN_DT *input_score_ptr, const mluMemcpyDirection_t load_dir,
    const mluMemcpyDirection_t store_dir, const IN_DT *input_x1_ptr,
    const IN_DT *input_y1_ptr, const IN_DT *input_x2_ptr,
    const IN_DT *input_y2_ptr, IN_DT *x1, IN_DT *y1, IN_DT *x2, IN_DT *y2,
    IN_DT *score, IN_DT *inter_x1, IN_DT *inter_y1, IN_DT *inter_x2,
    IN_DT *inter_y2, IN_DT *max_box, const float max_box_x1,
    const float max_box_y1, const float max_box_x2, const float max_box_y2,
    OUT_DT *nram_save, int repeat_iou_compute, int remain_iou_compute,
    int remain_pad_iou_compute, int max_seg_iou_compute, int max_seg_pad,
    const float thresh_iou, const float div_thresh_iou, const int input_offset,
    const float offset, const float max_area, const int input_num_boxes,
    const int algo) {
  for (int i = 0; i <= repeat_iou_compute; i++) {
    if (i == repeat_iou_compute && remain_iou_compute == 0) {
      break;
    }
    int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
                                            : max_seg_iou_compute;
    int cpy_len =
        (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
    /******NMS LOAD START******/
    int dt_offset = 0;
    if (sizeof(IN_DT) == sizeof(float)) {
      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      dt_offset = 0;
    } else if (sizeof(IN_DT) == sizeof(half)) {
      __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
               cpy_len * sizeof(IN_DT), 0);
      __bang_half2float((float *)score, (half *)x1, seg_len);
      dt_offset = max_seg_iou_compute;
    }
#if __BANG_ARCH__ >= 300
    __memcpy(inter_x1 + dt_offset,
             input_x1_ptr + input_offset + i * max_seg_iou_compute,
             cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
             input_num_boxes * sizeof(IN_DT), 3);

    if (sizeof(IN_DT) == sizeof(half)) {
      __bang_half2float((float *)inter_x1,
                        (half *)inter_x1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_y1,
                        (half *)inter_y1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_x2,
                        (half *)inter_x2 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)inter_y2,
                        (half *)inter_y2 + max_seg_iou_compute, seg_len);
    }
    // box transfer
    __bang_minequal((float *)x1, (float *)inter_x1, (float *)inter_x2, seg_len);
    __bang_maxequal((float *)x2, (float *)inter_x1, (float *)inter_x2, seg_len);
    __bang_minequal((float *)y1, (float *)inter_y1, (float *)inter_y2, seg_len);
    __bang_maxequal((float *)y2, (float *)inter_y1, (float *)inter_y2, seg_len);
    // 1. compute IOU
    // get the area_I
    __bang_maxeq_scalar((float *)inter_x1, (float *)x1, max_box_x1,
                        seg_len);  // inter_x1
    __bang_mineq_scalar((float *)inter_x2, (float *)x2, max_box_x2,
                        seg_len);  // inter_x2
    __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
    }
    computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
                 seg_len);  // inter_w
    __bang_maxeq_scalar((float *)inter_y1, (float *)y1, float(max_box_y1),
                        seg_len);  // inter_y1
    __bang_mineq_scalar((float *)inter_y2, (float *)y2, float(max_box_y2),
                        seg_len);  // inter_y2
    __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
    }
    computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
                 seg_len);  // inter_h
    __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
               seg_len);  // area_I
    // get the area of input_box: area = (x2 - x1) * (y2 - y1);
    if (algo == 1 && offset != 0.0) {
      __bang_fusion(FUSION_FSA, (float *)inter_y1, (float *)x2, (float *)x1,
                    offset, seg_len, seg_len);
      __bang_fusion(FUSION_FSA, (float *)inter_y2, (float *)y2, (float *)y1,
                    offset, seg_len, seg_len);
      __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
                 seg_len);  // area
    } else {
      __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
      __bang_fusion(FUSION_FSM, (float *)inter_x2, (float *)y2, (float *)y1,
                    (float *)inter_y1, seg_len, seg_len);
    }
    // get the area_U: area + max_area - area_I
    __bang_fusion(FUSION_FAS, (float *)inter_x2, (float *)inter_x2, max_area,
                  (float *)inter_x1, seg_len, seg_len);
    // 2. select the box
    // if IOU is greater than thresh, set the score to zero and abort it:
    // area_U > area_I * (1 / thresh)?
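    // Scaling area_I by 1 / thresh (or area_U by thresh when thresh <= 0)
    // turns the IoU test into a single vector compare and avoids an
    // elementwise division.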
    if (thresh_iou > 0.0) {
      __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
                        seg_len);
    } else {
      __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
                        seg_len);
    }
    // process for nan
    __bang_lt((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
    __bang_not((float *)inter_x1, (float *)inter_x1, seg_len);
    __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
    /******NMS COMPUTE END******/
#else
    __memcpy(x1 + dt_offset,
             input_x1_ptr + input_offset + i * max_seg_iou_compute,
             cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
             input_num_boxes * sizeof(IN_DT), 3);
    if (sizeof(IN_DT) == sizeof(half)) {
      __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, seg_len);
      __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, seg_len);
    }
    // 1. compute IOU
    // get the area_I
    __bang_write_value((float *)inter_y1, seg_len,
                       float(max_box[1]));  // max_x1
    __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
                    seg_len);  // inter_x1
    __bang_write_value((float *)inter_y2, seg_len,
                       float(max_box[3]));  // max_x2
    __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
                    seg_len);  // inter_x2
    __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
    }
    computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
                 seg_len);  // inter_w
    __bang_write_value((float *)inter_x2, seg_len,
                       float(max_box[2]));  // max_y1
    __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
                    seg_len);  // inter_y1
    __bang_write_value((float *)inter_x2, seg_len,
                       float(max_box[4]));  // max_y2
    __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
                    seg_len);  // inter_y2
    __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
               seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
    }
    computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
                 seg_len);  // inter_h
    __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
               seg_len);  // area_I
    // get the area of input_box: area = (x2 - x1) * (y2 - y1);
    __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
    __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
    if (algo == 1 && offset != 0.0) {
      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
      __bang_add_scalar((float *)inter_y2, (float *)inter_y2, offset, seg_len);
    }
    __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
               seg_len);  // area
    // get the area_U: area + max_area - area_I
    __bang_add_scalar((float *)inter_x2, (float *)inter_x2, float(max_area),
                      seg_len);
    __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
               seg_len);  // area_U
    // 2. select the box
    // if IOU is greater than thresh, set the score to zero and abort it:
    // area_U > area_I * (1 / thresh)?
    if (thresh_iou > 0.0) {
      __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
                        seg_len);
    } else {
      __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
                        seg_len);
    }
    __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
    __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
    /******NMS COMPUTE END******/
#endif
    // update the score
    if (sizeof(IN_DT) == sizeof(half)) {
      convertFloat2half((half *)score, (float *)score, seg_len);
    }
    pvLock();
    __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
             cpy_len * sizeof(IN_DT), store_dir, cpy_len * sizeof(IN_DT),
             cpy_len * sizeof(IN_DT), 0);
    pvUnlock();
  }
}

#endif  // NMS_UTILS_HPP_

@ -1,493 +0,0 @@
/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "common_mlu_helper.hpp"

#define ROI_OFFSET 5

__nram__ char buffer[MAX_NRAM_SIZE];

namespace forward {
template <typename T>
__mlu_func__ void bilinearInterpolate(const int input_height,
                                      const int input_width, T y, T x, T *w1,
                                      T *w2, T *w3, T *w4, int *x_low,
                                      int *x_high, int *y_low, int *y_high,
                                      bool *empty) {
  // deal with cases where the sampling point falls outside the feature map
  if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) {
    *empty = true;
    return;
  }

  if (y <= 0) y = 0;
  if (x <= 0) x = 0;

  int y_low_ = int(y);
  int x_low_ = int(x);

  if (y_low_ >= input_height - 1) {
    *y_high = y_low_ = input_height - 1;
    y = (T)y_low_;
  } else {
    *y_high = y_low_ + 1;
  }

  if (x_low_ >= input_width - 1) {
    *x_high = x_low_ = input_width - 1;
    x = T(x_low_);
  } else {
    *x_high = x_low_ + 1;
  }

  *y_low = y_low_;
  *x_low = x_low_;

  T ly = y - y_low_;
  T lx = x - x_low_;
  T hy = 1.0 - ly;
  T hx = 1.0 - lx;
  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
  return;
}

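// The four weights are the standard bilinear coefficients: each one is the
// area of the sub-rectangle opposite its corner, so w1 + w2 + w3 + w4 == 1.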
template <typename T>
__mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
                                 T *nram_out, const int roi_bin_grid_h,
                                 const int roi_bin_grid_w, const T roi_start_h,
                                 const T roi_start_w, const int ph,
                                 const int pw, const T bin_size_h,
                                 const T bin_size_w, const float count,
                                 const int input_height, const int input_width,
                                 const int channels, const int cyc_num,
                                 const int max_elements) {
  int cyc_channel = max_elements;

  for (int i = 0; i < cyc_num; i++) {
    int real_channel =
        (i == cyc_num - 1) ? channels - i * cyc_channel : cyc_channel;
    int align_channel = PAD_UP(real_channel, NFU_ALIGN_SIZE / sizeof(T));
    __bang_write_zero(nram_out, align_channel);
    uint32_t real_size = real_channel * sizeof(T);

    int iy, ix;
    for (iy = 0; iy < roi_bin_grid_h; iy++) {
      // 1. compute the coordinates of the y axis in the current roi_bin_grid_h
      T y = roi_start_h + ph * bin_size_h +
            (T)(iy + 0.5) * bin_size_h / (T)(roi_bin_grid_h);
      for (ix = 0; ix < roi_bin_grid_w; ix++) {
        // 2. compute the coordinates of the x axis in the current
        //    roi_bin_grid_w
        T x = roi_start_w + pw * bin_size_w +
              (T)(ix + 0.5) * bin_size_w / (T)(roi_bin_grid_w);

        // 3. compute the four weights (w1, w2, w3 and w4), the height (y_low
        //    and y_high) and width (x_low and x_high) of input feature map in
        //    the current roi bin grid, and the flag (empty) which shows if x, y
        //    are out of input feature map ranges
        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bool empty = false;

        bilinearInterpolate(input_height, input_width, y, x, &w1, &w2, &w3, &w4,
                            &x_low, &x_high, &y_low, &y_high, &empty);

        // 4. compute interpolation of the current roi bin grid
        //    tmp_cyc1, tmp_cyc2, tmp_cyc3 and tmp_cyc4 store the input values
        //    to compute the interpolation, and are then reused to compute
        //    the argmax_x and argmax_y.
        T *tmp_cyc1 = nram_in + cyc_channel;
        T *tmp_cyc2 = nram_in + cyc_channel * 2;
        T *tmp_cyc3 = nram_in + cyc_channel * 3;
        T *tmp_cyc4 = nram_in + cyc_channel * 4;

        if (empty) {  // the sampling point is out of range
          __bang_write_zero(nram_in, align_channel);
        } else {
          __bang_write_zero(nram_in, align_channel);
          uint32_t offset1 = (y_low * input_width + x_low) * channels;
          uint32_t offset2 = (y_low * input_width + x_high) * channels;
          uint32_t offset3 = (y_high * input_width + x_low) * channels;
          uint32_t offset4 = (y_high * input_width + x_high) * channels;
          T *input1 = (T *)input_core + offset1 + i * cyc_channel;
          T *input2 = (T *)input_core + offset2 + i * cyc_channel;
          T *input3 = (T *)input_core + offset3 + i * cyc_channel;
          T *input4 = (T *)input_core + offset4 + i * cyc_channel;

          // load the four pixels (p1, p2, p3 and p4) of input feature map to
          // compute interpolation
          __memcpy(tmp_cyc1, input1, real_size, GDRAM2NRAM);
          __memcpy(tmp_cyc2, input2, real_size, GDRAM2NRAM);
          __memcpy(tmp_cyc3, input3, real_size, GDRAM2NRAM);
          __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);

          // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
          __bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel);
          __bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel);
          __bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel);
          __bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel);

          __bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
          __bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
          __bang_add(nram_in, tmp_cyc3, nram_in, align_channel);
          __bang_add(nram_in, tmp_cyc4, nram_in, align_channel);
        }
        // 5. compute sum value and corresponding coordinates of x axis and y
        //    axis. Update the sum value.
        __bang_add(nram_out, nram_in, nram_out, align_channel);
      }  // loop_roi_grid_w
    }    // loop_roi_grid_h
    T count_value = (T)(1.0 / count);
    __bang_mul_scalar(nram_out, nram_out, count_value, align_channel);
    __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
  }  // loop_cyc_num
}

template <typename T>
__mlu_func__ void roialignForwardAvg(
    T *input, T *rois, T *output, const bool aligned, const int channels,
    const int pooled_height, const int pooled_width, const int input_height,
    const int input_width, const int sampling_ratio, const T spatial_scale,
    const int num_rois) {
  // find limit for channel; the nram space is divided into 6 parts:
  // input, 4 weights to compute the interpolation (w1, w2, w3, w4), output

  // max_elements : 300 : float datatype : 27296, half datatype : 54592
  // max_elements : 200 : float datatype : 16384, half datatype : 32768
  int max_elements = (PAD_DOWN(MAX_NRAM_SIZE / 6, NFU_ALIGN_SIZE)) / sizeof(T);
  int cyc_num = channels / max_elements + (int)(channels % max_elements != 0);
  T offset = aligned ? (T)0.5 : (T)0.0;
  int task_num = num_rois * pooled_height * pooled_width;
  T *nram_out = (T *)buffer;
  T *nram_in = nram_out + max_elements;
  if (task_num < taskDim) {
    if (taskId >= task_num) {
      return;
    }
  }

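  // Each task owns every taskDim-th output bin (round-robin), spreading the
  // num_rois * pooled_height * pooled_width bins evenly across the cores.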
  for (int bin_idx = taskId; bin_idx < task_num; bin_idx = bin_idx + taskDim) {
    if (bin_idx >= task_num) {
      return;
    }

    // (n, ph, pw) locates one channel vector in the pooled output
    int pw = bin_idx % pooled_width;
    int ph = (bin_idx / pooled_width) % pooled_height;
    int n = bin_idx / pooled_width / pooled_height;

    T *roi_id_tmp = rois + n * ROI_OFFSET;
    // 1. compute width and height of roi region.
    int batch_idx = (int)roi_id_tmp[0];
    T roi_x1 = roi_id_tmp[1];
    T roi_y1 = roi_id_tmp[2];
    T roi_x2 = roi_id_tmp[3];
    T roi_y2 = roi_id_tmp[4];
    T roi_start_w = roi_x1 * spatial_scale - offset;
    T roi_start_h = roi_y1 * spatial_scale - offset;
    T roi_end_w = roi_x2 * spatial_scale - offset;
    T roi_end_h = roi_y2 * spatial_scale - offset;
    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;

    if (!aligned) {
      roi_width = roi_width > (T)(1.0) ? roi_width : (T)(1.0);
      roi_height = roi_height > (T)(1.0) ? roi_height : (T)(1.0);
    }

    // 2. compute float-type width and height of roi bin region.
    T bin_size_w = (T)roi_width / (T)pooled_width;
    T bin_size_h = (T)roi_height / (T)pooled_height;

    // 3. compute int-type width and height of roi bin region.
    int roi_bin_grid_h, roi_bin_grid_w;
    roi_bin_grid_h = (sampling_ratio > 0)
                         ? sampling_ratio
                         : int(ceilf(roi_height / pooled_height));
    roi_bin_grid_w = (sampling_ratio > 0)
                         ? sampling_ratio
                         : int(ceilf(roi_width / pooled_width));
    float count = (float)((roi_bin_grid_h * roi_bin_grid_w) > 1
                              ? roi_bin_grid_h * roi_bin_grid_w
                              : 1.0);
    T *input_core = input + batch_idx * channels * input_width * input_height;
    T *output_core = output + bin_idx * channels;
    // 4. compute avg value and corresponding coordinates of x axis and y axis.
    computeChannel(input_core, nram_in, output_core, nram_out, roi_bin_grid_h,
                   roi_bin_grid_w, roi_start_h, roi_start_w, ph, pw, bin_size_h,
                   bin_size_w, count, input_height, input_width, channels,
                   cyc_num, max_elements);
  }
}

__mlu_global__ void MLUUnion1KernelRoiAlignAvg(
    const void *input, const void *rois, const int channels, const bool aligned,
    const int pooled_height, const int pooled_width, const int input_height,
    const int input_width, const int sampling_ratio, const float spatial_scale,
    const int num_rois, const cnrtDataType_t data_type, void *output) {
  // make sure that memcore is not used
  if (coreId == 0x80) {
    return;
  }

  switch (data_type) {
    case CNRT_FLOAT16: {
      roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
                         channels, pooled_height, pooled_width, input_height,
                         input_width, sampling_ratio, (half)spatial_scale,
                         num_rois);
    }; break;
    case CNRT_FLOAT32: {
      roialignForwardAvg((float *)input, (float *)rois, (float *)output,
                         aligned, channels, pooled_height, pooled_width,
                         input_height, input_width, sampling_ratio,
                         (float)spatial_scale, num_rois);
    }; break;
    default:
      break;
  }

  return;
}
}  // namespace forward

namespace backward {
__mlu_func__ void bilinearInterpolateGradient(int height, int width, float y,
                                              float x, float *w1, float *w2,
                                              float *w3, float *w4, int *x_low,
                                              int *x_high, int *y_low,
                                              int *y_high) {
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    *w1 = 0.0, *w2 = 0.0, *w3 = 0.0, *w4 = 0.0;
    *x_low = -1, *x_high = -1, *y_low = -1, *y_high = -1;
    return;
  }
  if (y <= 0) {
    y = 0;
  }
  if (x <= 0) {
    x = 0;
  }
  *y_low = (int)y;
  *x_low = (int)x;
  if (*y_low >= height - 1) {
    *y_high = height - 1, *y_low = height - 1;
    y = (float)(*y_low);
  } else {
    *y_high = *y_low + 1;
  }
  if (*x_low >= width - 1) {
    *x_high = width - 1, *x_low = width - 1;
    x = (float)(*x_low);
  } else {
    *x_high = *x_low + 1;
  }
  float ly = y - *y_low, lx = x - *x_low;
  float hy = 1.0 - ly, hx = 1.0 - lx;
  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
  return;
}

template <typename T>
__mlu_func__ void unionRoiAlignBp(
    T *grads, T *boxes, T *grads_image, const int boxes_num, const int hi,
    const int wi, const int c, const int no, const int ho, const int wo,
    const float spatial_scale, const int sampling_ratio, const bool aligned) {
  int c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T));
  int deal_all = boxes_num * hi * wi;
  int deal_this_core = deal_all / taskDim + (int)(taskId < deal_all % taskDim);
  for (int i = 0; i < deal_this_core; ++i) {
    int bhw_id = i * taskDim + taskId;
    int box_id = bhw_id / (hi * wi);
    int ih = (bhw_id / wi) % hi;
    int iw = bhw_id % wi;
    T *box = boxes + box_id * 5;
    int image_id = (int)box[0];
    T *image_offset = grads_image + image_id * ho * wo * c;
    T *grads_ = grads + box_id * hi * wi * c + ih * wi * c + iw * c;

    float offset = aligned ? 0.5 : 0.0;
    float x1 = box[1] * spatial_scale - offset;
    float y1 = box[2] * spatial_scale - offset;
    float x2 = box[3] * spatial_scale - offset;
    float y2 = box[4] * spatial_scale - offset;
    float roi_width = x2 - x1;
    float roi_height = y2 - y1;
    if (!aligned) {
      roi_width = (roi_width > 1.0) ? roi_width : 1.0;
      roi_height = (roi_height > 1.0) ? roi_height : 1.0;
    }
    float bin_size_h = roi_height / hi;
    float bin_size_w = roi_width / wi;

    int roi_grid_h =
        (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_height / hi);
    int roi_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_width / wi);
    const T count = roi_grid_h * roi_grid_w;
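    // Gradient scatter: each sampled grad vector is split into four corner
    // contributions (w1..w4), scaled by 1 / count, and accumulated into
    // grads_image with atomic adds, since overlapping boxes may touch the
    // same pixels.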
    if (c_align * sizeof(T) * 2 <= MAX_NRAM_SIZE) {
      for (int iy = 0; iy < roi_grid_h; ++iy) {
        const float y =
            y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h;
        for (int ix = 0; ix < roi_grid_w; ++ix) {
          const float x =
              x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w;
          float w1, w2, w3, w4;
          int x_low, x_high, y_low, y_high;
          bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low,
                                      &x_high, &y_low, &y_high);
          if (x_low >= 0 && y_low >= 0) {
            __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1,
                              c_align);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_low * wo * c + x_low * c,
                              (T *)buffer + c_align, c);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2,
                              c_align);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_low * wo * c + x_high * c,
                              (T *)buffer + c_align, c);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3,
                              c_align);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_high * wo * c + x_low * c,
                              (T *)buffer + c_align, c);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4,
                              c_align);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_high * wo * c + x_high * c,
                              (T *)buffer + c_align, c);
          }  // x_low && y_low
        }    // ix
      }      // iy
    } else {
      for (int iy = 0; iy < roi_grid_h; ++iy) {
        const float y =
            y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h;
        for (int ix = 0; ix < roi_grid_w; ++ix) {
          const float x =
              x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w;
          float w1, w2, w3, w4;
          int x_low, x_high, y_low, y_high;
          bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low,
                                      &x_high, &y_low, &y_high);
          if (x_low >= 0 && y_low >= 0) {
            int deal_once =
                PAD_DOWN(MAX_NRAM_SIZE / 2, NFU_ALIGN_SIZE) / sizeof(T);
            int c_repeat = c / deal_once + (int)(c % deal_once != 0);
            for (int i = 0; i < c_repeat; ++i) {
              int deal_c = deal_once;
              int align_c = deal_once;
              if (i == c_repeat - 1) {
                deal_c = c - i * deal_once;
                align_c = c_align - i * deal_once;
              }
              __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
                       GDRAM2NRAM);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1,
                                align_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_low * wo * c + x_low * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2,
                                align_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_low * wo * c + x_high * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3,
                                align_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_high * wo * c + x_low * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4,
                                align_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
|
||||
1 / count, align_c);
|
||||
__bang_atomic_add(
|
||||
(T *)buffer + align_c,
|
||||
image_offset + y_high * wo * c + x_high * c + i * deal_once,
|
||||
(T *)buffer + align_c, deal_c);
|
||||
} // for c_repeat
|
||||
} // x_low >= 0 && y_low >= 0
|
||||
} // ix
|
||||
} // iy
|
||||
} // if c
|
||||
} // i
|
||||
}
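
/* [editor sketch] Per channel, the vectorized scatter above reduces to the
 * scalar rule below: each of the four corners receives its bilinear weight
 * times the incoming bin gradient, averaged over the sampling-grid count.
 * The name (scatterCorner) is ours; __bang_atomic_add makes the same update
 * safe when several cores hit one output pixel. */
static inline void scatterCorner(float *grad_image_px, float w, float count,
                                 const float *grad_bin, int c) {
  for (int ch = 0; ch < c; ++ch) {
    grad_image_px[ch] += w / count * grad_bin[ch];  // one corner, all channels
  }
}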

__mlu_global__ void MLUUnion1KernelRoiAlignBackward(
    const void *grads, const void *boxes, void *grads_image,
    const cnrtDataType_t dtype, const int boxes_num, const int hi, const int wi,
    const int c, const int no, const int ho, const int wo,
    const float spatial_scale, const int sampling_ratio, const bool aligned) {
  // make sure that memcore is not used
  if (coreId == 0x80) {
    return;
  }
  switch (dtype) {
    case CNRT_FLOAT16: {
      unionRoiAlignBp((half *)grads, (half *)boxes, (half *)grads_image,
                      boxes_num, hi, wi, c, no, ho, wo, spatial_scale,
                      sampling_ratio, aligned);
    }; break;
    case CNRT_FLOAT32: {
      unionRoiAlignBp((float *)grads, (float *)boxes, (float *)grads_image,
                      boxes_num, hi, wi, c, no, ho, wo, spatial_scale,
                      sampling_ratio, aligned);
    }; break;
    default: { return; }
  }
}
}  // namespace backward

void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                    cnrtQueue_t queue, const cnrtDataType_t d_type,
                    const void *input, const void *rois, const int channels,
                    const bool aligned, const int pooled_height,
                    const int pooled_width, const int input_height,
                    const int input_width, const int sampling_ratio,
                    const float spatial_scale, const int num_rois,
                    void *output) {
  forward::MLUUnion1KernelRoiAlignAvg<<<k_dim, k_type, queue>>>(
      input, rois, channels, aligned, pooled_height, pooled_width, input_height,
      input_width, sampling_ratio, spatial_scale, num_rois, d_type, output);
}

void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                            cnrtQueue_t queue, const cnrtDataType_t dtype,
                            const void *grads, const void *boxes,
                            void *grads_image, const int boxes_num,
                            const int hi, const int wi, const int c,
                            const int no, const int ho, const int wo,
                            const float spatial_scale, const int sampling_ratio,
                            const bool aligned) {
  backward::MLUUnion1KernelRoiAlignBackward<<<k_dim, k_type, queue>>>(
      grads, boxes, grads_image, dtype, boxes_num, hi, wi, c, no, ho, wo,
      spatial_scale, sampling_ratio, aligned);
}
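
/* [editor note] A sketch of how callers are expected to size the launch
 * before invoking these wrappers. The Union1 dimensioning mirrors the policy
 * functions used elsewhere in this patch; the helper name
 * (policyFuncRoiAlign) is ours and the exact policy mmcv uses may differ. */
static void policyFuncRoiAlign(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
  k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);  // cores/cluster
  k_dim->y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);     // clusters
  k_dim->z = 1;
  *k_type = CNRT_FUNC_TYPE_UNION1;  // cores of one cluster cooperate
}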
@ -1,649 +0,0 @@
/*************************************************************************
 * Copyright (C) 2022 by Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/

#include "common_mlu_helper.hpp"

__nram__ char nram_buffer[MAX_NRAM_SIZE];

__mlu_func__ void floor(float *dst_ram, float *src_ram, int size) {
#if (__BANG_ARCH__ >= 322)
  __bang_floor(dst_ram, src_ram, size);  // this BANG interface handles
                                         // nan/inf for now
#else
  int16 *mid = (int16 *)(dst_ram + size / 2);
  __bang_float2int16_dn(mid, (float *)src_ram, size, 0);
  __bang_int162float((float *)dst_ram, mid, size, 0);
#endif
}
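
/* [editor sketch] A plain-C++ model of the pre-322 fallback above: floor() is
 * emulated by a round-down float->int16 conversion and back, so it is only
 * exact while values fit in int16 range (|v| < 32768). The name
 * (floor_via_int16) is ours; we assume that range holds for voxel indices. */
static inline float floor_via_int16(float v) {
  int i = (int)v;         // C truncation rounds toward zero
  if ((float)i > v) --i;  // adjust negatives: models __bang_float2int16_dn
  return (float)i;        // floor_via_int16(-1.5f) == -2.0f
}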

__mlu_func__ void computeDynamicVoxelize(
    char *points_x, char *points_y, char *points_z, char *auxiliary_a,
    char *auxiliary_b, char *auxiliary_c, const float coors_x_min,
    const float coors_y_min, const float coors_z_min, const float voxel_x,
    const float voxel_y, const float voxel_z, const int32_t grid_x,
    const int32_t grid_y, const int32_t grid_z, const int32_t deal_num) {
  // x - coors_x_min
  __bang_sub_scalar((float *)points_x, (float *)points_x, coors_x_min,
                    deal_num);
  // (x - coors_x_min) / voxel_x
  __bang_mul_scalar((float *)points_x, (float *)points_x, 1.0 / voxel_x,
                    deal_num);

  // y - coors_y_min
  __bang_sub_scalar((float *)points_y, (float *)points_y, coors_y_min,
                    deal_num);
  // (y - coors_y_min) / voxel_y
  __bang_mul_scalar((float *)points_y, (float *)points_y, 1.0 / voxel_y,
                    deal_num);

  // z - coors_z_min
  __bang_sub_scalar((float *)points_z, (float *)points_z, coors_z_min,
                    deal_num);
  // (z - coors_z_min) / voxel_z
  __bang_mul_scalar((float *)points_z, (float *)points_z, 1.0 / voxel_z,
                    deal_num);

#if __BANG_ARCH__ >= 322
  // c_x = floor((x - coors_x_min) / voxel_x)
  __bang_floor((float *)auxiliary_a, (float *)points_x, deal_num);
  __bang_float2int32((int32_t *)points_x, (float *)auxiliary_a, deal_num, 0);
  // c_y = floor((y - coors_y_min) / voxel_y)
  __bang_floor((float *)auxiliary_a, (float *)points_y, deal_num);
  __bang_float2int32((int32_t *)points_y, (float *)auxiliary_a, deal_num, 0);
  // c_z = floor((z - coors_z_min) / voxel_z)
  __bang_floor((float *)auxiliary_a, (float *)points_z, deal_num);
  __bang_float2int32((int32_t *)points_z, (float *)auxiliary_a, deal_num, 0);
  // c_x >= 0
  __bang_ge_scalar((int32_t *)auxiliary_b, (int32_t *)points_x, (int32_t)0,
                   deal_num);
  // c_x < grid_x
  __bang_lt_scalar((int32_t *)auxiliary_c, (int32_t *)points_x, grid_x,
                   deal_num);
  // 0 <= c_x < grid_x
  __bang_and((int32_t *)auxiliary_a, (int32_t *)auxiliary_b,
             (int32_t *)auxiliary_c, deal_num);
  // c_y >= 0
  __bang_ge_scalar((int32_t *)auxiliary_b, (int32_t *)points_y, (int32_t)0,
                   deal_num);
  // c_y < grid_y
  __bang_lt_scalar((int32_t *)auxiliary_c, (int32_t *)points_y, grid_y,
                   deal_num);
  // 0 <= c_y < grid_y
  __bang_and((int32_t *)auxiliary_b, (int32_t *)auxiliary_b,
             (int32_t *)auxiliary_c, deal_num);
  // c_x >= 0 && c_x < grid_x && c_y >= 0 && c_y < grid_y
  __bang_and((int32_t *)auxiliary_a, (int32_t *)auxiliary_a,
             (int32_t *)auxiliary_b, deal_num);
  // c_z >= 0
  __bang_ge_scalar((int32_t *)auxiliary_b, (int32_t *)points_z, (int32_t)0,
                   deal_num);
  // c_z < grid_z
  __bang_lt_scalar((int32_t *)auxiliary_c, (int32_t *)points_z, grid_z,
                   deal_num);
  // 0 <= c_z < grid_z
  __bang_and((int32_t *)auxiliary_b, (int32_t *)auxiliary_b,
             (int32_t *)auxiliary_c, deal_num);
  // 0 <= c_x < grid_x && 0 <= c_y < grid_y && 0 <= c_z < grid_z
  __bang_and((int32_t *)auxiliary_a, (int32_t *)auxiliary_a,
             (int32_t *)auxiliary_b, deal_num);
  __bang_not((int32_t *)auxiliary_c, (int32_t *)auxiliary_a, deal_num);

  __bang_mul((int32_t *)points_x, (int32_t *)points_x, (int32_t *)auxiliary_a,
             deal_num);
  __bang_mul_scalar((int32_t *)auxiliary_b, (int32_t *)auxiliary_c,
                    (int32_t)(-1), deal_num);
  __bang_add((int32_t *)points_x, (int32_t *)points_x, (int32_t *)auxiliary_b,
             deal_num);
  __bang_mul((int32_t *)points_y, (int32_t *)points_y, (int32_t *)auxiliary_a,
             deal_num);
  __bang_add((int32_t *)points_y, (int32_t *)points_y, (int32_t *)auxiliary_b,
             deal_num);
  __bang_mul((int32_t *)points_z, (int32_t *)points_z, (int32_t *)auxiliary_a,
             deal_num);
  __bang_add((int32_t *)points_z, (int32_t *)points_z, (int32_t *)auxiliary_b,
             deal_num);
#else
  // c_x >= 0
  __bang_ge_scalar((float *)auxiliary_b, (float *)points_x, (float)0, deal_num);
  // c_x < grid_x
  __bang_write_value((float *)auxiliary_a, deal_num, (float)grid_x);
  __bang_lt((float *)auxiliary_c, (float *)points_x, (float *)auxiliary_a,
            deal_num);
  // 0 <= c_x < grid_x
  __bang_and((float *)auxiliary_a, (float *)auxiliary_b, (float *)auxiliary_c,
             deal_num);
  // c_y >= 0
  __bang_ge_scalar((float *)auxiliary_b, (float *)points_y, (float)0, deal_num);
  // c_y < grid_y
  __bang_write_value((float *)auxiliary_c, deal_num, (float)grid_y);
  __bang_lt((float *)auxiliary_c, (float *)points_y, (float *)auxiliary_c,
            deal_num);
  // 0 <= c_y < grid_y
  __bang_and((float *)auxiliary_b, (float *)auxiliary_b, (float *)auxiliary_c,
             deal_num);
  // c_x >= 0 && c_x < grid_x && c_y >= 0 && c_y < grid_y
  __bang_and((float *)auxiliary_a, (float *)auxiliary_a, (float *)auxiliary_b,
             deal_num);
  // c_z >= 0
  __bang_ge_scalar((float *)auxiliary_b, (float *)points_z, (float)0, deal_num);
  // c_z < grid_z
  __bang_write_value((float *)auxiliary_c, deal_num, (float)grid_z);
  __bang_lt((float *)auxiliary_c, (float *)points_z, (float *)auxiliary_c,
            deal_num);
  // 0 <= c_z < grid_z
  __bang_and((float *)auxiliary_b, (float *)auxiliary_b, (float *)auxiliary_c,
             deal_num);
  // 0 <= c_x < grid_x && 0 <= c_y < grid_y && 0 <= c_z < grid_z
  __bang_and((float *)auxiliary_a, (float *)auxiliary_a, (float *)auxiliary_b,
             deal_num);
  __bang_not((float *)auxiliary_c, (float *)auxiliary_a, deal_num);

  __bang_mul((float *)points_x, (float *)points_x, (float *)auxiliary_a,
             deal_num);
  __bang_mul_scalar((float *)auxiliary_b, (float *)auxiliary_c, (float)(-1.0),
                    deal_num);
  __bang_add((float *)points_x, (float *)points_x, (float *)auxiliary_b,
             deal_num);
  __bang_mul((float *)points_y, (float *)points_y, (float *)auxiliary_a,
             deal_num);
  __bang_add((float *)points_y, (float *)points_y, (float *)auxiliary_b,
             deal_num);
  __bang_mul((float *)points_z, (float *)points_z, (float *)auxiliary_a,
             deal_num);
  __bang_add((float *)points_z, (float *)points_z, (float *)auxiliary_b,
             deal_num);

  floor((float *)auxiliary_a, (float *)points_x, deal_num);
  convertFloat2Int((int32_t *)points_x, (float *)auxiliary_b,
                   (float *)auxiliary_a, (float *)auxiliary_c, deal_num);

  floor((float *)auxiliary_a, (float *)points_y, deal_num);
  convertFloat2Int((int32_t *)points_y, (float *)auxiliary_b,
                   (float *)auxiliary_a, (float *)auxiliary_c, deal_num);

  floor((float *)auxiliary_a, (float *)points_z, deal_num);
  convertFloat2Int((int32_t *)points_z, (float *)auxiliary_b,
                   (float *)auxiliary_a, (float *)auxiliary_c, deal_num);
#endif
}
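
/* [editor sketch] Per point, the vector code above implements this scalar
 * rule: quantize each coordinate to a voxel index with floor(), then replace
 * the whole triple by -1 if any axis falls outside the grid. Host-side model;
 * voxelIndex is our name and floorf requires <math.h>. */
static inline int voxelIndex(float v, float v_min, float voxel_size,
                             int grid) {
  int c = (int)floorf((v - v_min) / voxel_size);
  return (c >= 0 && c < grid) ? c : -1;  // -1 marks an out-of-range point
}
// a point is kept only if voxelIndex() is non-negative on all of x, y and z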

__mlu_func__ void computePoint2Voxel(
    char *coors_x, char *coors_y, char *coors_z, char *src_addition,
    char *dst_addition, char *dst, const int32_t c_x, const int32_t c_y,
    const int32_t c_z, const int32_t max_points, int32_t *num,
    int32_t *first_point, const int32_t deal_idx, const int32_t deal_num) {
#if __BANG_ARCH__ >= 322
  __bang_eq_scalar((int32_t *)coors_x, (int32_t *)coors_x, c_x, deal_num);
  __bang_eq_scalar((int32_t *)coors_y, (int32_t *)coors_y, c_y, deal_num);
  __bang_eq_scalar((int32_t *)coors_z, (int32_t *)coors_z, c_z, deal_num);
  __bang_mul((int32_t *)coors_x, (int32_t *)coors_x, (int32_t *)coors_y,
             deal_num);
  __bang_mul((int32_t *)coors_x, (int32_t *)coors_x, (int32_t *)coors_z,
             deal_num);
  if (*num == 0) {
    *num = (int32_t)__bang_count((float *)coors_x, deal_num);
    if (*num > 0) {
      *first_point =
          (int32_t)__bang_findfirst1((float *)coors_x, deal_num) + deal_idx;
    }
  } else {
    *num += (int32_t)__bang_count((float *)coors_x, deal_num);
  }
#else
  convertInt2Float((float *)dst, (float *)dst_addition, (int32_t *)coors_x,
                   (float *)src_addition, deal_num);
  __bang_write_value((float *)src_addition, deal_num, (float)c_x);
  __bang_eq((float *)coors_x, (float *)dst, (float *)src_addition, deal_num);

  convertInt2Float((float *)dst, (float *)dst_addition, (int32_t *)coors_y,
                   (float *)src_addition, deal_num);
  __bang_write_value((float *)src_addition, deal_num, (float)c_y);
  __bang_eq((float *)coors_y, (float *)dst, (float *)src_addition, deal_num);

  convertInt2Float((float *)dst, (float *)dst_addition, (int32_t *)coors_z,
                   (float *)src_addition, deal_num);
  __bang_write_value((float *)src_addition, deal_num, (float)c_z);
  __bang_eq((float *)coors_z, (float *)dst, (float *)src_addition, deal_num);

  __bang_mul((float *)coors_x, (float *)coors_x, (float *)coors_y, deal_num);
  __bang_mul((float *)coors_x, (float *)coors_x, (float *)coors_z, deal_num);
  if (*num == 0) {
    *num = (int32_t)__bang_count((float *)coors_x, deal_num);
    if (*num > 0) {
      *first_point =
          (int32_t)__bang_findfirst1((float *)coors_x, deal_num) + deal_idx;
    }
  } else {
    *num += (int32_t)__bang_count((float *)coors_x, deal_num);
  }
#endif
}
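
/* [editor sketch] Scalar meaning of the block above: among the deal_num
 * coordinates in this chunk, count how many equal the current voxel
 * (c_x, c_y, c_z) and remember the first match. Host-side reference; the
 * name (countVoxelMatches) is ours. */
static void countVoxelMatches(const int32_t *cx, const int32_t *cy,
                              const int32_t *cz, int32_t n, int32_t c_x,
                              int32_t c_y, int32_t c_z, int32_t *num,
                              int32_t *first_point) {
  for (int32_t i = 0; i < n; ++i) {
    if (cx[i] == c_x && cy[i] == c_y && cz[i] == c_z) {
      if ((*num)++ == 0) *first_point = i;  // index of first duplicate
    }
  }
}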

__mlu_global__ void MLUUnion1KernelDynamicVoxelize(
    const float *points, int32_t *coors, const float voxel_x,
    const float voxel_y, const float voxel_z, const float coors_x_min,
    const float coors_y_min, const float coors_z_min, const float coors_x_max,
    const float coors_y_max, const float coors_z_max, const int32_t grid_x,
    const int32_t grid_y, const int32_t grid_z, const int32_t num_points,
    const int32_t num_features) {
  if (coreId == 0x80) {
    return;
  }

  const int32_t points_rem = num_points % taskDim;
  const int32_t points_per_core =
      taskId < points_rem ? num_points / taskDim + 1 : num_points / taskDim;
  const int32_t points_start = taskId < points_rem
                                   ? taskId * points_per_core
                                   : taskId * points_per_core + points_rem;
  const int32_t split_num = 9;
  const int32_t deal_num =
      PAD_DOWN(MAX_NRAM_SIZE / split_num / sizeof(float), NFU_ALIGN_SIZE);
  const int32_t repeat = points_per_core / deal_num;
  const int32_t rem = points_per_core % deal_num;
  const int32_t rem_align = CEIL_ALIGN(rem, NFU_ALIGN_SIZE);
  const int32_t ping_pong_gap = 3 * deal_num * sizeof(float);
  char *points_x = nram_buffer;
  char *points_y = points_x + deal_num * sizeof(float);
  char *points_z = points_y + deal_num * sizeof(float);
  char *auxiliary_a = points_x + 2 * ping_pong_gap;
  char *auxiliary_b = auxiliary_a + deal_num * sizeof(float);
  char *auxiliary_c = auxiliary_b + deal_num * sizeof(float);

  int32_t *coors_z_start = coors + points_start;
  int32_t *coors_y_start = coors + num_points + points_start;
  int32_t *coors_x_start = coors + num_points * 2 + points_start;

  if (repeat > 0) {
    __memcpy_async(points_x, points + points_start * num_features,
                   sizeof(float), GDRAM2NRAM, sizeof(float),
                   num_features * sizeof(float), deal_num - 1);
    __memcpy_async(points_y, points + points_start * num_features + 1,
                   sizeof(float), GDRAM2NRAM, sizeof(float),
                   num_features * sizeof(float), deal_num - 1);
    __memcpy_async(points_z, points + points_start * num_features + 2,
                   sizeof(float), GDRAM2NRAM, sizeof(float),
                   num_features * sizeof(float), deal_num - 1);
    __asm__ volatile("sync;");
  }
  if (repeat > 1) {
    __memcpy_async(points_x + ping_pong_gap,
                   points + (points_start + deal_num) * num_features,
                   sizeof(float), GDRAM2NRAM, sizeof(float),
                   num_features * sizeof(float), deal_num - 1);
    __memcpy_async(points_y + ping_pong_gap,
                   points + (points_start + deal_num) * num_features + 1,
                   sizeof(float), GDRAM2NRAM, sizeof(float),
                   num_features * sizeof(float), deal_num - 1);
    __memcpy_async(points_z + ping_pong_gap,
                   points + (points_start + deal_num) * num_features + 2,
                   sizeof(float), GDRAM2NRAM, sizeof(float),
                   num_features * sizeof(float), deal_num - 1);
    computeDynamicVoxelize(points_x, points_y, points_z, auxiliary_a,
                           auxiliary_b, auxiliary_c, coors_x_min, coors_y_min,
                           coors_z_min, voxel_x, voxel_y, voxel_z, grid_x,
                           grid_y, grid_z, deal_num);
    __asm__ volatile("sync;");
  }

  for (int32_t i = 0; i < repeat - 2; ++i) {
    __memcpy_async(coors_x_start + i * deal_num,
                   points_x + (i % 2) * ping_pong_gap,
                   deal_num * sizeof(int32_t), NRAM2GDRAM);
    __memcpy_async(coors_y_start + i * deal_num,
                   points_y + (i % 2) * ping_pong_gap,
                   deal_num * sizeof(int32_t), NRAM2GDRAM);
    __memcpy_async(coors_z_start + i * deal_num,
                   points_z + (i % 2) * ping_pong_gap,
                   deal_num * sizeof(int32_t), NRAM2GDRAM);
    __memcpy_async(points_x + (i % 2) * ping_pong_gap,
                   points + (points_start + (i + 2) * deal_num) * num_features,
                   sizeof(float), GDRAM2NRAM, sizeof(float),
                   num_features * sizeof(float), deal_num - 1);
    __memcpy_async(
        points_y + (i % 2) * ping_pong_gap,
        points + (points_start + (i + 2) * deal_num) * num_features + 1,
        sizeof(float), GDRAM2NRAM, sizeof(float), num_features * sizeof(float),
        deal_num - 1);
    __memcpy_async(
        points_z + (i % 2) * ping_pong_gap,
        points + (points_start + (i + 2) * deal_num) * num_features + 2,
        sizeof(float), GDRAM2NRAM, sizeof(float), num_features * sizeof(float),
        deal_num - 1);
    computeDynamicVoxelize(points_x + ((i + 1) % 2) * ping_pong_gap,
                           points_y + ((i + 1) % 2) * ping_pong_gap,
                           points_z + ((i + 1) % 2) * ping_pong_gap,
                           auxiliary_a, auxiliary_b, auxiliary_c, coors_x_min,
                           coors_y_min, coors_z_min, voxel_x, voxel_y, voxel_z,
                           grid_x, grid_y, grid_z, deal_num);
    __asm__ volatile("sync;");
  }

  if (repeat >= 2) {
    __memcpy_async(coors_x_start + (repeat - 2) * deal_num,
                   points_x + (repeat % 2) * ping_pong_gap,
                   deal_num * sizeof(int32_t), NRAM2GDRAM);
    __memcpy_async(coors_y_start + (repeat - 2) * deal_num,
                   points_y + (repeat % 2) * ping_pong_gap,
                   deal_num * sizeof(int32_t), NRAM2GDRAM);
    __memcpy_async(coors_z_start + (repeat - 2) * deal_num,
                   points_z + (repeat % 2) * ping_pong_gap,
                   deal_num * sizeof(int32_t), NRAM2GDRAM);
  }
  if (rem > 0) {
    __memcpy_async(points_x + (repeat % 2) * ping_pong_gap,
                   points + (points_start + repeat * deal_num) * num_features,
                   sizeof(float), GDRAM2NRAM, sizeof(float),
                   num_features * sizeof(float), rem - 1);
    __memcpy_async(
        points_y + (repeat % 2) * ping_pong_gap,
        points + (points_start + repeat * deal_num) * num_features + 1,
        sizeof(float), GDRAM2NRAM, sizeof(float), num_features * sizeof(float),
        rem - 1);
    __memcpy_async(
        points_z + (repeat % 2) * ping_pong_gap,
        points + (points_start + repeat * deal_num) * num_features + 2,
        sizeof(float), GDRAM2NRAM, sizeof(float), num_features * sizeof(float),
        rem - 1);
  }
  if (repeat > 0) {
    computeDynamicVoxelize(points_x + ((repeat - 1) % 2) * ping_pong_gap,
                           points_y + ((repeat - 1) % 2) * ping_pong_gap,
                           points_z + ((repeat - 1) % 2) * ping_pong_gap,
                           auxiliary_a, auxiliary_b, auxiliary_c, coors_x_min,
                           coors_y_min, coors_z_min, voxel_x, voxel_y, voxel_z,
                           grid_x, grid_y, grid_z, deal_num);
  }
  __asm__ volatile("sync;");

  if (repeat > 0) {
    __memcpy_async(coors_x_start + (repeat - 1) * deal_num,
                   points_x + ((repeat - 1) % 2) * ping_pong_gap,
                   deal_num * sizeof(int32_t), NRAM2GDRAM);
    __memcpy_async(coors_y_start + (repeat - 1) * deal_num,
                   points_y + ((repeat - 1) % 2) * ping_pong_gap,
                   deal_num * sizeof(int32_t), NRAM2GDRAM);
    __memcpy_async(coors_z_start + (repeat - 1) * deal_num,
                   points_z + ((repeat - 1) % 2) * ping_pong_gap,
                   deal_num * sizeof(int32_t), NRAM2GDRAM);
  }
  if (rem > 0) {
    computeDynamicVoxelize(points_x + (repeat % 2) * ping_pong_gap,
                           points_y + (repeat % 2) * ping_pong_gap,
                           points_z + (repeat % 2) * ping_pong_gap, auxiliary_a,
                           auxiliary_b, auxiliary_c, coors_x_min, coors_y_min,
                           coors_z_min, voxel_x, voxel_y, voxel_z, grid_x,
                           grid_y, grid_z, rem_align);
    __asm__ volatile("sync;");
    __memcpy_async(coors_x_start + repeat * deal_num,
                   points_x + (repeat % 2) * ping_pong_gap,
                   rem * sizeof(int32_t), NRAM2GDRAM);
    __memcpy_async(coors_y_start + repeat * deal_num,
                   points_y + (repeat % 2) * ping_pong_gap,
                   rem * sizeof(int32_t), NRAM2GDRAM);
    __memcpy_async(coors_z_start + repeat * deal_num,
                   points_z + (repeat % 2) * ping_pong_gap,
                   rem * sizeof(int32_t), NRAM2GDRAM);
  }
}
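
/* [editor note] The body above is a three-stage software pipeline over two
 * NRAM buffers: while chunk i is computed in one buffer, chunk i+1 streams in
 * and chunk i-1 streams out. Control flow with MLU details stripped
 * (remainder handling omitted; load/compute/store/sync stand in for the
 * __memcpy_async, computeDynamicVoxelize and "sync;" calls above):
 *
 *   if (repeat > 0) { load(0); sync(); }
 *   if (repeat > 1) { load(1); compute(0); sync(); }
 *   for (int i = 0; i < repeat - 2; ++i) {
 *     store(i);        // results sit in buffer i % 2
 *     load(i + 2);     // refill buffer i % 2
 *     compute(i + 1);  // buffer (i + 1) % 2
 *     sync();          // make all three stages of this step visible
 *   }
 *   if (repeat >= 2) store(repeat - 2);
 *   if (repeat > 0) { compute(repeat - 1); sync(); store(repeat - 1); }
 */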

__mlu_global__ void MLUUnion1KernelPoint2Voxel(int32_t *coors,
                                               int32_t *point_to_pointidx,
                                               int32_t *point_to_voxelidx,
                                               const int32_t num_points,
                                               const int32_t max_points) {
#if __BANG_ARCH__ >= 322
  const int32_t split_num = 6;
#else
  const int32_t split_num = 9;  // one temp space for computePoint2Voxel in
                                // mlu2xx
#endif
  const int32_t deal_num =
      PAD_DOWN(MAX_NRAM_SIZE / split_num / sizeof(int32_t), NFU_ALIGN_SIZE);
  char *coors_x = nram_buffer;
  char *coors_y = coors_x + deal_num * sizeof(int32_t);
  char *coors_z = coors_y + deal_num * sizeof(int32_t);
  const int32_t ping_pong_gap = 3 * deal_num * sizeof(int32_t);
#if __BANG_ARCH__ >= 322
  char *src_addition = nullptr;
  char *dst_addition = nullptr;
  char *dst = nullptr;
#else
  char *src_addition = coors_x + 2 * ping_pong_gap;
  char *dst_addition = src_addition + deal_num * sizeof(int32_t);
  char *dst = dst_addition + deal_num * sizeof(int32_t);
#endif

  int32_t *coors_z_start = coors;
  int32_t *coors_y_start = coors + num_points;
  int32_t *coors_x_start = coors + num_points * 2;

  for (int32_t point_idx = taskId; point_idx < num_points;
       point_idx += taskDim) {
    if (coors_x_start[point_idx] == -1) {
      point_to_pointidx[point_idx] = -1;
      point_to_voxelidx[point_idx] = -1;
      continue;
    }

    int32_t c_x = coors_x_start[point_idx];
    int32_t c_y = coors_y_start[point_idx];
    int32_t c_z = coors_z_start[point_idx];

    int32_t deal_total_num = point_idx;
    const int32_t repeat = deal_total_num / deal_num;
    const int32_t rem = deal_total_num % deal_num;
    int32_t rem_align = CEIL_ALIGN(rem, NFU_ALIGN_SIZE);

#if __BANG_ARCH__ >= 322
    rem_align = rem;
#endif
    int32_t num = 0;
    int32_t first_point = -1;
    if (repeat > 0) {
      __memcpy_async(coors_x, coors_x_start, deal_num * sizeof(int32_t),
                     GDRAM2NRAM);
      __memcpy_async(coors_y, coors_y_start, deal_num * sizeof(int32_t),
                     GDRAM2NRAM);
      __memcpy_async(coors_z, coors_z_start, deal_num * sizeof(int32_t),
                     GDRAM2NRAM);
      __asm__ volatile("sync;");
    }

    for (int32_t i = 0; i < repeat - 1; ++i) {
      __memcpy_async(coors_x + ((i + 1) % 2) * ping_pong_gap,
                     coors_x_start + (i + 1) * deal_num,
                     deal_num * sizeof(int32_t), GDRAM2NRAM);
      __memcpy_async(coors_y + ((i + 1) % 2) * ping_pong_gap,
                     coors_y_start + (i + 1) * deal_num,
                     deal_num * sizeof(int32_t), GDRAM2NRAM);
      __memcpy_async(coors_z + ((i + 1) % 2) * ping_pong_gap,
                     coors_z_start + (i + 1) * deal_num,
                     deal_num * sizeof(int32_t), GDRAM2NRAM);
      computePoint2Voxel(coors_x + (i % 2) * ping_pong_gap,
                         coors_y + (i % 2) * ping_pong_gap,
                         coors_z + (i % 2) * ping_pong_gap, src_addition,
                         dst_addition, dst, c_x, c_y, c_z, max_points, &num,
                         &first_point, i * deal_num, deal_num);
      __asm__ volatile("sync;");
    }

    if (rem > 0) {
      __bang_write_value((int32_t *)(coors_x + (repeat % 2) * ping_pong_gap),
                         rem_align, -1);
      __bang_write_value((int32_t *)(coors_y + (repeat % 2) * ping_pong_gap),
                         rem_align, -1);
      __bang_write_value((int32_t *)(coors_z + (repeat % 2) * ping_pong_gap),
                         rem_align, -1);

      __memcpy_async(coors_x + (repeat % 2) * ping_pong_gap,
                     coors_x_start + repeat * deal_num, rem * sizeof(int32_t),
                     GDRAM2NRAM);
      __memcpy_async(coors_y + (repeat % 2) * ping_pong_gap,
                     coors_y_start + repeat * deal_num, rem * sizeof(int32_t),
                     GDRAM2NRAM);
      __memcpy_async(coors_z + (repeat % 2) * ping_pong_gap,
                     coors_z_start + repeat * deal_num, rem * sizeof(int32_t),
                     GDRAM2NRAM);
    }
    if (repeat > 0) {
      computePoint2Voxel(coors_x + ((repeat - 1) % 2) * ping_pong_gap,
                         coors_y + ((repeat - 1) % 2) * ping_pong_gap,
                         coors_z + ((repeat - 1) % 2) * ping_pong_gap,
                         src_addition, dst_addition, dst, c_x, c_y, c_z,
                         max_points, &num, &first_point,
                         (repeat - 1) * deal_num, deal_num);
    }
    __asm__ volatile("sync;");

    if (rem > 0) {
      computePoint2Voxel(coors_x + (repeat % 2) * ping_pong_gap,
                         coors_y + (repeat % 2) * ping_pong_gap,
                         coors_z + (repeat % 2) * ping_pong_gap, src_addition,
                         dst_addition, dst, c_x, c_y, c_z, max_points, &num,
                         &first_point, repeat * deal_num, rem_align);
      __asm__ volatile("sync;");
    }

    if (num == 0) {
      point_to_pointidx[point_idx] = point_idx;
    } else if (num > 0) {
      point_to_pointidx[point_idx] = first_point;
    }

    if (num < max_points) {
      point_to_voxelidx[point_idx] = num;
    } else {
      point_to_voxelidx[point_idx] = -1;
    }
  }
}

__mlu_global__ void MLUUnion1KernelCalcPointsPerVoxel(
    int32_t *point_to_pointidx, int32_t *point_to_voxelidx,
    int32_t *coor_to_voxelidx, int32_t *num_points_per_voxel,
    int32_t *voxel_num, const int32_t max_voxels, const int32_t num_points) {
  if (coreId == 0) {
    int32_t voxel_num_temp = 0;
    for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) {
      int32_t point_pos_in_voxel = point_to_voxelidx[point_idx];
      coor_to_voxelidx[point_idx] = -1;
      if (point_pos_in_voxel == -1) {
        continue;
      } else if (point_pos_in_voxel == 0) {
        int32_t voxel_idx = voxel_num_temp;
        if (voxel_num_temp >= max_voxels) {
          continue;
        }
        voxel_num_temp += 1;
        coor_to_voxelidx[point_idx] = voxel_idx;
        num_points_per_voxel[voxel_idx] = 1;
      } else {
        int32_t point_idx_temp = point_to_pointidx[point_idx];
        int32_t voxel_idx = coor_to_voxelidx[point_idx_temp];
        if (voxel_idx != -1) {
          coor_to_voxelidx[point_idx] = voxel_idx;
          num_points_per_voxel[voxel_idx] += 1;
        }
      }
    }
    *voxel_num = voxel_num_temp;
  }
}

__mlu_global__ void MLUUnion1KernelAssignVoxelsCoors(
    const float *points, int32_t *temp_coors, int32_t *point_to_voxelidx,
    int32_t *coor_to_voxelidx, float *voxels, int32_t *coors,
    const int32_t max_points, const int32_t num_points,
    const int32_t num_features) {
  if (coreId == 0x80) {
    return;
  }

  int32_t points_per_core = num_points / taskDim;
  int32_t points_rem = num_points % taskDim;
  int32_t points_start = taskId < points_rem
                             ? taskId * (points_per_core + 1)
                             : taskId * points_per_core + points_rem;
  int32_t points_end = taskId < points_rem ? points_start + points_per_core + 1
                                           : points_start + points_per_core;

  for (int32_t point_idx = points_start; point_idx < points_end; ++point_idx) {
    int32_t num = point_to_voxelidx[point_idx];
    int32_t voxel_idx = coor_to_voxelidx[point_idx];
    if (num > -1 && voxel_idx > -1) {
      float *voxels_offset =
          voxels + voxel_idx * max_points * num_features + num * num_features;
      const float *points_offset = points + point_idx * num_features;
      __memcpy_async(voxels_offset, points_offset, num_features * sizeof(float),
                     GDRAM2GDRAM);

      if (num == 0) {
        int32_t *coors_offset = coors + voxel_idx * 3;
        __memcpy_async(coors_offset, temp_coors + point_idx, sizeof(int32_t),
                       GDRAM2GDRAM, sizeof(int32_t),
                       num_points * sizeof(int32_t), 2);
      }
    }
  }
  __asm__ volatile("sync;");
}

void KernelDynamicVoxelize(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                           cnrtQueue_t queue, const void *points, void *coors,
                           const float voxel_x, const float voxel_y,
                           const float voxel_z, const float coors_x_min,
                           const float coors_y_min, const float coors_z_min,
                           const float coors_x_max, const float coors_y_max,
                           const float coors_z_max, const int32_t grid_x,
                           const int32_t grid_y, const int32_t grid_z,
                           const int32_t num_points,
                           const int32_t num_features) {
  MLUUnion1KernelDynamicVoxelize<<<k_dim, k_type, queue>>>(
      (float *)points, (int32_t *)coors, voxel_x, voxel_y, voxel_z, coors_x_min,
      coors_y_min, coors_z_min, coors_x_max, coors_y_max, coors_z_max, grid_x,
      grid_y, grid_z, num_points, num_features);
}

void KernelPoint2Voxel(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                       cnrtQueue_t queue, void *coors, void *point_to_pointidx,
                       void *point_to_voxelidx, const int32_t num_points,
                       const int32_t max_points) {
  MLUUnion1KernelPoint2Voxel<<<k_dim, k_type, queue>>>(
      (int32_t *)coors, (int32_t *)point_to_pointidx,
      (int32_t *)point_to_voxelidx, num_points, max_points);
}

void KernelCalcPointsPerVoxel(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                              cnrtQueue_t queue, void *point_to_pointidx,
                              void *point_to_voxelidx, void *coor_to_voxelidx,
                              void *num_points_per_voxel, void *voxel_num,
                              const int32_t max_voxels,
                              const int32_t num_points) {
  MLUUnion1KernelCalcPointsPerVoxel<<<k_dim, k_type, queue>>>(
      (int32_t *)point_to_pointidx, (int32_t *)point_to_voxelidx,
      (int32_t *)coor_to_voxelidx, (int32_t *)num_points_per_voxel,
      (int32_t *)voxel_num, max_voxels, num_points);
}

void KernelAssignVoxelsCoors(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                             cnrtQueue_t queue, const void *points,
                             void *temp_coors, void *point_to_voxelidx,
                             void *coor_to_voxelidx, void *voxels, void *coors,
                             const int32_t max_points, const int32_t num_points,
                             const int32_t num_features) {
  MLUUnion1KernelAssignVoxelsCoors<<<k_dim, k_type, queue>>>(
      (float *)points, (int32_t *)temp_coors, (int32_t *)point_to_voxelidx,
      (int32_t *)coor_to_voxelidx, (float *)voxels, (int32_t *)coors,
      max_points, num_points, num_features);
}
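
/* [editor note] The four launchers form one hard-voxelization pass and must
 * be enqueued in this order on the same queue: KernelDynamicVoxelize (points
 * -> per-point grid coordinates), KernelPoint2Voxel (per-point duplicate
 * count and first-occurrence index), KernelCalcPointsPerVoxel (serial voxel
 * numbering on a single core), then KernelAssignVoxelsCoors (gather points
 * into the voxel tensor). In-order queue semantics make each kernel's GDRAM
 * output visible to the next, so the host only needs to synchronize the
 * queue once, after the final launch. */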
@ -10,114 +10,30 @@
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/

#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
                 const cnrtDataType_t data_type_input, const void *boxes_dram,
                 const int input_box_num, const float iou_threshold,
                 void *workspace, void *output_size, void *output);

int selectType(uint32_t use_job, int box_num_per_core) {
  // the box_num_per_core should be at least 256, otherwise the real IO
  // bandwidth would be very low
  while (box_num_per_core < 256 && use_job >= 4) {
    box_num_per_core *= 2;
    use_job /= 2;
  }
  return use_job;
}
static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
                               int &core_num_per_class,
                               const int input_box_num) {
  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  uint32_t job_limit = getJobLimitCapability();
  uint32_t core_number = job_limit;

  int box_num_per_core = (input_box_num + core_number - 1) / core_number;
  int use_job = selectType(job_limit, box_num_per_core);
  // initiate k_type as Union1
  k_dim->x = core_dim;
  k_dim->y = 1;
  k_dim->z = 1;
  *k_type = CNRT_FUNC_TYPE_UNION1;
  switch (job_limit) {
    case CN_KERNEL_CLASS_BLOCK:
    case CN_KERNEL_CLASS_UNION:
    case CN_KERNEL_CLASS_UNION2:
    case CN_KERNEL_CLASS_UNION4:
    case CN_KERNEL_CLASS_UNION8:
    case CN_KERNEL_CLASS_UNION16: {
      if (use_job < 4) {
        k_dim->x = 1;
        *k_type = CNRT_FUNC_TYPE_BLOCK;
      } else if (use_job == 4) {
        k_dim->x = core_dim;
        *k_type = CNRT_FUNC_TYPE_UNION1;
      } else {
        k_dim->x = use_job;
        *k_type = (cnrtFunctionType_t)use_job;
      }
    }; break;
    default:
      LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number."
                   << " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task.";
  }
  return CNNL_STATUS_SUCCESS;
}
#include "mlu_common_helper.h"

void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
                                 float iou_threshold) {
  // dimension parameters check
  TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
              boxes.dim(), "D");
  TORCH_CHECK(boxes.size(1) == 7,
              "boxes should have 7 elements in dimension 1, got ",
              boxes.size(1));

  // data type check
  TORCH_CHECK(
      boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
      "data type of boxes should be Float or Half, got ", boxes.scalar_type());

  if (boxes.numel() == 0) {
    return;
  }
  const size_t max_input_num = 2147483648;  // 2^31, 2G num
  TORCH_CHECK(boxes.numel() < max_input_num,
              "boxes.numel() should be less than 2147483648, got ",
              boxes.numel());

  int input_box_num = boxes.size(0);

  cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
  cnrtDim3_t k_dim;
  cnrtJobType_t k_type;

  int core_num_per_class;
  policyFunc(&k_dim, &k_type, core_num_per_class, input_box_num);

  // transpose boxes (n, 7) to (7, n) for better performance
  auto boxes_t = boxes.transpose(0, 1);
  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);

  auto output = at::empty({input_box_num}, boxes.options().dtype(at::kLong));
  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
  auto output = keep.to(boxes.options().dtype(at::kInt));
  auto output_size = at::empty({1}, boxes.options().dtype(at::kInt));

  // workspace
  const int info_num = 7;  // x, y, z, dx, dy, dz, angle
  size_t space_size = 0;
  if (boxes.scalar_type() == at::kHalf) {
    space_size = input_box_num * sizeof(int16_t) * info_num +
                 input_box_num * sizeof(float) + sizeof(float);
  } else {
    space_size = input_box_num * sizeof(float) * (info_num + 1) + sizeof(float);
  }
  MluOpTensorDescriptor boxes_desc, output_desc;
  boxes_desc.set(boxes_);
  output_desc.set(output);

  auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
  // workspace
  size_t workspace_size = 0;
  auto handle = mluOpGetCurrentHandle();
  mluOpGetNmsWorkspaceSize(handle, boxes_desc.desc(), NULL, &workspace_size);
  auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
  auto boxes_ptr = boxes_impl->cnnlMalloc();
  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
@ -127,11 +43,29 @@ void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
  auto output_size_impl = torch_mlu::getMluTensorImpl(keep_num);
  auto output_size_ptr = output_size_impl->cnnlMalloc();

  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  CNLOG(INFO) << "Launch Kernel KernelIou3d<<<Union" << k_type / core_dim
              << ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
  KernelIou3d(k_dim, k_type, queue, data_type_input, boxes_ptr, input_box_num,
              iou_threshold, workspace_ptr, output_size_ptr, output_ptr);
  // nms desc
  mluOpNmsDescriptor_t nms_desc;
  const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;
  const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;
  const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;
  const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;
  const float soft_nms_sigma = 0.0;
  const float confidence_threshold = 0.0;
  const int input_layout = 0;
  const bool pad_to_max_output_size = false;
  const int max_output_size = input_box_num;
  const float offset = 0.0;

  mluOpCreateNmsDescriptor(&nms_desc);
  mluOpSetNmsDescriptor(nms_desc, box_mode, output_mode, algo, method_mode,
                        iou_threshold, soft_nms_sigma, max_output_size,
                        confidence_threshold, offset, input_layout,
                        pad_to_max_output_size);

  mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr, NULL, NULL,
           workspace_ptr, workspace_size, output_desc.desc(), output_ptr,
           output_size_ptr);
  mluOpDestroyNmsDescriptor(nms_desc);
}

void iou3d_nms3d_forward_mlu(const Tensor boxes, Tensor &keep, Tensor &keep_num,
@ -18,8 +18,8 @@
#include "pytorch_device_registry.hpp"

#define MLUOP_MAJOR 0
#define MLUOP_MINOR 5
#define MLUOP_PATCHLEVEL 302
#define MLUOP_MINOR 6
#define MLUOP_PATCHLEVEL 0

mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);
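
/* [editor note] This hunk bumps the pinned mlu-ops version from 0.5.302 to
 * 0.6.0. A typical compile-time guard built from these macros would look like
 * the sketch below; the MLUOP_VERSION composition is our illustration, not
 * necessarily the exact helper mmcv uses:
 *
 *   #define MLUOP_VERSION \
 *       (MLUOP_MAJOR * 10000 + MLUOP_MINOR * 100 + MLUOP_PATCHLEVEL)
 *   #if MLUOP_VERSION < 600
 *   #error "this build of mmcv requires mlu-ops >= 0.6.0"
 *   #endif
 */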
@ -9,122 +9,95 @@
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "mlu_common_helper.h"
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

#define MIN(a, b) (((a) < (b)) ? (a) : (b))
/*************************************************************************
 * This MACRO wraps the boilerplate that turns a plain tensor into an
 * MLU tensor: the _contiguous, _desc, _impl and _ptr variables are
 * generated automatically.
 *************************************************************************/
#define INITIAL_MLU_PARAM_WITH_TENSOR(NAME)                         \
  auto NAME##_contigous = torch_mlu::cnnl::ops::cnnl_contiguous(    \
      NAME, NAME.suggest_memory_format());                          \
  MluOpTensorDescriptor NAME##_desc;                                \
  NAME##_desc.set(NAME##_contigous);                                \
  auto NAME##_impl = torch_mlu::getMluTensorImpl(NAME##_contigous); \
  auto NAME##_ptr = NAME##_impl->cnnlMalloc();
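
/* [editor note] Illustrative expansion: INITIAL_MLU_PARAM_WITH_TENSOR(value)
 * produces the declarations below, which is why the launchers can refer to
 * value_desc and value_ptr without declaring them explicitly:
 *
 *   auto value_contigous = torch_mlu::cnnl::ops::cnnl_contiguous(
 *       value, value.suggest_memory_format());
 *   MluOpTensorDescriptor value_desc;
 *   value_desc.set(value_contigous);
 *   auto value_impl = torch_mlu::getMluTensorImpl(value_contigous);
 *   auto value_ptr = value_impl->cnnlMalloc();
 */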

typedef enum {
  MS_DEFORM_ATTN_FORWARD_INVALID = 0, /*!< Index is invalid. */
  MS_DEFORM_ATTN_FORWARD_DEFAULT =
      1, /*!< MLUKernelMsDeformAttnForwardDefault */
  MS_DEFORM_ATTN_FORWARD_SMALL_CHANNEL =
      2, /*!< MLUKernelMsDeformAttnForwardSmallChannel */
} MsDeformAttnForwardPolicy;
Tensor MsDeformAttnForwardLauncher(const Tensor& value,
                                   const Tensor& spatial_shapes,
                                   const Tensor& level_start_index,
                                   const Tensor& sampling_loc,
                                   const Tensor& attn_weight,
                                   const int im2col_step) {
  auto handle = mluOpGetCurrentHandle();
  const int batch_size = value.size(0);
  const int num_heads = value.size(2);
  const int channels = value.size(3);
  const int num_queries = sampling_loc.size(1);
  auto output = at::zeros({batch_size, num_queries, num_heads, channels},
                          value.options());
  auto spatial_shapes_int = spatial_shapes.to(at::kInt);
  auto level_start_index_int = level_start_index.to(at::kInt);
  INITIAL_MLU_PARAM_WITH_TENSOR(output);
  INITIAL_MLU_PARAM_WITH_TENSOR(value);
  INITIAL_MLU_PARAM_WITH_TENSOR(spatial_shapes_int);
  INITIAL_MLU_PARAM_WITH_TENSOR(level_start_index_int);
  INITIAL_MLU_PARAM_WITH_TENSOR(sampling_loc);
  INITIAL_MLU_PARAM_WITH_TENSOR(attn_weight);

void KernelMsDeformAttnForwardDefault(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const cnrtDataType_t d_type, const char* data_value_gdram,
    const char* data_spatial_shapes_gdram,
    const char* data_level_start_index_gdram,
    const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram,
    const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
    const int32_t channels, const int32_t num_levels, const int32_t num_queries,
    const int32_t num_points, char* data_col_gdram);
void KernelMsDeformAttnForwardSmallChannel(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const cnrtDataType_t d_type, const char* data_value_gdram,
    const char* data_spatial_shapes_gdram,
    const char* data_level_start_index_gdram,
    const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram,
    const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
    const int32_t channels, const int32_t num_levels, const int32_t num_queries,
    const int32_t num_points, char* data_col_gdram);
  mluOpMsDeformAttnForward(
      handle, value_desc.desc(), value_ptr, spatial_shapes_int_desc.desc(),
      spatial_shapes_int_ptr, level_start_index_int_desc.desc(),
      level_start_index_int_ptr, sampling_loc_desc.desc(), sampling_loc_ptr,
      attn_weight_desc.desc(), attn_weight_ptr, im2col_step, output_desc.desc(),
      output_ptr);

typedef enum {
  MS_DEFORM_ATTN_BACKWARD_DEFAULT = 0,
  MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL = 1,
} MsDeformAttnBackwardKernelPolicy;

MsDeformAttnBackwardKernelPolicy msDeformAttnBackwardPolicyFunc(
    const int32_t channels, const int32_t num_levels, const int32_t num_points,
    const int32_t num_heads) {
  const int32_t nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
  const int num_hlp = num_heads * num_levels * num_points;
  int num_per_time_theory = (nram_size - num_levels * sizeof(float) -
                             3 * num_levels * sizeof(int32_t)) /
                            sizeof(float) / (8 * PAD_UP(channels, 32) + 28) /
                            PAD_UP((num_hlp), 32);
  if (num_per_time_theory >= 1) {
    return MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL;
  }
  return MS_DEFORM_ATTN_BACKWARD_DEFAULT;
  output = output.view({batch_size, num_queries, num_heads * channels});
  return output;
}

void KernelMsDeformAttnBackwardDefaultKernel(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const cnrtDataType_t d_type, const float* data_value,
    const int32_t* spatial_shapes, const int32_t* data_level_start_index,
    const float* data_sampling_loc, const float* data_attn_weight,
    const float* grad_output, const int32_t batch_size, const int32_t num_keys,
    const int32_t num_heads, const int32_t channels, const int32_t num_levels,
    const int32_t num_queries, const int32_t num_points, float* grad_value,
    float* grad_sampling_loc, float* grad_attn_weight);
void MsDeformAttnBackwardLauncher(
    const Tensor& value, const Tensor& spatial_shapes,
    const Tensor& level_start_index, const Tensor& sampling_loc,
    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,
    Tensor& grad_sampling_loc, Tensor& grad_attn_weight,
    const int im2col_step) {
  auto handle = mluOpGetCurrentHandle();
  auto spatial_shapes_int = spatial_shapes.to(at::kInt);
  auto level_start_index_int = level_start_index.to(at::kInt);
  const int batch_size = value.size(0);
  const int num_heads = value.size(2);
  const int channels = value.size(3);
  const int num_queries = sampling_loc.size(1);

void KernelMsDeformAttnBackwardSmallChannelsKernel(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const cnrtDataType_t d_type, const float* data_value,
    const int32_t* spatial_shapes, const int32_t* data_level_start_index,
    const float* data_sampling_loc, const float* data_attn_weight,
    const float* grad_output, const int32_t batch, const int32_t spatial_size,
    const int32_t num_heads, const int32_t channels, const int32_t num_levels,
    const int32_t num_query, const int32_t num_points, float* grad_value,
    float* grad_sampling_loc, float* grad_attn_weight);
  auto grad_output_dim4 =
      grad_output.view({batch_size, num_queries, num_heads, channels});
  // auto grad_output_dim4 = grad_output.view({batch_size, num_queries,
  // num_heads, channels}).detach();
  INITIAL_MLU_PARAM_WITH_TENSOR(value);
  INITIAL_MLU_PARAM_WITH_TENSOR(spatial_shapes_int);
  INITIAL_MLU_PARAM_WITH_TENSOR(level_start_index_int);
  INITIAL_MLU_PARAM_WITH_TENSOR(sampling_loc);
  INITIAL_MLU_PARAM_WITH_TENSOR(attn_weight);
  INITIAL_MLU_PARAM_WITH_TENSOR(grad_output_dim4);
  // INITIAL_MLU_PARAM_WITH_TENSOR(grad_output);
  INITIAL_MLU_PARAM_WITH_TENSOR(grad_value);
  INITIAL_MLU_PARAM_WITH_TENSOR(grad_sampling_loc);
  INITIAL_MLU_PARAM_WITH_TENSOR(grad_attn_weight);

// policy function
MsDeformAttnForwardPolicy msDeformAttnForwardPolicyFunc(
    cnrtDim3_t* k_dim, cnrtFunctionType_t* k_type, const int32_t batch_size,
    const int32_t num_keys, const int32_t num_heads, const int32_t channels,
    const int32_t num_levels, const int32_t num_queries,
    const int32_t num_points) {
  k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  k_dim->y =
      MIN((batch_size * num_queries * num_heads + k_dim->x - 1) / k_dim->x,
          torch_mlu::getDeviceAttr(cnrtAttrClusterCount));
  k_dim->z = 1;
#if __BANG_ARCH__ == 520
  *k_type = CNRT_FUNC_TYPE_BLOCK;
#else
  *k_type = CNRT_FUNC_TYPE_UNION1;
#endif
  mluOpMsDeformAttnBackward(
      handle, value_desc.desc(), value_ptr, spatial_shapes_int_desc.desc(),
      spatial_shapes_int_ptr, level_start_index_int_desc.desc(),
      level_start_index_int_ptr, sampling_loc_desc.desc(), sampling_loc_ptr,
      attn_weight_desc.desc(), attn_weight_ptr, grad_output_dim4_desc.desc(),
      grad_output_dim4_ptr, im2col_step, grad_value_desc.desc(), grad_value_ptr,
      grad_sampling_loc_desc.desc(), grad_sampling_loc_ptr,
      grad_attn_weight_desc.desc(), grad_attn_weight_ptr);

  int32_t nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
  if (num_levels * num_points * 3 * sizeof(int32_t) > nram_size) {
    return MS_DEFORM_ATTN_FORWARD_DEFAULT;
  } else if (channels > nram_size / 12 / sizeof(float) || channels > 96 ||
             channels < 16) {
    return MS_DEFORM_ATTN_FORWARD_DEFAULT;
  } else {
    return MS_DEFORM_ATTN_FORWARD_SMALL_CHANNEL;
  }
}

// policy function for backward
static void policyFuncBackward(const int32_t batch_size,
                               const int32_t num_queries,
                               const int32_t num_heads,
                               const int32_t num_levels,
                               cnrtFunctionType_t* k_type, cnrtDim3_t* k_dim) {
  size_t cluster_limit = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  size_t core_limit = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  k_dim->x = core_limit;
  int32_t total_num = batch_size * num_queries * num_heads * num_levels;
  size_t total_num_align = CEIL_ALIGN(total_num, core_limit);
  k_dim->y = (total_num_align / core_limit) > cluster_limit
                 ? cluster_limit
                 : (total_num_align / core_limit);
  k_dim->z = 1;
  *k_type = CNRT_FUNC_TYPE_UNION1;
  return;
}
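
/* [editor example] Worked instance of the dimensioning above, assuming a
 * device with 4 cores per cluster and 8 clusters: batch_size = 2,
 * num_queries = 100, num_heads = 8, num_levels = 4 gives total_num = 6400;
 * CEIL_ALIGN(6400, 4) = 6400, so k_dim->y = min(6400 / 4, 8) = 8 and the
 * kernel launches 8 Union1 tasks of 4 cores each. */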

Tensor ms_deform_attn_mlu_forward(const Tensor& value,
@ -133,188 +106,8 @@ Tensor ms_deform_attn_mlu_forward(const Tensor& value,
|
|||
const Tensor& sampling_loc,
|
||||
const Tensor& attn_weight,
|
||||
const int im2col_step) {
|
||||
// check contiguous
|
||||
AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
|
||||
AT_ASSERTM(spatial_shapes.is_contiguous(),
|
||||
"spatial_shapes tensor has to be contiguous");
|
||||
AT_ASSERTM(level_start_index.is_contiguous(),
|
||||
"level_start_index tensor has to be contiguous");
|
||||
AT_ASSERTM(sampling_loc.is_contiguous(),
|
||||
"sampling_loc tensor has to be contiguous");
|
||||
AT_ASSERTM(attn_weight.is_contiguous(),
|
||||
"attn_weight tensor has to be contiguous");
|
||||
|
||||
// check datatype
|
||||
TORCH_CHECK((value.scalar_type() == at::kFloat),
|
||||
"value type should be Float, got ", value.scalar_type(), ".");
|
||||
TORCH_CHECK((spatial_shapes.scalar_type() == at::kInt ||
|
||||
spatial_shapes.scalar_type() == at::kLong),
|
||||
"spatial_shapes type should be Int, got ",
|
||||
spatial_shapes.scalar_type(), ".");
|
||||
TORCH_CHECK((level_start_index.scalar_type() == at::kInt ||
|
||||
level_start_index.scalar_type() == at::kLong),
|
||||
"level_start_index type should be Int, got ",
|
||||
level_start_index.scalar_type(), ".");
|
||||
TORCH_CHECK((sampling_loc.scalar_type() == at::kFloat),
|
||||
"sampling_loc type should be Float, got ",
|
||||
sampling_loc.scalar_type(), ".");
|
||||
TORCH_CHECK((attn_weight.scalar_type() == at::kFloat),
|
||||
"attn_weight type should be Float, got ",
|
||||
attn_weight.scalar_type(), ".");
|
||||
|
||||
// check shape
|
||||
TORCH_CHECK(value.dim() == 4, "value should be a 4d tensor, got ",
|
||||
value.dim(), "D.");
|
||||
TORCH_CHECK(spatial_shapes.dim() == 2,
|
||||
"spatial_shapes should be a 2d tensor, got ",
|
||||
spatial_shapes.dim(), "D.");
|
||||
TORCH_CHECK(level_start_index.dim() == 1,
|
||||
"level_start_index should be a 1d tensor, got ",
|
||||
level_start_index.dim(), "D.");
|
||||
TORCH_CHECK(sampling_loc.dim() == 6,
|
||||
"sampling_loc should be a 6d tensor, got ", sampling_loc.dim(),
|
||||
"D.");
|
||||
TORCH_CHECK(attn_weight.dim() == 5, "attn_weight should be a 5d tensor, got ",
|
||||
attn_weight.dim(), "D.");
|
||||
|
||||
const int batch_size = value.size(0);
|
||||
const int num_keys = value.size(1);
|
||||
const int num_heads = value.size(2);
|
||||
const int channels = value.size(3);
|
||||
const int num_levels = spatial_shapes.size(0);
|
||||
const int num_queries = sampling_loc.size(1);
|
||||
const int num_points = sampling_loc.size(4);

  TORCH_CHECK(spatial_shapes.size(1) == 2,
              "the 2nd dimension of spatial_shapes should be 2, got ",
              spatial_shapes.size(1), ".");
  TORCH_CHECK(sampling_loc.size(5) == 2,
              "the 6th dimension of sampling_loc should be 2, got ",
              sampling_loc.size(5), ".");
  TORCH_CHECK((sampling_loc.size(0) == batch_size),
              "the 1st dimension of sampling_loc should be batch_size, ",
              "but now the 1st dimension of sampling_loc is ",
              sampling_loc.size(0), ", and batch_size is ", batch_size, ".");
  TORCH_CHECK((attn_weight.size(0) == batch_size),
              "the 1st dimension of attn_weight should be batch_size, ",
              "but now the 1st dimension of attn_weight is ",
              attn_weight.size(0), ", and batch_size is ", batch_size, ".");
  TORCH_CHECK((sampling_loc.size(2) == num_heads),
              "the 3rd dimension of sampling_loc should be num_heads, ",
              "but now the 3rd dimension of sampling_loc is ",
              sampling_loc.size(2), ", and num_heads is ", num_heads, ".");
  TORCH_CHECK((attn_weight.size(2) == num_heads),
              "the 3rd dimension of attn_weight should be num_heads, ",
              "but now the 3rd dimension of attn_weight is ",
              attn_weight.size(2), ", and num_heads is ", num_heads, ".");
  TORCH_CHECK((level_start_index.size(0) == num_levels),
              "the 1st dimension of level_start_index should be num_levels, ",
              "but now the 1st dimension of level_start_index is ",
              level_start_index.size(0), ", and num_levels is ", num_levels,
              ".");
  TORCH_CHECK((sampling_loc.size(3) == num_levels),
              "the 4th dimension of sampling_loc should be num_levels, ",
              "but now the 4th dimension of sampling_loc is ",
              sampling_loc.size(3), ", and num_levels is ", num_levels, ".");
  TORCH_CHECK((attn_weight.size(3) == num_levels),
              "the 4th dimension of attn_weight should be num_levels, ",
              "but now the 4th dimension of attn_weight is ",
              attn_weight.size(3), ", and num_levels is ", num_levels, ".");
  TORCH_CHECK((attn_weight.size(1) == num_queries),
              "the 2nd dimension of attn_weight should be num_queries, ",
              "but now the 2nd dimension of attn_weight is ",
              attn_weight.size(1), ", and num_queries is ", num_queries, ".");
  TORCH_CHECK((attn_weight.size(4) == num_points),
              "the 5th dimension of attn_weight should be num_points, ",
              "but now the 5th dimension of attn_weight is ",
              attn_weight.size(4), ", and num_points is ", num_points, ".");

  auto output = at::zeros({batch_size, num_queries, num_heads, channels},
                          value.options());

  // large tensor check
  const size_t max_input_size = 2147483648;
  TORCH_CHECK(value.numel() < max_input_size,
              "value element num should be less than 2^31, got ", value.numel(),
              ".");
  TORCH_CHECK(sampling_loc.numel() < max_input_size,
              "sampling_loc element num should be less than 2^31, got ",
              sampling_loc.numel(), ".");
  TORCH_CHECK(output.numel() < max_input_size,
              "output element num should be less than 2^31, got ",
              output.numel(), ".");

  // check zero element
  TORCH_CHECK(batch_size != 0, "batch_size should not be zero");
  TORCH_CHECK(num_heads != 0, "num_heads should not be zero");
  TORCH_CHECK(channels != 0, "channels should not be zero");
  TORCH_CHECK(num_queries != 0, "num_queries should not be zero");

  if (num_keys == 0 || num_levels == 0 || num_points == 0) {
    return output;
  }

  // calculate task dimension
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  MsDeformAttnForwardPolicy policy = msDeformAttnForwardPolicyFunc(
      &k_dim, &k_type, batch_size, num_keys, num_heads, channels, num_levels,
      num_queries, num_points);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  auto spatial_shapes_ = spatial_shapes.to(at::kInt);
  auto level_start_index_ = level_start_index.to(at::kInt);

  // get ptr of tensors
  auto value_impl = torch_mlu::getMluTensorImpl(value);
  auto value_ptr = value_impl->cnnlMalloc();
  auto spatial_shapes_impl = torch_mlu::getMluTensorImpl(spatial_shapes_);
  auto spatial_shapes_ptr = spatial_shapes_impl->cnnlMalloc();
  auto level_start_index_impl = torch_mlu::getMluTensorImpl(level_start_index_);
  auto level_start_index_ptr = level_start_index_impl->cnnlMalloc();
  auto sampling_loc_impl = torch_mlu::getMluTensorImpl(sampling_loc);
  auto sampling_loc_ptr = sampling_loc_impl->cnnlMalloc();
  auto attn_weight_impl = torch_mlu::getMluTensorImpl(attn_weight);
  auto attn_weight_ptr = attn_weight_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output);
  auto output_ptr = output_impl->cnnlMalloc();
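
  // The getMluTensorImpl/cnnlMalloc pair above repeats for every tensor; a
  // small hypothetical helper (not part of this file) could collapse each
  // pair, e.g.:
  //   auto mlu_ptr = [](const at::Tensor& t) {
  //     return torch_mlu::getMluTensorImpl(t)->cnnlMalloc();
  //   };
  //   auto value_ptr = mlu_ptr(value);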

  // get compute dtype of input
  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(value.dtype());

  // launch kernel
  switch (policy) {
    default: {
      VLOG(5) << "MsDeformAttnForward Policy not supported";
    }; break;
    case MS_DEFORM_ATTN_FORWARD_DEFAULT: {
      CNLOG(INFO) << "Launch Kernel MLUKernelMsDeformAttnForwardDefault<<<"
                  << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
      KernelMsDeformAttnForwardDefault(
          k_dim, k_type, queue, data_type, (char*)value_ptr,
          (char*)spatial_shapes_ptr, (char*)level_start_index_ptr,
          (char*)sampling_loc_ptr, (char*)attn_weight_ptr, batch_size, num_keys,
          num_heads, channels, num_levels, num_queries, num_points,
          (char*)output_ptr);
      break;
    }
    case MS_DEFORM_ATTN_FORWARD_SMALL_CHANNEL: {
      CNLOG(INFO) << "Launch Kernel MLUKernelMsDeformAttnForwardSmallChannel<<<"
                  << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
      KernelMsDeformAttnForwardSmallChannel(
          k_dim, k_type, queue, data_type, (char*)value_ptr,
          (char*)spatial_shapes_ptr, (char*)level_start_index_ptr,
          (char*)sampling_loc_ptr, (char*)attn_weight_ptr, batch_size, num_keys,
          num_heads, channels, num_levels, num_queries, num_points,
          (char*)output_ptr);
      break;
    }
  }
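
  // Added note: the kernel writes (batch_size, num_queries, num_heads,
  // channels); the view below flattens the last two axes to
  // num_heads * channels for the caller.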

  output = output.view({batch_size, num_queries, num_heads * channels});
  return output;
  return MsDeformAttnForwardLauncher(value, spatial_shapes, level_start_index,
                                     sampling_loc, attn_weight, im2col_step);
}

void ms_deform_attn_mlu_backward(

@@ -323,181 +116,10 @@ void ms_deform_attn_mlu_backward(
    const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value,
    Tensor& grad_sampling_loc, Tensor& grad_attn_weight,
    const int im2col_step) {
  // check contiguous
  AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
  AT_ASSERTM(spatial_shapes.is_contiguous(),
             "spatial_shapes tensor has to be contiguous");
  AT_ASSERTM(level_start_index.is_contiguous(),
             "level_start_index tensor has to be contiguous");
  AT_ASSERTM(sampling_loc.is_contiguous(),
             "sampling_loc tensor has to be contiguous");
  AT_ASSERTM(attn_weight.is_contiguous(),
             "attn_weight tensor has to be contiguous");
  AT_ASSERTM(grad_output.is_contiguous(),
             "grad_output tensor has to be contiguous");

  // check datatype
  TORCH_CHECK((value.scalar_type() == at::kFloat),
              "value type should be Float, got ", value.scalar_type(), ".");
  TORCH_CHECK((spatial_shapes.scalar_type() == at::kInt ||
               spatial_shapes.scalar_type() == at::kLong),
              "spatial_shapes type should be Int or Long, got ",
              spatial_shapes.scalar_type(), ".");
  TORCH_CHECK((level_start_index.scalar_type() == at::kInt ||
               level_start_index.scalar_type() == at::kLong),
              "level_start_index type should be Int or Long, got ",
              level_start_index.scalar_type(), ".");
  TORCH_CHECK((sampling_loc.scalar_type() == at::kFloat),
              "sampling_loc type should be Float, got ",
              sampling_loc.scalar_type(), ".");
  TORCH_CHECK((attn_weight.scalar_type() == at::kFloat),
              "attn_weight type should be Float, got ",
              attn_weight.scalar_type(), ".");
  TORCH_CHECK((grad_output.scalar_type() == at::kFloat),
              "grad_output type should be Float, got ",
              grad_output.scalar_type(), ".");

  const int batch_size = value.size(0);
  const int num_keys = value.size(1);
  const int num_heads = value.size(2);
  const int channels = value.size(3);
  const int num_levels = spatial_shapes.size(0);
  const int num_queries = sampling_loc.size(1);
  const int num_points = sampling_loc.size(4);
  // check shape
  TORCH_CHECK(spatial_shapes.size(1) == 2,
              "the 2nd dimension of spatial_shapes should be 2, got ",
              spatial_shapes.size(1), ".");

  TORCH_CHECK((level_start_index.size(0) == num_levels),
              "the 1st dimension of level_start_index should be num_levels, ",
              "but now the 1st dimension of level_start_index is ",
              level_start_index.size(0), ", and num_levels is ", num_levels,
              ".");

  TORCH_CHECK((sampling_loc.size(0) == batch_size),
              "the 1st dimension of sampling_loc should be batch_size, ",
              "but now the 1st dimension of sampling_loc is ",
              sampling_loc.size(0), ", and batch_size is ", batch_size, ".");
  TORCH_CHECK((sampling_loc.size(2) == num_heads),
              "the 3rd dimension of sampling_loc should be num_heads, ",
              "but now the 3rd dimension of sampling_loc is ",
              sampling_loc.size(2), ", and num_heads is ", num_heads, ".");
  TORCH_CHECK((sampling_loc.size(3) == num_levels),
              "the 4th dimension of sampling_loc should be num_levels, ",
              "but now the 4th dimension of sampling_loc is ",
              sampling_loc.size(3), ", and num_levels is ", num_levels, ".");
  TORCH_CHECK(sampling_loc.size(5) == 2,
              "the 6th dimension of sampling_loc should be 2, got ",
              sampling_loc.size(5), ".");

  TORCH_CHECK((attn_weight.size(0) == batch_size),
              "the 1st dimension of attn_weight should be batch_size, ",
              "but now the 1st dimension of attn_weight is ",
              attn_weight.size(0), ", and batch_size is ", batch_size, ".");
  TORCH_CHECK((attn_weight.size(1) == num_queries),
              "the 2nd dimension of attn_weight should be num_queries, ",
              "but now the 2nd dimension of attn_weight is ",
              attn_weight.size(1), ", and num_queries is ", num_queries, ".");

  TORCH_CHECK((attn_weight.size(2) == num_heads),
              "the 3rd dimension of attn_weight should be num_heads, ",
              "but now the 3rd dimension of attn_weight is ",
              attn_weight.size(2), ", and num_heads is ", num_heads, ".");
  TORCH_CHECK((attn_weight.size(3) == num_levels),
              "the 4th dimension of attn_weight should be num_levels, ",
              "but now the 4th dimension of attn_weight is ",
              attn_weight.size(3), ", and num_levels is ", num_levels, ".");
  TORCH_CHECK((attn_weight.size(4) == num_points),
              "the 5th dimension of attn_weight should be num_points, ",
              "but now the 5th dimension of attn_weight is ",
              attn_weight.size(4), ", and num_points is ", num_points, ".");

  TORCH_CHECK((grad_output.size(0) == batch_size),
              "the 1st dimension of grad_output should be batch_size, ",
              "but now the 1st dimension of grad_output is ",
              grad_output.size(0), ", and batch_size is ", batch_size, ".");
  TORCH_CHECK((grad_output.size(1) == num_queries),
              "the 2nd dimension of grad_output should be num_queries, ",
              "but now the 2nd dimension of grad_output is ",
              grad_output.size(1), ", and num_queries is ", num_queries, ".");
  TORCH_CHECK(
      (grad_output.size(2) == num_heads * channels),
      "the 3rd dimension of grad_output should be num_heads * channels, ",
      "but now the 3rd dimension of grad_output is ", grad_output.size(2),
      ", and num_heads * channels is ", num_heads * channels, ".");

  // check zero element
  TORCH_CHECK(batch_size != 0, "The batch_size is zero.");
  TORCH_CHECK(channels != 0, "The channels is zero.");
  TORCH_CHECK(num_keys != 0, "The num_keys is zero.");
  TORCH_CHECK(num_heads != 0, "The num_heads is zero.");
  TORCH_CHECK(num_queries != 0, "The num_queries is zero.");
  if (num_levels == 0 || num_points == 0) {
    return;
  }

  // calculate task dimension
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFuncBackward(batch_size, num_queries, num_heads, num_levels, &k_type,
                     &k_dim);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get ptr of tensors
  auto value_impl = torch_mlu::getMluTensorImpl(value);
  auto value_ptr = value_impl->cnnlMalloc();
  auto spatial_shapes_impl = torch_mlu::getMluTensorImpl(spatial_shapes);
  auto spatial_shapes_ptr = spatial_shapes_impl->cnnlMalloc();
  auto level_start_index_impl = torch_mlu::getMluTensorImpl(level_start_index);
  auto level_start_index_ptr = level_start_index_impl->cnnlMalloc();
  auto sampling_loc_impl = torch_mlu::getMluTensorImpl(sampling_loc);
  auto sampling_loc_ptr = sampling_loc_impl->cnnlMalloc();
  auto attn_weight_impl = torch_mlu::getMluTensorImpl(attn_weight);
  auto attn_weight_ptr = attn_weight_impl->cnnlMalloc();
  auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output);
  auto grad_output_ptr = grad_output_impl->cnnlMalloc();
  auto grad_value_impl = torch_mlu::getMluTensorImpl(grad_value);
  auto grad_value_ptr = grad_value_impl->cnnlMalloc();
  auto grad_sampling_loc_impl = torch_mlu::getMluTensorImpl(grad_sampling_loc);
  auto grad_sampling_loc_ptr = grad_sampling_loc_impl->cnnlMalloc();
  auto grad_attn_weight_impl = torch_mlu::getMluTensorImpl(grad_attn_weight);
  auto grad_attn_weight_ptr = grad_attn_weight_impl->cnnlMalloc();

  // get compute dtype of input
  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(value.dtype());

  // launch kernel
  CNLOG(INFO) << "Launch Kernel MLUKernelMsDeformAttnBackward<<<" << k_dim.x
              << ", " << k_dim.y << ", " << k_dim.z << ">>>";
  MsDeformAttnBackwardKernelPolicy kernelPolicy =
      msDeformAttnBackwardPolicyFunc(channels, num_levels, num_points,
                                     num_heads);
  switch (kernelPolicy) {
    default: {
      VLOG(5) << "NotImplemented.";
    } break;
    case MS_DEFORM_ATTN_BACKWARD_DEFAULT: {
      KernelMsDeformAttnBackwardDefaultKernel(
          k_dim, k_type, queue, data_type, (float*)value_ptr,
          (int32_t*)spatial_shapes_ptr, (int32_t*)level_start_index_ptr,
          (float*)sampling_loc_ptr, (float*)attn_weight_ptr,
          (float*)grad_output_ptr, batch_size, num_keys, num_heads, channels,
          num_levels, num_queries, num_points, (float*)grad_value_ptr,
          (float*)grad_sampling_loc_ptr, (float*)grad_attn_weight_ptr);
    } break;
    case MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL: {
      KernelMsDeformAttnBackwardSmallChannelsKernel(
          k_dim, k_type, queue, data_type, (float*)value_ptr,
          (int32_t*)spatial_shapes_ptr, (int32_t*)level_start_index_ptr,
          (float*)sampling_loc_ptr, (float*)attn_weight_ptr,
          (float*)grad_output_ptr, batch_size, num_keys, num_heads, channels,
          num_levels, num_queries, num_points, (float*)grad_value_ptr,
          (float*)grad_sampling_loc_ptr, (float*)grad_attn_weight_ptr);
    } break;
  }
  return MsDeformAttnBackwardLauncher(value, spatial_shapes, level_start_index,
                                      sampling_loc, attn_weight, grad_output,
                                      grad_value, grad_sampling_loc,
                                      grad_attn_weight, im2col_step);
}

Tensor ms_deform_attn_impl_forward(const Tensor& value,

@@ -515,5 +137,6 @@ void ms_deform_attn_impl_backward(

REGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, MLU,
                     ms_deform_attn_mlu_forward);

REGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, MLU,
                     ms_deform_attn_mlu_backward);

@@ -10,123 +10,35 @@
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/

#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
               const cnrtDataType_t data_type_input, const void *boxes_ptr,
               const void *scores_ptr, const int input_num_boxes,
               const int max_output_boxes, const float iou_threshold,
               const float offset, void *workspace_ptr, void *output_size_ptr,
               void *output_ptr);

int selectUnionType(uint32_t use_job, int box_num_per_core) {
  // the box_num_per_core should be at least 256, otherwise the real IO
  // bandwidth would be very low
  while (box_num_per_core < 256 && use_job >= 4) {
    box_num_per_core *= 2;
    use_job /= 2;
  }
  return use_job;
}
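
// Illustrative walk-through (added): with use_job = 16 and
// box_num_per_core = 100, the loop yields (200, 8), then (400, 4); 400 >= 256
// stops it, so 4 jobs are used with roughly 400 boxes per core.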

static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
                               int &core_num_per_class,
                               const int input_box_num) {
  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  uint32_t cluster_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  uint32_t job_limit = getJobLimitCapability();
  uint32_t core_number = job_limit;

  int box_num_per_core = (input_box_num + core_number - 1) / core_number;
  int use_job = selectUnionType(job_limit, box_num_per_core);
  // initiate k_type as Union1
  k_dim->x = core_dim;
  k_dim->y = 1;
  k_dim->z = 1;
  *k_type = CNRT_FUNC_TYPE_UNION1;
  switch (job_limit) {
    case CN_KERNEL_CLASS_BLOCK:
    case CN_KERNEL_CLASS_UNION:
    case CN_KERNEL_CLASS_UNION2:
    case CN_KERNEL_CLASS_UNION4:
    case CN_KERNEL_CLASS_UNION8:
    case CN_KERNEL_CLASS_UNION16: {
      if (use_job < 4) {
        k_dim->x = 1;
        *k_type = CNRT_FUNC_TYPE_BLOCK;
      } else if (use_job == 4) {
        k_dim->x = core_dim;
        *k_type = CNRT_FUNC_TYPE_UNION1;
      } else {
        k_dim->x = use_job;
        *k_type = (cnrtFunctionType_t)use_job;
      }
    }; break;
    default:
      LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number."
                   << " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task.";
  }
  return CNNL_STATUS_SUCCESS;
}
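
// In short (added note): fewer than 4 usable jobs fall back to a single-core
// BLOCK task, exactly 4 map to a UNION1 task across one cluster, and larger
// values launch the matching UNIONx task with k_dim->x = use_job.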

#include "mlu_common_helper.h"

Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                            int offset) {
  // dimension parameters check
  TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
              boxes.dim(), "D");
  TORCH_CHECK(boxes.size(1) == 4,
              "boxes should have 4 elements in dimension 1, got ",
              boxes.size(1));
  TORCH_CHECK(scores.dim() == 1, "scores should be a 1d tensor, got ",
              scores.dim(), "D");

  // data type check
  TORCH_CHECK(boxes.scalar_type() == scores.scalar_type(),
              "boxes should have the same type as scores");
  TORCH_CHECK(
      boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
      "data type of boxes should be Float or Half, got ", boxes.scalar_type());

  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  int input_num_boxes = boxes.size(0);
  int max_output_boxes = boxes.size(0);

  cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
  cnrtDim3_t k_dim;
  cnrtJobType_t k_type;

  int core_num_per_class;
  policyFunc(&k_dim, &k_type, core_num_per_class, input_num_boxes);

  // transpose boxes (n, 4) to (4, n) for better performance
  auto boxes_t = boxes.transpose(0, 1);
  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
  auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);
  auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kLong));
  auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kInt));
  auto output_size = at::empty({1}, scores.options().dtype(at::kInt));

  MluOpTensorDescriptor boxes_desc, scores_desc, output_desc;
  boxes_desc.set(boxes_);
  scores_desc.set(scores_);
  output_desc.set(output);

  // workspace
  const int info_num = 5;  // x1, x2, y1, y2 and score
  size_t space_size = 0;
  if (boxes.scalar_type() == at::kHalf) {
    space_size = input_num_boxes * sizeof(int16_t) * info_num + sizeof(float);
  } else {
    space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float);
  }
#if __BANG_ARCH__ > 370
  int cluster_num = getCoreNumOfJobLimitCapability() /
                    torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  space_size += cluster_num * sizeof(float) * 7;
#endif
  auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
  size_t workspace_size = 0;
  auto handle = mluOpGetCurrentHandle();
  mluOpGetNmsWorkspaceSize(handle, boxes_desc.desc(), scores_desc.desc(),
                           &workspace_size);
  auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
  auto boxes_ptr = boxes_impl->cnnlMalloc();
  auto scores_impl = torch_mlu::getMluTensorImpl(scores_);
@@ -138,14 +50,31 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
  auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
  auto output_size_ptr = output_size_impl->cnnlMalloc();

  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<<Union" << k_type / core_dim
              << ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
  KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
            input_num_boxes, max_output_boxes, iou_threshold, offset,
            workspace_ptr, output_size_ptr, output_ptr);
  // nms desc
  mluOpNmsDescriptor_t nms_desc;
  const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;
  const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;
  const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;
  const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;
  const float soft_nms_sigma = 0.0;
  const float confidence_threshold = 0.0;
  const int input_layout = 0;
  const bool pad_to_max_output_size = false;
  const int max_output_size = max_output_boxes;

  mluOpCreateNmsDescriptor(&nms_desc);
  mluOpSetNmsDescriptor(nms_desc, box_mode, output_mode, algo, method_mode,
                        iou_threshold, soft_nms_sigma, max_output_size,
                        confidence_threshold, (float)offset, input_layout,
                        pad_to_max_output_size);

  mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr, scores_desc.desc(),
           scores_ptr, workspace_ptr, workspace_size, output_desc.desc(),
           output_ptr, output_size_ptr);
  mluOpDestroyNmsDescriptor(nms_desc);
  int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
  return output.slice(0, 0, output_num);
  auto ret = output.to(boxes.options().dtype(at::kLong));
  return ret.slice(0, 0, output_num);
}
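
// Usage sketch (added; all values illustrative): boxes is an (N, 4) float or
// half tensor on the MLU and scores an (N,) tensor of the same dtype; the
// launcher returns the kept indices as an int64 tensor:
//   auto keep = NMSMLUKernelLauncher(boxes, scores, /*iou_threshold=*/0.5f,
//                                    /*offset=*/0);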

Tensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {

@@ -9,26 +9,7 @@
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                    cnrtQueue_t queue, const cnrtDataType_t d_type,
                    const void *input, const void *rois, const int channels,
                    const bool aligned, const int pooled_height,
                    const int pooled_width, const int input_height,
                    const int input_width, const int sampling_ratio,
                    const float spatial_scale, const int num_rois,
                    void *output);

void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                            cnrtQueue_t queue, const cnrtDataType_t dtype,
                            const void *grads, const void *boxes,
                            void *grads_image, const int boxes_num,
                            const int hi, const int wi, const int c,
                            const int no, const int ho, const int wo,
                            const float spatial_scale, const int sampling_ratio,
                            const bool aligned);
#include "mlu_common_helper.h"

void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax_y, Tensor argmax_x,

@@ -36,17 +17,7 @@ void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      float spatial_scale, int sampling_ratio,
                                      int pool_mode, bool aligned) {
  // params check
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "input type should be Float or Half, got ", input.scalar_type());
  TORCH_CHECK(rois.scalar_type() == input.scalar_type(),
              "rois should have the same type as input");
  TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
              input.dim(), "D");
  TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
              "D");
  TORCH_CHECK(pool_mode == 1, "pool_mode only supports 'avg' currently");

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_tensor =
@@ -57,52 +28,56 @@ void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
  int height = input.size(2);
  int width = input.size(3);

  if (output.numel() == 0) {
    output = at::zeros({num_rois, channels, aligned_height, aligned_width},
                       input.options());
    return;
  }

  at::Tensor output_tmp =
  auto output_contiguous =
      at::empty({num_rois, channels, aligned_height, aligned_width},
                input.options(), memory_format);

  // get tensor impl
  auto self_impl = torch_mlu::getMluTensorImpl(input_tensor);
  auto rois_impl = torch_mlu::getMluTensorImpl(rois);
  auto output_impl = torch_mlu::getMluTensorImpl(output_tmp);
  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();
  MluOpTensorDescriptor input_desc, rois_desc, argmax_y_desc, argmax_x_desc,
      output_desc;
  input_desc.set_with_layout(input_tensor, MLUOP_LAYOUT_NHWC);
  rois_desc.set_with_layout(rois, MLUOP_LAYOUT_ARRAY);
  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);

  // get the mlu ptr
  auto self_ptr = self_impl->cnnlMalloc();
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto output_ptr = output_impl->cnnlMalloc();

  cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
  cnrtDim3_t k_dim;
  k_dim.x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  k_dim.y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  k_dim.z = 1;
  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input.dtype());
  mluOpRoiAlignForwardDescriptor_t roialign_desc;
  mluOpCreateRoiAlignForwardDescriptor(&roialign_desc);
  mluOpSetRoiAlignForwardDescriptor_v2(roialign_desc, aligned_height,
                                       aligned_width, sampling_ratio,
                                       spatial_scale, pool_mode, aligned);

  KernelRoiAlign(k_dim, k_type, queue, data_type, self_ptr, rois_ptr, channels,
                 aligned, aligned_height, aligned_width, height, width,
                 sampling_ratio, spatial_scale, num_rois, output_ptr);

  output.copy_(output_tmp);
}

static int nearestPower2(int x) {
  x--;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  x++;
  return x;
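  // Worked example (added): nearestPower2(5) decrements to 4 (0b100), smears
  // the high bit into 0b111, then adds 1 to give 8; exact powers of two map
  // to themselves.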
  auto handle = mluOpGetCurrentHandle();
  if (pool_mode == 0) {
    auto argmax_y_contiguous =
        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_y, memory_format);
    auto argmax_x_contiguous =
        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_x, memory_format);
    auto argmax_x_impl = torch_mlu::getMluTensorImpl(argmax_x_contiguous);
    auto argmax_y_impl = torch_mlu::getMluTensorImpl(argmax_y_contiguous);
    auto argmax_x_ptr = argmax_x_impl->cnnlMalloc();
    auto argmax_y_ptr = argmax_y_impl->cnnlMalloc();
    argmax_y_desc.set_with_layout(argmax_y_contiguous, MLUOP_LAYOUT_NHWC);
    argmax_x_desc.set_with_layout(argmax_x_contiguous, MLUOP_LAYOUT_NHWC);
    mluOpRoiAlignForward_v2(handle, roialign_desc, input_desc.desc(), self_ptr,
                            rois_desc.desc(), rois_ptr, output_desc.desc(),
                            output_ptr, argmax_x_desc.desc(), argmax_x_ptr,
                            argmax_y_desc.desc(), argmax_y_ptr);
    argmax_x.copy_(argmax_x_contiguous);
    argmax_y.copy_(argmax_y_contiguous);
  } else {
    mluOpRoiAlignForward_v2(handle, roialign_desc, input_desc.desc(), self_ptr,
                            rois_desc.desc(), rois_ptr, output_desc.desc(),
                            output_ptr, NULL, NULL, NULL, NULL);
  }
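  // Note (added): with pool_mode == 0 ('max') the kernel also fills
  // argmax_y / argmax_x with the sampling positions of the maxima; for 'avg'
  // pooling those outputs are unused, so NULL descriptors and pointers are
  // passed instead.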
  mluOpDestroyRoiAlignForwardDescriptor(roialign_desc);
  output.copy_(output_contiguous);
}

void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,

@@ -112,17 +87,7 @@ void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,
                                       int sampling_ratio, int pool_mode,
                                       bool aligned) {
  // params check
  TORCH_CHECK(
      grad.scalar_type() == at::kFloat || grad.scalar_type() == at::kHalf,
      "grad type should be Float or Half, got ", grad.scalar_type());
  TORCH_CHECK(rois.scalar_type() == grad.scalar_type(),
              "rois should have the same type as grad");
  TORCH_CHECK(grad.dim() == 4, "grad should be a 4d tensor, got ", grad.dim(),
              "D");
  TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
              "D");
  TORCH_CHECK(pool_mode == 1, "pool_mode only supports 'avg' currently");

  int batch_size = grad_input.size(0);
  int channels = grad_input.size(1);
  int height = grad_input.size(2);

@@ -148,26 +113,40 @@ void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,
  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
  auto rois_impl = torch_mlu::getMluTensorImpl(rois);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get the mlu ptr
  auto grad_ptr = grad_impl->cnnlMalloc();
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto grad_input_ptr = grad_input_impl->cnnlMalloc();

  cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
  int need_core = nearestPower2(boxes_num);
  int union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  uint32_t dim_x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  uint32_t dim_y = (need_core - 1) / dim_x + 1;
  dim_y = (dim_y > union_number) ? union_number : dim_y;
  cnrtDim3_t k_dim = {dim_x, dim_y, 1};
  cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad.dtype());
  MluOpTensorDescriptor grads_desc, rois_desc, argmax_y_desc, argmax_x_desc,
      grad_input_desc;
  grads_desc.set_with_layout(grad_, MLUOP_LAYOUT_NHWC);
  rois_desc.set_with_layout(rois, MLUOP_LAYOUT_ARRAY);
  grad_input_desc.set_with_layout(grad_input_, MLUOP_LAYOUT_NHWC);

  KernelRoiAlignBackward(k_dim, k_type, queue, k_dtype, grad_ptr, rois_ptr,
                         grad_input_ptr, boxes_num, hi, wi, c, no, ho, wo,
                         spatial_scale, sampling_ratio, aligned);
  auto handle = mluOpGetCurrentHandle();
  if (pool_mode == 0) {
    auto argmax_y_contiguous =
        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_y, memory_format);
    auto argmax_x_contiguous =
        torch_mlu::cnnl::ops::cnnl_contiguous(argmax_x, memory_format);
    auto argmax_x_impl = torch_mlu::getMluTensorImpl(argmax_x_contiguous);
    auto argmax_y_impl = torch_mlu::getMluTensorImpl(argmax_y_contiguous);
    auto argmax_x_ptr = argmax_x_impl->cnnlMalloc();
    auto argmax_y_ptr = argmax_y_impl->cnnlMalloc();
    argmax_y_desc.set_with_layout(argmax_y_contiguous, MLUOP_LAYOUT_NHWC);
    argmax_x_desc.set_with_layout(argmax_x_contiguous, MLUOP_LAYOUT_NHWC);
    mluOpRoiAlignBackward_v2(handle, grads_desc.desc(), grad_ptr,
                             rois_desc.desc(), rois_ptr, argmax_x_desc.desc(),
                             argmax_x_ptr, argmax_y_desc.desc(), argmax_y_ptr,
                             spatial_scale, sampling_ratio, aligned, pool_mode,
                             grad_input_desc.desc(), grad_input_ptr);
  } else {
    mluOpRoiAlignBackward_v2(handle, grads_desc.desc(), grad_ptr,
                             rois_desc.desc(), rois_ptr, NULL, NULL, NULL, NULL,
                             spatial_scale, sampling_ratio, aligned, pool_mode,
                             grad_input_desc.desc(), grad_input_ptr);
  }
  grad_input.copy_(grad_input_);
}

@@ -9,238 +9,69 @@
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#include "mlu_common_helper.h"

#define MIN(a, b) (((a) < (b)) ? (a) : (b))

void KernelDynamicVoxelize(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const void *points, void *coors, const float voxel_x, const float voxel_y,
    const float voxel_z, const float coors_x_min, const float coors_y_min,
    const float coors_z_min, const float coors_x_max, const float coors_y_max,
    const float coors_z_max, const int32_t grid_x, const int32_t grid_y,
    const int32_t grid_z, const int32_t num_points, const int32_t num_features);

void KernelPoint2Voxel(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                       cnrtQueue_t queue, void *coors, void *point_to_pointidx,
                       void *point_to_voxelidx, const int32_t num_points,
                       const int32_t max_points);

void KernelCalcPointsPerVoxel(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                              cnrtQueue_t queue, void *point_to_pointidx,
                              void *point_to_voxelidx, void *coor_to_voxelidx,
                              void *num_points_per_voxel, void *voxel_num,
                              const int32_t max_voxels,
                              const int32_t num_points);

void KernelAssignVoxelsCoors(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                             cnrtQueue_t queue, const void *points,
                             void *temp_coors, void *point_to_voxelidx,
                             void *coor_to_voxelidx, void *voxels, void *coors,
                             const int32_t max_points, const int32_t num_points,
                             const int32_t num_features);

// policy function
static void policyFuncDefault(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
                              const int num_points) {
  k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  k_dim->y = MIN((num_points + k_dim->x - 1) / k_dim->x,
                 torch_mlu::getDeviceAttr(cnrtAttrClusterCount));
  k_dim->z = 1;
  *k_type = CNRT_FUNC_TYPE_UNION1;
}

// policy function
static void policyFuncCalcPointsPerVoxel(cnrtDim3_t *k_dim,
                                         cnrtFunctionType_t *k_type,
                                         const int num_points) {
  k_dim->x = 1;
  k_dim->y = 1;
  k_dim->z = 1;
  *k_type = CNRT_FUNC_TYPE_BLOCK;
}
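
// Illustrative sizing (added; assumes 4 cores per cluster and 8 clusters):
// policyFuncDefault with num_points = 1000 gives k_dim = {4, MIN(250, 8), 1}
// = {4, 8, 1} as a UNION1 task, i.e. one job per cluster.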

/*************************************************************************
 * This MACRO contains operations of simple tensor to mlu-tensor.
 * _contiguous, _desc, _impl, _ptr will be automatically generated in
 * this MACRO.
 *************************************************************************/
#define INITIAL_MLU_PARAM_WITH_TENSOR(NAME)                           \
  auto NAME##_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(     \
      NAME, NAME.suggest_memory_format());                            \
  MluOpTensorDescriptor NAME##_desc;                                  \
  NAME##_desc.set(NAME##_contiguous);                                 \
  auto NAME##_impl = torch_mlu::getMluTensorImpl(NAME##_contiguous);  \
  auto NAME##_ptr = NAME##_impl->cnnlMalloc();
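
// For example (added), INITIAL_MLU_PARAM_WITH_TENSOR(points) defines
// points_contiguous (a contiguous copy), points_desc (an
// MluOpTensorDescriptor set from it), points_impl, and points_ptr (the raw
// device pointer used in the mluOp* calls below).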

int HardVoxelizeForwardMLUKernelLauncher(
    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
    const std::vector<float> coors_range, const int max_points,
    const int max_voxels, const int NDim = 3) {
  // check datatype
  TORCH_CHECK(points.scalar_type() == at::kFloat,
              "points type should be Float, got ", points.scalar_type(), ".");
  TORCH_CHECK(voxels.scalar_type() == at::kFloat,
              "voxels type should be Float, got ", voxels.scalar_type(), ".");
  TORCH_CHECK(coors.scalar_type() == at::kInt,
              "coors type should be Int, got ", coors.scalar_type(), ".");
  TORCH_CHECK(num_points_per_voxel.scalar_type() == at::kInt,
              "num_points_per_voxel type should be Int, got ",
              num_points_per_voxel.scalar_type(), ".");
  std::vector<float> _voxel_size(voxel_size.begin(), voxel_size.end());
  std::vector<float> _coors_range(coors_range.begin(), coors_range.end());
  auto opts = torch::TensorOptions().dtype(torch::kFloat32);
  auto voxel_size_tensor =
      torch::from_blob(_voxel_size.data(), {int64_t(_voxel_size.size())}, opts)
          .clone()
          .to(at::kMLU);
  auto coors_range_tensor =
      torch::from_blob(_coors_range.data(), {int64_t(_coors_range.size())},
                       opts)
          .clone()
          .to(at::kMLU);
  INITIAL_MLU_PARAM_WITH_TENSOR(points);
  INITIAL_MLU_PARAM_WITH_TENSOR(voxels);
  INITIAL_MLU_PARAM_WITH_TENSOR(coors);
  INITIAL_MLU_PARAM_WITH_TENSOR(num_points_per_voxel);
  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_size_tensor);
  INITIAL_MLU_PARAM_WITH_TENSOR(coors_range_tensor);

  // check shape
  TORCH_CHECK(points.dim() == 2, "points should be a 2d tensor, got ",
              points.dim(), "D.");
  TORCH_CHECK(voxels.dim() == 3, "voxels should be a 3d tensor, got ",
              voxels.dim(), "D.");
  TORCH_CHECK(coors.dim() == 2, "coors should be a 2d tensor, got ",
              coors.dim(), "D.");
  TORCH_CHECK(num_points_per_voxel.dim() == 1,
              "num_points_per_voxel should be a 1d tensor, got ",
              num_points_per_voxel.dim(), "D.");
  auto voxel_num_tensor = at::empty({1}, points.options().dtype(torch::kInt32));
  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_num_tensor);

  const int num_points = points.size(0);
  const int num_features = points.size(1);
  size_t workspace_size;
  auto handle = mluOpGetCurrentHandle();
  mluOpGetVoxelizationWorkspaceSize(
      handle, points_desc.desc(), voxel_size_tensor_desc.desc(),
      coors_range_tensor_desc.desc(), max_points, max_voxels, NDim, true,
      voxels_desc.desc(), coors_desc.desc(), num_points_per_voxel_desc.desc(),
      voxel_num_tensor_desc.desc(), &workspace_size);
  auto workspace_tensor =
      at::empty(workspace_size, points.options().dtype(at::kByte));
  INITIAL_MLU_PARAM_WITH_TENSOR(workspace_tensor);

  TORCH_CHECK(points.size(0) == num_points,
              "the 1st dimension of points should be num_points, got ",
              points.size(0), ".");
  TORCH_CHECK(points.size(1) == num_features,
              "the 2nd dimension of points should be num_features, got ",
              points.size(1), ".");
  TORCH_CHECK(voxels.size(0) == max_voxels,
              "the 1st dimension of voxels should be max_voxels, got ",
              voxels.size(0), ".");
  TORCH_CHECK(voxels.size(1) == max_points,
              "the 2nd dimension of voxels should be max_points, got ",
              voxels.size(1), ".");
  TORCH_CHECK(voxels.size(2) == num_features,
              "the 3rd dimension of voxels should be num_features, got ",
              voxels.size(2), ".");
  TORCH_CHECK(coors.size(0) == max_voxels,
              "the 1st dimension of coors should be max_voxels, got ",
              coors.size(0), ".");
  TORCH_CHECK(coors.size(1) == 3,
              "the 2nd dimension of coors should be 3, got ", coors.size(1),
              ".");
  TORCH_CHECK(num_points_per_voxel.size(0) == max_voxels,
              "the 1st dimension of num_points_per_voxel should be "
              "max_voxels, got ", num_points_per_voxel.size(0), ".");

  // large tensor check
  const size_t max_input_size = 2147483648;
  TORCH_CHECK(points.numel() < max_input_size,
              "points element num should be less than 2^31, got ",
              points.numel(), ".");
  TORCH_CHECK(voxels.numel() < max_input_size,
              "voxels element num should be less than 2^31, got ",
              voxels.numel(), ".");
  TORCH_CHECK(coors.numel() < max_input_size,
              "coors element num should be less than 2^31, got ", coors.numel(),
              ".");

  // check zero element
  if (max_points == 0 || max_voxels == 0) {
    return 0;
  }

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get ptr of tensors
  auto points_ = points.contiguous();
  auto points_impl = torch_mlu::getMluTensorImpl(points_);
  auto points_ptr = points_impl->cnnlMalloc();
  auto voxels_ = voxels.contiguous();
  auto voxels_impl = torch_mlu::getMluTensorImpl(voxels_);
  auto voxels_ptr = voxels_impl->cnnlMalloc();
  auto coors_ = coors.contiguous();
  auto coors_impl = torch_mlu::getMluTensorImpl(coors_);
  auto coors_ptr = coors_impl->cnnlMalloc();
  auto num_points_per_voxel_ = num_points_per_voxel.contiguous();
  auto num_points_per_voxel_impl =
      torch_mlu::getMluTensorImpl(num_points_per_voxel_);
  auto num_points_per_voxel_ptr = num_points_per_voxel_impl->cnnlMalloc();

  // calculate task dimension
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFuncDefault(&k_dim, &k_type, num_points);

  // 1. link point to corresponding voxel coors
  const float voxel_x = voxel_size[0];
  const float voxel_y = voxel_size[1];
  const float voxel_z = voxel_size[2];
  const float coors_x_min = coors_range[0];
  const float coors_y_min = coors_range[1];
  const float coors_z_min = coors_range[2];
  const float coors_x_max = coors_range[3];
  const float coors_y_max = coors_range[4];
  const float coors_z_max = coors_range[5];

  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
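  // Worked example (added, values assumed): with voxel_x = 0.5 and an x-range
  // of [-54, 54], grid_x = round(108 / 0.5) = 216 voxels along x.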

  auto temp_coors =
      at::zeros({NDim, num_points}, points.options().dtype(at::kInt))
          .contiguous();
  auto temp_coors_impl = torch_mlu::getMluTensorImpl(temp_coors);
  auto temp_coors_ptr = temp_coors_impl->cnnlMalloc();

  KernelDynamicVoxelize(k_dim, k_type, queue, points_ptr, temp_coors_ptr,
                        voxel_x, voxel_y, voxel_z, coors_x_min, coors_y_min,
                        coors_z_min, coors_x_max, coors_y_max, coors_z_max,
                        grid_x, grid_y, grid_z, num_points, num_features);

  // 2. map point to the idx of the corresponding voxel, find duplicate coor
  auto point_to_pointidx =
      at::zeros({num_points}, points.options().dtype(at::kInt)).contiguous();
  auto point_to_pointidx_impl = torch_mlu::getMluTensorImpl(point_to_pointidx);
  auto point_to_pointidx_ptr = point_to_pointidx_impl->cnnlMalloc();
  auto point_to_voxelidx =
      at::zeros({num_points}, points.options().dtype(at::kInt)).contiguous();
  auto point_to_voxelidx_impl = torch_mlu::getMluTensorImpl(point_to_voxelidx);
  auto point_to_voxelidx_ptr = point_to_voxelidx_impl->cnnlMalloc();

  KernelPoint2Voxel(k_dim, k_type, queue, temp_coors_ptr, point_to_pointidx_ptr,
                    point_to_voxelidx_ptr, num_points, max_points);

  // calculate task dimension
  cnrtDim3_t k_dim_calc_points_per_voxel;
  cnrtFunctionType_t k_type_calc_points_per_voxel;
  policyFuncCalcPointsPerVoxel(&k_dim_calc_points_per_voxel,
                               &k_type_calc_points_per_voxel, num_points);

  // 3. determine voxel num and voxel's coor index
  auto coor_to_voxelidx =
      at::zeros({num_points}, points.options().dtype(at::kInt)).contiguous();
  auto coor_to_voxelidx_impl = torch_mlu::getMluTensorImpl(coor_to_voxelidx);
  auto coor_to_voxelidx_ptr = coor_to_voxelidx_impl->cnnlMalloc();
  auto voxel_num =
      at::zeros({1}, points.options().dtype(at::kInt)).contiguous();
  auto voxel_num_impl = torch_mlu::getMluTensorImpl(voxel_num);
  auto voxel_num_ptr = voxel_num_impl->cnnlMalloc();

  KernelCalcPointsPerVoxel(
      k_dim_calc_points_per_voxel, k_type_calc_points_per_voxel, queue,
      point_to_pointidx_ptr, point_to_voxelidx_ptr, coor_to_voxelidx_ptr,
      num_points_per_voxel_ptr, voxel_num_ptr, max_voxels, num_points);

  // 4. copy point features and coors of each voxel to voxels
  KernelAssignVoxelsCoors(k_dim, k_type, queue, points_ptr, temp_coors_ptr,
                          point_to_voxelidx_ptr, coor_to_voxelidx_ptr,
                          voxels_ptr, coors_ptr, max_points, num_points,
                          num_features);

  auto voxel_num_cpu = voxel_num.to(at::kCPU);
  mluOpVoxelization(handle, points_desc.desc(), points_ptr,
                    voxel_size_tensor_desc.desc(), voxel_size_tensor_ptr,
                    coors_range_tensor_desc.desc(), coors_range_tensor_ptr,
                    max_points, max_voxels, NDim, true, workspace_tensor_ptr,
                    workspace_size, voxels_desc.desc(), voxels_ptr,
                    coors_desc.desc(), coors_ptr,
                    num_points_per_voxel_desc.desc(), num_points_per_voxel_ptr,
                    voxel_num_tensor_desc.desc(), voxel_num_tensor_ptr);
  auto voxel_num_cpu = voxel_num_tensor.to(at::kCPU);
  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];

  return voxel_num_int;
}
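
// Usage sketch (added; all values illustrative): voxels, coors and
// num_points_per_voxel are pre-allocated outputs, and the return value is
// the number of voxels actually produced:
//   int n = HardVoxelizeForwardMLUKernelLauncher(
//       points, voxels, coors, num_points_per_voxel,
//       /*voxel_size=*/{0.5f, 0.5f, 4.0f},
//       /*coors_range=*/{-54.f, -54.f, -5.f, 54.f, 54.f, 3.f},
//       /*max_points=*/32, /*max_voxels=*/20000);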

@@ -254,7 +85,7 @@ int hard_voxelize_forward_mlu(const at::Tensor &points, at::Tensor &voxels,
  return HardVoxelizeForwardMLUKernelLauncher(
      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
      max_points, max_voxels, NDim);
};
}

int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
                               at::Tensor &coors,

setup.py

@@ -272,6 +272,7 @@ def get_extensions():
    include_dirs = []

    extra_objects = []
    extra_link_args = []
    is_rocm_pytorch = False
    try:
        from torch.utils.cpp_extension import ROCM_HOME
@@ -388,8 +389,11 @@ def get_extensions():
            './mlu-ops/bangc-ops/kernels/**/*.cpp', recursive=True) + \
            glob.glob(
                './mlu-ops/bangc-ops/kernels/**/*.mlu', recursive=True)
        extra_objects = glob.glob(
            './mlu-ops/bangc-ops/kernels/kernel_wrapper/*.o')
        extra_link_args = [
            '-Wl,--whole-archive',
            './mlu-ops/bangc-ops/kernels/kernel_wrapper/lib/libextops.a',
            '-Wl,--no-whole-archive'
        ]
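        # Added note: --whole-archive forces the linker to keep every object
        # in libextops.a, so kernel symbols that are only resolved at runtime
        # are not dropped.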
        extension = MLUExtension
        include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common'))
        include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/mlu'))

@@ -456,7 +460,8 @@ def get_extensions():
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_objects=extra_objects,
            extra_compile_args=extra_compile_args)
            extra_compile_args=extra_compile_args,
            extra_link_args=extra_link_args)
        extensions.append(ext_ops)

    if EXT_TYPE == 'pytorch' and os.getenv('MMCV_WITH_ORT', '0') != '0':

@@ -93,6 +93,15 @@ def _test_roialign_allclose(device, dtype):
        x.grad.data.type(torch.float).cpu().numpy(), np_grad, atol=1e-3)


@pytest.mark.parametrize('dtype', [
    torch.float,
    pytest.param(
        torch.double,
        marks=pytest.mark.skipif(
            IS_MLU_AVAILABLE or IS_NPU_AVAILABLE,
            reason='MLU and NPU do not support 64-bit floating point')),
    torch.half
])
@pytest.mark.parametrize('device', [
    'cpu',
    pytest.param(

@@ -108,15 +117,6 @@ def _test_roialign_allclose(device, dtype):
        marks=pytest.mark.skipif(
            not IS_NPU_AVAILABLE, reason='requires NPU support'))
])
@pytest.mark.parametrize('dtype', [
    torch.float,
    pytest.param(
        torch.double,
        marks=pytest.mark.skipif(
            IS_MLU_AVAILABLE or IS_NPU_AVAILABLE,
            reason='MLU and NPU do not support 64-bit floating point')),
    torch.half
])
def test_roialign(device, dtype):
    # check double only
    if dtype is torch.double: