[Improve] Improve the performance of roi_align, nms and focal loss with MLU backend (#1572)

* [Improve] Improve the performance of roi_align with MLU backend

* replace CHECK_MLU with CHECK_MLU_INPUT

* [Improve] Improve the performance of nms and sigmoid focal loss with MLU backend
Authored by zhouchenyang on 2021-12-21 21:29:10 +08:00; committed by GitHub
parent e1c73552a4
commit 9d007f951f
7 changed files with 1822 additions and 600 deletions


@ -24,8 +24,21 @@ __mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size,
const int32_t dst_stride = 0,
const int32_t src_stride = 0,
const int32_t count = 1) {
__memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride,
src_stride, count - 1);
if (dst_stride == src_stride) {
__memcpy_async(nram_input, dram_input, size * count, GDRAM2NRAM);
} else {
__memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride,
src_stride, count - 1);
}
}
template <typename T>
__mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t,
const int32_t c, const int32_t has_weight,
const int32_t partition_nc) {
if (has_weight && partition_nc && t >= 0 && t < c) {
__memcpy_async(nram_input, (T *)dram_input + t, sizeof(T), GDRAM2NRAM);
}
}
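The new branch above is a fast path: when the destination and source strides match, which at these call sites means the rows are already packed back to back (or count is 1), the segmented copy collapses into one contiguous transfer of size * count bytes and saves per-segment DMA overhead. A minimal host-side sketch of that equivalence in plain C++ (strided_copy is a hypothetical stand-in for __memcpy_async, with the same assumed stride/count semantics):

#include <cassert>
#include <cstring>
#include <vector>

// Reference semantics: copy `count` rows of `size` bytes, with rows laid out
// `dst_stride` / `src_stride` bytes apart on each side.
void strided_copy(char *dst, const char *src, int size, int dst_stride,
                  int src_stride, int count) {
  if (dst_stride == src_stride && dst_stride == size) {
    // rows are contiguous on both sides: one big copy. The kernel's fast path
    // only tests stride equality, since its equal-stride call sites are packed.
    std::memcpy(dst, src, static_cast<size_t>(size) * count);
    return;
  }
  for (int i = 0; i < count; ++i)  // general strided path
    std::memcpy(dst + i * dst_stride, src + i * src_stride, size);
}

int main() {
  std::vector<char> src(64), fast(64), slow(64);
  for (int i = 0; i < 64; ++i) src[i] = static_cast<char>(i);
  strided_copy(fast.data(), src.data(), 8, 8, 8, 8);  // fast path
  for (int i = 0; i < 8; ++i)
    std::memcpy(slow.data() + i * 8, src.data() + i * 8, 8);
  assert(fast == slow);  // both paths move the same bytes
  return 0;
}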
template <typename T>
@ -33,152 +46,117 @@ __mlu_func__ void storeOutput(T *dram_output, char *nram_output,
const int32_t size, const int32_t dst_stride = 0,
const int32_t src_stride = 0,
const int32_t count = 1) {
__memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride,
src_stride, count - 1);
if (dst_stride == src_stride) {
__memcpy_async(dram_output, nram_output, size * count, NRAM2GDRAM);
} else {
__memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride,
src_stride, count - 1);
}
}
template <typename T>
__mlu_func__ void compute(T *input, const int32_t *target, const T *weight,
const int32_t has_weight, const int32_t deal_num,
const int32_t n_seg, const int32_t C, float alpha,
float gamma, T *scalar_temp, T *tensor_max,
T *tensor_temp, T *output) {
const int32_t scalar_elem_num = NFU_ALIGN_SIZE / sizeof(T);
const int32_t has_weight, const int32_t partition_nc,
const int32_t deal_num, const int32_t n_seg,
const int32_t c, const int32_t c_seg,
const int32_t c_start_index, const float alpha,
const float gamma, T *compute_a, T *compute_b,
T *output) {
// set params
const int32_t c_num =
has_weight ? PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)) : c_seg;
const int32_t c_end_index = c_start_index + c_seg;
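// Note: 0x0400 reinterpreted as half is the smallest positive normal value
// (2^-14, about 6.1e-5), i.e. the half counterpart of FLT_MIN used below.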
const int32_t half_epsilon = 0x0400;
const T epsilon_f =
sizeof(T) == sizeof(float) ? FLT_MIN : *((half *)&half_epsilon);
// 0. n_max = max(0, x)
__nramset((T *)tensor_max, deal_num, (T)0);
__bang_cycle_maxequal((T *)tensor_max, (T *)tensor_max, (T *)input, deal_num,
deal_num);
// 1. ln(1+e^x) = ln(e^(-max) + e^(x-max)) + max
__nramset((T *)scalar_temp, scalar_elem_num, (T)-1);
__bang_cycle_mul((T *)tensor_temp, (T *)tensor_max, (T *)scalar_temp,
deal_num, scalar_elem_num);
__bang_cycle_add((T *)output, (T *)input, (T *)tensor_temp, deal_num,
deal_num);
__bang_active_exphp((T *)output, (T *)output, deal_num);
__bang_active_exphp((T *)tensor_temp, (T *)tensor_temp, deal_num);
__bang_cycle_add((T *)output, (T *)output, (T *)tensor_temp, deal_num,
deal_num);
__bang_active_loghp((T *)output, (T *)output, deal_num);
__bang_cycle_add((T *)output, (T *)output, (T *)tensor_max, deal_num,
deal_num);
// 2. temp = [1 + e^(-x)] ^ (-r)
__nramset((T *)scalar_temp, scalar_elem_num, (T)-1);
__bang_cycle_mul((T *)tensor_temp, (T *)input, (T *)scalar_temp, deal_num,
scalar_elem_num);
__bang_active_exphp((T *)tensor_temp, (T *)tensor_temp, deal_num);
__nramset((T *)scalar_temp, scalar_elem_num, (T)1);
__bang_cycle_add((T *)tensor_temp, (T *)tensor_temp, (T *)scalar_temp,
deal_num, scalar_elem_num);
__bang_active_loghp((T *)tensor_temp, (T *)tensor_temp, deal_num);
__nramset((T *)scalar_temp, scalar_elem_num, (T)(-gamma));
__bang_cycle_mul((T *)tensor_temp, (T *)tensor_temp, (T *)scalar_temp,
deal_num, scalar_elem_num);
__bang_active_exphp((T *)tensor_temp, (T *)tensor_temp, deal_num);
// 3.1 output: target != j
__nramset((T *)scalar_temp, scalar_elem_num, (T)(1 - alpha));
__bang_cycle_mul((T *)output, (T *)output, (T *)scalar_temp, deal_num,
scalar_elem_num);
__bang_cycle_mul((T *)output, (T *)output, (T *)tensor_temp, deal_num,
deal_num);
// 3.2 output: target == j
const int32_t c_align_size = PAD_UP((sizeof(T) * C), NFU_ALIGN_SIZE);
// 0. alpha_t * p_t^r = alpha * (1 - p) ^ gamma if t == c_i
// = (1 - alpha) * p ^ gamma if t != c_i
__nramset((T *)output, deal_num, (T)(1 - alpha));
__bang_active_sigmoid((T *)compute_b, (T *)input, deal_num);
for (int32_t i = 0; i < n_seg; ++i) {
const int32_t target_value = *((int32_t *)target + i);
if (target_value >= 0 && target_value < C) {
const int32_t offset = i * c_align_size + target_value * sizeof(T);
char *addr_input = (char *)input + offset;
char *addr_output = (char *)output + offset;
const float x = *(T *)addr_input;
const float p = 1. / (1. + exp(-x));
*(T *)addr_output = -alpha * pow(1. - p, gamma) * log(fmax(p, FLT_MIN));
const int32_t t = *((uint32_t *)target + i);
if (t >= c_start_index && t < c_end_index) {
const uint32_t index = i * c_num + t - c_start_index;
*((T *)input + index) = -1.0 * (*((T *)input + index));
*((T *)compute_b + index) = 1.0 - (*((T *)compute_b + index)) + epsilon_f;
*((T *)output + index) = alpha;
}
}
if (sizeof(T) == sizeof(half)) {
__bang_half2float((float *)compute_a, (half *)compute_b, deal_num);
__bang_active_loghp((float *)compute_a, (float *)compute_a, deal_num);
__bang_mul_const((float *)compute_a, (float *)compute_a, (float)gamma,
deal_num);
__bang_active_exphp((float *)compute_a, (float *)compute_a, deal_num);
__bang_float2half_rd((half *)compute_a, (float *)compute_a, deal_num);
} else {
__bang_active_loghp((T *)compute_a, (T *)compute_b, deal_num);
__bang_mul_const((T *)compute_a, (T *)compute_a, (T)gamma, deal_num);
__bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num);
}
__bang_mul((T *)output, (T *)compute_a, (T *)output, deal_num);
// with weight
if (has_weight > 0) {
int32_t row_num_elem = deal_num / n_seg;
// 1. max = max(0, -x) if t == c_i
// = max(0, x) if t != c_i
__nramset((T *)compute_b, deal_num, (T)0);
__bang_maxequal((T *)compute_b, (T *)compute_b, (T *)input, deal_num);
// 2. -log(p_t) = ln(e^(-max)+ e^(-max-x) + max if t == c_i
// = ln(e^(-max)+ e^(-max+x) + max if t != c_i
__bang_mul_const((T *)compute_a, (T *)compute_b, (T)-1.0, deal_num);
__bang_add((T *)input, (T *)compute_a, (T *)input, deal_num);
__bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num);
__bang_active_exphp((T *)input, (T *)input, deal_num);
__bang_add((T *)compute_a, (T *)compute_a, (T *)input, deal_num);
__bang_active_loghp((T *)compute_a, (T *)compute_a, deal_num);
__bang_add((T *)input, (T *)compute_a, (T *)compute_b, deal_num);
// 3. output = alpha_t * p_t^r * [-log(p_t)]
__bang_mul((T *)output, (T *)output, (T *)input, deal_num);
// 4. with weight
if (has_weight) {
for (int32_t i = 0; i < n_seg; ++i) {
const int32_t t = *((int32_t *)target + i);
__nramset((T *)scalar_temp, scalar_elem_num, *((T *)weight + t));
__bang_cycle_mul((T *)output + i * row_num_elem,
(T *)output + i * row_num_elem, (T *)scalar_temp,
row_num_elem, scalar_elem_num);
int32_t t = *((int32_t *)target + i);
if (t >= 0 && t < c) {
t = partition_nc ? 0 : t;
__bang_mul_const((T *)output + i * c_num, (T *)output + i * c_num,
*((T *)weight + t), c_num);
}
}
}
}
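For reference, the quantity the rewritten compute() produces per element is the standard sigmoid focal loss FL = -alpha_t * (1 - p_t)^gamma * log(p_t), where p_t is sigmoid(x) for the target class and 1 - sigmoid(x) otherwise, and -log(p_t) is evaluated in the max-shifted form ln(e^-max + e^(x - max)) + max for numerical stability. A scalar host-side sketch in plain C++ (focal_loss_ref and its arguments are illustrative, not part of the kernel):

#include <algorithm>
#include <cmath>

// Reference sigmoid focal loss for one logit x of class j with target label t.
float focal_loss_ref(float x, int j, int t, float alpha, float gamma) {
  const bool is_target = (j == t);
  const float alpha_t = is_target ? alpha : 1.f - alpha;
  const float z = is_target ? -x : x;           // the kernel negates x when t == c_i
  const float p_t = 1.f / (1.f + std::exp(z));  // sigmoid(x) if target, else 1 - sigmoid(x)
  const float m = std::max(0.f, z);             // max(0, -x) or max(0, x), as in step 1
  const float neg_log_pt = std::log(std::exp(-m) + std::exp(z - m)) + m;  // -log(p_t), step 2
  // the kernel evaluates (1 - p_t)^gamma on-chip as exp(gamma * log(1 - p_t + eps))
  return alpha_t * std::pow(1.f - p_t, gamma) * neg_log_pt;
}

For example, focal_loss_ref(2.0f, 1, 1, 0.25f, 2.0f) is roughly 0.25 * 0.119^2 * 0.127, about 4.5e-4.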
template <typename T>
__mlu_func__ void focalLossSigmoidForwardBlock(
__mlu_func__ void startPipeline(
const T *input, const int32_t *target, const T *weight,
const int32_t row_num, const int32_t C, const float alpha,
const float gamma, T *output) {
/*
* NRAM partition
* |-----------------------------------------------------------------------|
* | scalar |
* |-----------------------------------------------------------------------|
* | weight |
* |------------------------------- COMPUTE -------------------------------|
* | | |
* | computeA | computeB |
* | | |
* |------------- PING ------------------------------- PONG ---------------|
* | | |
* | input | input |
* | | |
* |-----------------------------------|-----------------------------------|
* | | |
* | output | output |
* | | |
* |-----------------------------------|-----------------------------------|
* | target | target |
* |-----------------------------------|-----------------------------------|
*
* split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output),
* PONG(input,output).
* split_target_num is 2: PING(target), PONG(target).
*/
const int32_t c_align = PAD_UP(C, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_align_size = c_align * sizeof(T);
const int32_t scalar_size = NFU_ALIGN_SIZE;
const int32_t weight_size = (weight != NULL) * c_align_size;
const int32_t split_pipeline_num = 6;
const int32_t split_target_num = 2;
char *nram_compute_a, char *nram_compute_b, char *nram_input,
char *nram_target, char *nram_weight, char *nram_output,
const int32_t has_weight, const int32_t partition_nc,
const int32_t pingpong_offset, const int32_t pingpong_weight_offset,
const int32_t c_offset_num, const int32_t n, const int32_t n_seg,
const int32_t c, const int32_t c_seg, const float alpha, const float gamma,
T *output) {
// with offset
input = (T *)((char *)input + c_offset_num * sizeof(T));
output = (T *)((char *)output + c_offset_num * sizeof(T));
const int32_t remain_size = MAX_NRAM_SIZE - scalar_size - weight_size;
const int32_t n_seg = remain_size / (split_pipeline_num * c_align_size +
split_target_num * sizeof(int32_t));
const int32_t deal_num = n_seg * c_align_size / sizeof(T);
const int32_t target_size = n_seg * sizeof(int32_t);
const int32_t c_seg_align_num = PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_num = has_weight ? c_seg_align_num : c_seg;
const int32_t deal_num = PAD_UP(n_seg * c_num, NFU_ALIGN_SIZE / sizeof(T));
const int32_t load_size = c_seg * sizeof(T);
const int32_t dram_stride = c * sizeof(T);
const int32_t nram_stride = c_num * sizeof(T);
// nram scalar,weight
char *nram_scalar = (char *)nram_buffer;
char *nram_weight = (char *)nram_scalar + scalar_size;
if (weight_size > 0) {
loadInput<T>(nram_weight, (T *)weight, C * sizeof(T));
__asm__ volatile("sync;");
if (has_weight && !partition_nc) {
loadInput<T>(nram_weight, (T *)weight, load_size, nram_stride, dram_stride,
1);
__asm__ volatile("sync;\n\t");
}
// nram COMPUTE
const int32_t compute_size = 2 * c_align_size * n_seg;
char *nram_compute_a = (char *)nram_weight + weight_size;
char *nram_compute_b = (char *)nram_compute_a + c_align_size * n_seg;
// nram PING/PONG
const int32_t pingpong_offset = (remain_size - compute_size) / 2;
char *nram_input = (char *)nram_compute_a + 2 * c_align_size * n_seg;
char *nram_output = (char *)nram_compute_a + 3 * c_align_size * n_seg;
char *nram_target = (char *)nram_compute_a + 4 * c_align_size * n_seg;
const int32_t repeat = row_num / n_seg;
const int32_t remain = row_num % n_seg;
const int32_t repeat = n / n_seg;
const int32_t remain = n % n_seg;
/*
* Pipeline: The pipeline is processed in three stages: Load, Compute, Store.
@ -206,80 +184,214 @@ __mlu_func__ void focalLossSigmoidForwardBlock(
// diagram of PINGPONG: L0
if (repeat > 0) {
loadInput<T>(nram_input, (T *)input, C * sizeof(T), c_align * sizeof(T),
C * sizeof(T), n_seg);
loadInput<int32_t>(nram_target, (int32_t *)target, target_size);
__asm__ volatile("sync;");
loadInput<T>(nram_input, (T *)input, load_size, nram_stride, dram_stride,
n_seg);
loadInput<int32_t>(nram_target, (int32_t *)target, n_seg * sizeof(int32_t));
loadWeight<T>(nram_weight, (T *)weight, *((int32_t *)target), c, has_weight,
partition_nc);
__asm__ volatile("sync;\n\t");
}
// diagram of PINGPONG: C0 and L1
if (repeat > 1) {
loadInput<T>(nram_input + pingpong_offset, (T *)input + C * n_seg,
C * sizeof(T), c_align * sizeof(T), C * sizeof(T), n_seg);
loadInput<int32_t>(nram_target + pingpong_offset, (int32_t *)target + n_seg,
target_size);
compute((T *)nram_input, (int32_t *)nram_target, (T *)nram_weight,
weight_size, deal_num, n_seg, C, alpha, gamma, (T *)nram_scalar,
(T *)nram_compute_a, (T *)nram_compute_b, (T *)nram_output);
__asm__ volatile("sync;");
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)nram_output);
loadInput<T>((char *)nram_input + pingpong_offset, (T *)input + c * n_seg,
load_size, nram_stride, dram_stride, n_seg);
loadInput<int32_t>((char *)nram_target + pingpong_offset,
(int32_t *)target + n_seg, n_seg * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + pingpong_weight_offset, (T *)weight,
*((int32_t *)target + n_seg), c, has_weight, partition_nc);
__asm__ volatile("sync;\n\t");
}
for (int32_t i = 0; i < repeat - 2; ++i) {
storeOutput<T>((T *)output + i * C * n_seg,
nram_output + (i % 2) * pingpong_offset, C * sizeof(T),
C * sizeof(T), c_align * sizeof(T), n_seg);
loadInput<T>(nram_input + (i % 2) * pingpong_offset,
(T *)input + (i + 2) * C * n_seg, C * sizeof(T),
c_align * sizeof(T), C * sizeof(T), n_seg);
loadInput<int32_t>(nram_target + (i % 2) * pingpong_offset,
(int32_t *)target + (i + 2) * n_seg, target_size);
storeOutput<T>((T *)output + i * c * n_seg,
nram_output + (i % 2) * pingpong_offset, load_size,
dram_stride, nram_stride, n_seg);
loadInput<T>((char *)nram_input + (i % 2) * pingpong_offset,
(T *)(input) + (i + 2) * c * n_seg, load_size, nram_stride,
dram_stride, n_seg);
loadInput<int32_t>((char *)nram_target + (i % 2) * pingpong_offset,
(int32_t *)target + (i + 2) * n_seg,
n_seg * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + (i % 2) * pingpong_weight_offset,
(T *)weight, *((int32_t *)target + (i + 2) * n_seg), c,
has_weight, partition_nc);
compute((T *)(nram_input + ((i + 1) % 2) * pingpong_offset),
(int32_t *)(nram_target + ((i + 1) % 2) * pingpong_offset),
(T *)nram_weight, weight_size, deal_num, n_seg, C, alpha, gamma,
(T *)nram_scalar, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_weight +
partition_nc * ((i + 1) % 2) * pingpong_weight_offset),
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + ((i + 1) % 2) * pingpong_offset));
__asm__ volatile("sync;");
__asm__ volatile("sync;\n\t");
}
if (repeat > 1) {
storeOutput<T>((T *)output + (repeat - 2) * C * n_seg,
nram_output + (repeat % 2) * pingpong_offset, C * sizeof(T),
C * sizeof(T), c_align * sizeof(T), n_seg);
storeOutput<T>((T *)output + (repeat - 2) * c * n_seg,
(char *)nram_output + (repeat % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, n_seg);
}
if (remain > 0) {
loadInput<T>(nram_input + (repeat % 2) * pingpong_offset,
(T *)input + repeat * C * n_seg, C * sizeof(T),
c_align * sizeof(T), C * sizeof(T), remain);
loadInput<int32_t>(nram_target + (repeat % 2) * pingpong_offset,
loadInput<T>((char *)nram_input + (repeat % 2) * pingpong_offset,
(T *)input + repeat * c * n_seg, load_size, nram_stride,
dram_stride, remain);
loadInput<int32_t>((char *)nram_target + (repeat % 2) * pingpong_offset,
(int32_t *)target + repeat * n_seg,
remain * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + (repeat % 2) * pingpong_weight_offset,
(T *)weight, *((int32_t *)target + repeat * n_seg), c,
has_weight, partition_nc);
}
if (repeat > 0) {
compute((T *)(nram_input + ((repeat - 1) % 2) * pingpong_offset),
(int32_t *)(nram_target + ((repeat - 1) % 2) * pingpong_offset),
(T *)nram_weight, weight_size, deal_num, n_seg, C, alpha, gamma,
(T *)nram_scalar, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_weight +
partition_nc * ((repeat - 1) % 2) * pingpong_weight_offset),
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + ((repeat - 1) % 2) * pingpong_offset));
}
__asm__ volatile("sync;");
__asm__ volatile("sync;\n\t");
if (repeat > 0) {
storeOutput<T>((T *)output + (repeat - 1) * C * n_seg,
nram_output + ((repeat - 1) % 2) * pingpong_offset,
C * sizeof(T), C * sizeof(T), c_align * sizeof(T), n_seg);
storeOutput<T>((T *)output + (repeat - 1) * c * n_seg,
(char *)nram_output + ((repeat - 1) % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, n_seg);
}
if (remain > 0) {
int rem_deal_num = remain * c_align_size / sizeof(T);
int32_t rem_num = PAD_UP(remain * c_num, NFU_ALIGN_SIZE / sizeof(T));
compute((T *)(nram_input + (repeat % 2) * pingpong_offset),
(int32_t *)(nram_target + (repeat % 2) * pingpong_offset),
(T *)nram_weight, weight_size, rem_deal_num, remain, C, alpha,
gamma, (T *)nram_scalar, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_weight +
partition_nc * (repeat % 2) * pingpong_weight_offset),
has_weight, partition_nc, rem_num, remain, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + (repeat % 2) * pingpong_offset));
__asm__ volatile("sync;");
__asm__ volatile("sync;\n\t");
storeOutput<T>((T *)output + repeat * C * n_seg,
nram_output + (repeat % 2) * pingpong_offset, C * sizeof(T),
C * sizeof(T), c_align * sizeof(T), remain);
storeOutput<T>((T *)output + repeat * c * n_seg,
(char *)nram_output + (repeat % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, remain);
}
__asm__ volatile("sync;\n\t");
}
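startPipeline is a three-stage software pipeline: the load of chunk i+1 into one ping-pong buffer overlaps with compute on chunk i in the other buffer and with the store of chunk i-1, with sync barriers between steps. A host-side sketch of the schedule in plain C++ (buffer indices only, no device calls), mirroring the warm-up, steady-state and drain structure above:

#include <cstdio>

// Prints the ping-pong schedule for `repeat` full chunks plus an optional
// remainder; a chunk's buffer is chunk % 2, as in the kernel indexing.
void print_schedule(int repeat, bool remain) {
  if (repeat > 0) std::printf("warm-up: load chunk 0 -> buf 0\n");
  if (repeat > 1) std::printf("warm-up: compute chunk 0 | load chunk 1 -> buf 1\n");
  for (int i = 0; i < repeat - 2; ++i)
    std::printf("steady : store chunk %d | load chunk %d -> buf %d | compute chunk %d\n",
                i, i + 2, (i + 2) % 2, i + 1);
  if (repeat > 1) std::printf("drain  : store chunk %d\n", repeat - 2);
  if (remain)     std::printf("drain  : load remainder -> buf %d\n", repeat % 2);
  if (repeat > 0) std::printf("drain  : compute chunk %d, then store it\n", repeat - 1);
  if (remain)     std::printf("drain  : compute remainder, then store it\n");
}

int main() { print_schedule(/*repeat=*/4, /*remain=*/true); }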
template <typename T>
__mlu_func__ void focalLossSigmoidForwardBlock(
const T *input, const int32_t *target, const T *weight, const int32_t n,
const int32_t c, const float alpha, const float gamma, T *output) {
/*
* NRAM partition
* |-----------------------------------------------------------------------|
* | weight |
* |------------------------------- COMPUTE -------------------------------|
* | | |
* | computeA | computeB |
* | | |
* |------------- PING ------------------------------- PONG ---------------|
* | | |
* | input | input |
* | | |
* |-----------------------------------|-----------------------------------|
* | | |
* | output | output |
* | | |
* |-----------------------------------|-----------------------------------|
* | target | target |
* |-----------------------------------|-----------------------------------|
*
* split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output),
* PONG(input,output).
* split_target_num is 2: PING(target), PONG(target).
* when weight is not NULL:
* the NRAM size of weight equals c_align_size when partitioning along N,
* and NFU_ALIGN_SIZE when partitioning along both N and C.
// calculate threshold of c
const int32_t split_pipeline_num = 6;
const int32_t split_target_num = 2;
const int32_t has_weight = weight != NULL;
const int32_t threshold_c =
PAD_DOWN((MAX_NRAM_SIZE - split_target_num * sizeof(int32_t)) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
sizeof(T);
const int32_t c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_align_size = c_align * sizeof(T);
if (c <= threshold_c) {
// partition input along N
int32_t c_num = c;
int32_t reservered_align_size =
(split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE;
int32_t weight_size = 0;
if (has_weight) {
c_num = c_align;
reservered_align_size = split_target_num * NFU_ALIGN_SIZE;
weight_size = c_align_size;
}
const int32_t remain_size =
MAX_NRAM_SIZE - weight_size - reservered_align_size;
const int32_t n_seg =
remain_size / (split_pipeline_num * c_num * sizeof(T) +
split_target_num * sizeof(int32_t));
const int32_t split_pipeline_size =
PAD_UP(c_num * n_seg * sizeof(T), NFU_ALIGN_SIZE);
const int32_t compute_size = 2 * split_pipeline_size;
const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2;
char *nram_weight = (char *)nram_buffer;
char *nram_compute_a = nram_weight + has_weight * c_align_size;
char *nram_compute_b = nram_compute_a + split_pipeline_size;
char *nram_input = nram_compute_b + split_pipeline_size;
char *nram_output = nram_input + split_pipeline_size;
char *nram_target = nram_output + split_pipeline_size;
startPipeline<T>(input, target, weight, nram_compute_a, nram_compute_b,
nram_input, nram_target, nram_weight, nram_output,
has_weight, 0, pingpong_offset, 0, 0, n, n_seg, c, c,
alpha, gamma, output);
} else {
// partition input along N and C
const int32_t weight_size = has_weight * NFU_ALIGN_SIZE;
const int32_t remain_size = MAX_NRAM_SIZE - weight_size;
const int32_t split_pipeline_size = PAD_DOWN(
(remain_size - split_target_num * NFU_ALIGN_SIZE) / split_pipeline_num,
NFU_ALIGN_SIZE);
const int32_t c_seg = split_pipeline_size / sizeof(T);
const int32_t n_seg = 1;
const int32_t compute_size = 2 * split_pipeline_size;
const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2;
const int32_t pingpong_weight_offset = weight_size / 2;
char *nram_weight = (char *)nram_buffer;
char *nram_compute_a = nram_weight + weight_size;
char *nram_compute_b = nram_compute_a + split_pipeline_size;
char *nram_input = nram_compute_b + split_pipeline_size;
char *nram_output = nram_input + split_pipeline_size;
char *nram_target = nram_output + split_pipeline_size;
const int32_t loop_num = (c + c_seg - 1) / c_seg;
const int32_t partition_nc = 1;
for (int32_t i = 0; i < loop_num; ++i) {
const int32_t c_index = i * c_seg;
const int32_t c_seg_curr = i == (loop_num - 1) ? c - c_index : c_seg;
startPipeline<T>(input, target, weight, nram_compute_a, nram_compute_b,
nram_input, nram_target, nram_weight, nram_output,
has_weight, partition_nc, pingpong_offset,
pingpong_weight_offset, c_index, n, n_seg, c, c_seg_curr,
alpha, gamma, output);
}
}
}
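The new entry point chooses between two NRAM partitions: it first derives threshold_c, the largest class count whose six pipeline buffers (plus an optional weight row) still fit in NRAM; below the threshold whole rows are batched along N, above it n_seg is fixed to 1 and each row is processed in channel segments, reloading only the single needed weight value per row. A host-side sketch of that decision in plain C++ (the NRAM and alignment constants are placeholders, not the real device values):

#include <cstdio>

constexpr int kNramBytes = 384 * 1024;  // placeholder for MAX_NRAM_SIZE
constexpr int kAlign = 128;             // placeholder for NFU_ALIGN_SIZE
constexpr int kPipelineBufs = 6;        // computeA/B + ping/pong input/output
constexpr int kTargetBufs = 2;          // ping/pong target

int pad_down(int x, int a) { return x / a * a; }

// Mirrors the threshold_c formula: the largest C for which one padded row per
// pipeline buffer (plus a weight row, if any) still fits in on-chip memory.
void choose_partition(int c, int elem_bytes, bool has_weight) {
  const int threshold_c =
      pad_down((kNramBytes - kTargetBufs * (int)sizeof(int)) /
                   (kPipelineBufs + (has_weight ? 1 : 0)),
               kAlign) /
      elem_bytes;
  if (c <= threshold_c)
    std::printf("C = %d: partition along N, whole rows per pass\n", c);
  else
    std::printf("C = %d: partition along N and C, one row in channel segments\n", c);
}

int main() {
  choose_partition(80, sizeof(float), true);      // typical detection head
  choose_partition(200000, sizeof(float), true);  // very large C
}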


@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (C) 2021 by Cambricon.
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
@ -15,6 +15,8 @@
#define COORD_DIM (4)
#define MEMORY_CORE (0x80)
#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
@ -551,7 +553,7 @@ __mlu_func__ void nms_detection(
} // for keepNum
}
__mlu_global__ void MLUKernelNMS(
__mlu_global__ void MLUUnion1KernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_stride,
const int max_output_size, const float iou_threshold,
@ -635,15 +637,525 @@ __mlu_global__ void MLUKernelNMS(
}
}
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection_ux(
int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram,
IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
const int input_layout, const int input_num_boxes, const int input_stride,
const int max_output_size, const float thresh_iou, const float thresh_score,
const float offset, const int output_mode, const int algo) {
loop_end_flag[0] = 0;
IN_DT *sram = (IN_DT *)sram_buffer;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
IN_DT *input_score_ptr;
const IN_DT *input_x1_ptr;
const IN_DT *input_y1_ptr;
const IN_DT *input_x2_ptr;
const IN_DT *input_y2_ptr;
input_score_ptr = score_data;
input_x1_ptr = boxes_data;
input_y1_ptr = input_x1_ptr + input_stride;
input_x2_ptr = input_y1_ptr + input_stride;
input_y2_ptr = input_x2_ptr + input_stride;
int limit = 0;        // max boxes per NRAM pass, whether the input is in GDRAM or SRAM
int max_seg_pad = 0;  // the max segment length per repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int nram_save_count = 0;
if (output_mode == 0) {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
// data split
int avg_cluster = input_num_boxes / clusterDim;
int rem_cluster = input_num_boxes % clusterDim;
int len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
int cluster_offset = avg_cluster * clusterId +
(clusterId <= rem_cluster ? clusterId : rem_cluster);
int avg_core = len_cluster / coreDim;
int rem_core = len_cluster % coreDim;
int len_core = avg_core + (coreId < rem_core ? 1 : 0);
int core_offset =
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
int input_offset = cluster_offset + core_offset;
max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
// core 0 of each cluster calculates the max score index
int max_index_avg_core = input_num_boxes / clusterDim;
int max_index_rem_core = input_num_boxes % clusterDim;
int max_index_len_core =
max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0);
int max_index_input_offset =
max_index_avg_core * clusterId +
(clusterId <= max_index_rem_core ? clusterId : max_index_rem_core);
repeat = max_index_len_core / max_seg_pad;
remain = max_index_len_core % max_seg_pad;
remain_pad = PAD_UP(remain, NMS_SIZE);
// if the datatype is fp16, convert to fp32 when computing IoU
int max_seg_iou_compute =
PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
int repeat_iou_compute = len_core / max_seg_iou_compute;
int remain_iou_compute = len_core % max_seg_iou_compute;
int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
// init the nram ptr
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *y1 = x1 + max_seg_pad;
IN_DT *x2 = y1 + max_seg_pad;
IN_DT *y2 = x2 + max_seg_pad;
IN_DT *inter_x1 = y2 + max_seg_pad;
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
mluMemcpyDirection_t input_load_dir = SRAM2NRAM;
mluMemcpyDirection_t input_store_dir = NRAM2SRAM;
input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
input_store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= 0
__sync_all();
/******FIND MAX START******/
int max_index = 0;
int global_max_index = 0; // for Ux
float max_area = 0; // area of the max-score box
max_box[0] = 0; // init 0
if (coreId == 0) {
for (int i = 0; i <= repeat; i++) {
if (i == repeat && remain == 0) {
break;
}
int seg_len = (i == repeat)
? remain_pad
: max_seg_pad; // the segment length for this compute pass
// check whether seg_len exceeds the fp16 limit; 65536 is the largest
// number that fp16 can express.
if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
return;
}
int cpy_len = (i == repeat)
? remain
: max_seg_pad; // the segment length for this memcpy
/******NMS LOAD START******/
__bang_write_zero(score, seg_len);
__memcpy(score,
input_score_ptr + max_index_input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
/******NMS LOAD END******/
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
if (sizeof(IN_DT) == sizeof(half)) {
max_index =
((uint16_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index =
((uint32_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
// the max box's x1, y1, x2, y2 on every cluster
max_box[1] = input_x1_ptr[max_index];
max_box[2] = input_y1_ptr[max_index];
max_box[3] = input_x2_ptr[max_index];
max_box[4] = input_y2_ptr[max_index];
((uint32_t *)(max_box + 5))[0] = max_index;
// copy max box info to sram
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_all();
// copy all partial max to the sram of cluster 0
if (clusterId != 0) {
__memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
SRAM2SRAM, 0);
}
__sync_all();
// reduce between clusters to get the global max box
if (clusterId == 0) {
if (coreId == 0) {
__bang_write_zero(inter_x1, NMS_SIZE);
__memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
__memcpy(max_box, sram + max_cluster * REDUCE_NUM,
REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_cluster();
if (coreId == 0x80 && clusterDim > 1) {
// broadcast global max box to each cluster's sram
for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
__memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
cluster_idx);
}
}
__sync_cluster();
}
__sync_all();
// copy the global max box to max_box
__memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
if (algo == 0 || offset == 0.0) {
max_area = ((float)max_box[3] - (float)max_box[1]) *
((float)max_box[4] - (float)max_box[2]);
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
global_max_index = ((uint32_t *)(max_box + 5))[0];
if (coreId != 0x80) {
input_score_ptr[global_max_index] = 0;
}
// by now, we get: max_score|max_index|max_box|max_area
/******FIND MAX END******/
/******NMS STORE START******/
// store to nram
if (float(max_box[0]) > thresh_score) {
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
4);
}
}
nram_save_count++;
output_box_num++;
}
// store to sram/gdram
if (output_box_num != 0) {
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
if (nram_save_count != 0) {
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * sizeof(uint32_t), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
__memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
NRAM2GDRAM, max_output_size * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_dram += nram_save_count;
}
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if dst
if (float(max_box[0]) <= thresh_score) {
if (clusterId == 0 && coreId == 0) {
loop_end_flag[0] = 1; // dram
}
}
__sync_all();
if (loop_end_flag[0] == 1) {
break;
}
/******NMS STORE END******/
// To preserve fp16 accuracy, convert fp16 to fp32 when calculating IoU.
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
: max_seg_iou_compute;
int cpy_len =
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
/******NMS LOAD START******/
__nramset((float *)score, seg_len, 0.0f);
int dt_offset = 0;
if (sizeof(IN_DT) == sizeof(float)) {
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__nramset(x1, seg_len, half(0));
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3);
/******NMS LOAD END******/
/******NMS COMPUTE START******/
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
seg_len);
}
// 1. compute IoU
// get the area_I
__nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
__bang_active_relu((float *)inter_x1, (float *)inter_x1,
seg_len); // inter_w
__nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
__bang_active_relu((float *)inter_y1, (float *)inter_y1,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2. select the box
// if IoU is greater than thresh, set the score to zero and discard the box:
// area_U > area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******NMS COMPUTE END******/
if (sizeof(IN_DT) == 2) {
__bang_float2half_rd((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), input_store_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
pvUnlock();
} // for repeat
} // for max_output_size
}
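For every surviving box the segment loop above computes the intersection area area_I with the current max box, the union area_U = area + max_area - area_I, and then zeroes the score of any box whose IoU exceeds the threshold; a division per element is avoided by comparing area_U against area_I * (1 / thresh_iou). A scalar reference of that test in plain C++ (Box and suppress are illustrative names; assumes thresh_iou > 0):

#include <algorithm>

struct Box { float x1, y1, x2, y2; };  // illustrative only

// True if `b` should be suppressed by `max_box` at IoU threshold `thresh_iou`.
// `offset` / `algo` follow the kernel: algo == 1 with a non-zero offset widens
// every width and height by `offset`.
bool suppress(const Box &b, const Box &max_box, float thresh_iou, float offset,
              int algo) {
  const float off = (algo == 1) ? offset : 0.f;
  const float iw = std::max(
      0.f, std::min(b.x2, max_box.x2) - std::max(b.x1, max_box.x1) + off);
  const float ih = std::max(
      0.f, std::min(b.y2, max_box.y2) - std::max(b.y1, max_box.y1) + off);
  const float area_i = iw * ih;
  const float area_b = (b.x2 - b.x1 + off) * (b.y2 - b.y1 + off);
  const float area_m =
      (max_box.x2 - max_box.x1 + off) * (max_box.y2 - max_box.y1 + off);
  const float area_u = area_b + area_m - area_i;
  // kernel form: keep iff area_U >= area_I * (1 / thresh_iou), i.e. IoU <= thresh_iou
  return area_u < area_i * (1.f / thresh_iou);
}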
__mlu_global__ void MLUUnionXKernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_layout, const int input_stride,
const int max_output_size, const float iou_threshold,
const float confidence_threshold, const float offset,
const cnrtDataType_t data_type_input, const int output_mode, const int algo,
void *workspace, void *result_num, void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
int32_t *loop_end_flag =
(int32_t *)((char *)workspace +
INFO_NUM * input_num_boxes * input_dwidth);
int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
int available_sram_size = SIZE_SRAM_BUF - reduce_sram_size;
int cluster_score_size = input_num_boxes * input_dwidth;
int cluster_boxes_size = input_num_boxes * 4 * input_dwidth;
char *sram_score = (char *)sram_buffer + reduce_sram_size;
char *sram_boxes =
(char *)sram_buffer + reduce_sram_size + cluster_score_size;
Addr input_ram = GDRAM;
if ((cluster_score_size + cluster_boxes_size) < available_sram_size) {
input_ram = SRAM;
__memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM);
__memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM);
} else {
__memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
}
__sync_cluster();
uint32_t output_box_num = 0;
if (output_mode == 0) {
uint32_t *output_dram = (uint32_t *)output;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *score_data;
half *boxes_data;
score_data =
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
boxes_data =
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
}
} else {
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *output_dram = (half *)output;
half *score_data;
half *boxes_data;
score_data =
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
boxes_data =
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *output_dram = (float *)output;
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
}
}
}
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes,
const int input_stride, const int max_output_boxes,
const float iou_threshold, const float offset,
void *workspace_ptr, void *output_size_ptr, void *output_ptr) {
MLUKernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, input_stride, max_output_boxes,
iou_threshold, /*confidence_threshold=*/0.0, /*output_mode=*/0,
/*input_layout=*/0, workspace_ptr, output_size_ptr, output_ptr,
data_type_input, offset, /*algo=*/1);
switch (k_type) {
default: { return; }
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1: {
MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, input_stride,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
/*output_mode=*/0,
/*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr,
data_type_input, offset, /*algo=*/1);
}; break;
case CNRT_FUNC_TYPE_UNION2:
case CNRT_FUNC_TYPE_UNION4:
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: {
MLUUnionXKernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1,
input_stride, max_output_boxes, iou_threshold,
/*confidence_threshold=*/0.0, offset, data_type_input,
/*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr,
output_ptr);
}; break;
}
}

File diff suppressed because it is too large.


@ -96,8 +96,10 @@ void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
#endif
#ifdef MMCV_WITH_MLU
} else if (input.device().type() == at::kMLU) {
CHECK_MLU(input);
CHECK_MLU(target);
CHECK_MLU_INPUT(input);
CHECK_MLU_INPUT(target);
CHECK_MLU_INPUT(weight);
CHECK_MLU_INPUT(output);
sigmoid_focal_loss_forward_mlu(input, target, weight, output, gamma, alpha);
#endif
} else {
@ -121,10 +123,10 @@ void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
#endif
#ifdef MMCV_WITH_MLU
} else if (input.device().type() == at::kMLU) {
CHECK_MLU(input);
CHECK_MLU(target);
CHECK_MLU(weight);
CHECK_MLU(grad_input);
CHECK_MLU_INPUT(input);
CHECK_MLU_INPUT(target);
CHECK_MLU_INPUT(weight);
CHECK_MLU_INPUT(grad_input);
sigmoid_focal_loss_backward_mlu(input, target, weight, grad_input, gamma,
alpha);


@ -37,24 +37,38 @@ static void policyFuncForward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
auto N = input.size(0);
auto C = input.size(1);
auto nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
auto c_align_size = PAD_UP((C * input.itemsize()), NFU_ALIGN_SIZE);
const size_t nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
const size_t c_align_size = PAD_UP((C * input.itemsize()), NFU_ALIGN_SIZE);
const int split_target_num = 2;
const int split_pipeline_num = 6;
auto scalar_size = NFU_ALIGN_SIZE;
auto weight_size = c_align_size;
const int has_weight = weight.data_ptr() != nullptr;
const int target_data_width = target.scalar_type() == at::kLong
? target.itemsize() / 2
: target.itemsize();
const int threshold_c =
PAD_DOWN((nram_size - split_target_num * sizeof(int)) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
input.itemsize();
// n_seg * c_align_size * split_pipeline_num +
// n_seg * target.itemsize() * split_target_num +
// weight_size + scalar_size <= nram_size
auto n_seg = (nram_size - weight_size - scalar_size) /
(c_align_size * split_pipeline_num +
target_data_width * split_target_num);
auto seg_num = (N + n_seg - 1) / n_seg;
int n_seg = 1;
if (C <= threshold_c) {
int c_size = C * input.itemsize();
int reservered_align_size =
(split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE;
int weight_size = 0;
if (has_weight) {
c_size = c_align_size;
reservered_align_size = split_target_num * NFU_ALIGN_SIZE;
weight_size = c_align_size;
}
// n_seg * c_size * split_pipeline_num + n_seg * target.itemsize() *
// split_target_num
// + weight_size + reservered_align_size <= nram_size
n_seg = (nram_size - weight_size - reservered_align_size) /
(split_pipeline_num * c_size + split_target_num * sizeof(int32_t));
}
auto seg_num = n_seg == 0 ? N : (N + n_seg - 1) / n_seg;
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
auto core_num = core_dim * cluster_num;
@ -103,31 +117,8 @@ void SigmoidFocalLossForwardMLUKernelLauncher(Tensor input, Tensor target,
CNLOG(INFO) << "weight is an empty tensor.";
}
// check C
auto nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
auto input_N = input.size(0);
auto input_C = input.size(1);
const int split_target_num = 2;
const int split_pipeline_num = 6;
const int has_weight = (int)(weight.data_ptr() != nullptr);
// target supports only INT on MLU device while it keeps LONG on host side,
// so target.itemsize() / 2
const int target_data_width = target.scalar_type() == at::kLong
? target.itemsize() / 2
: target.itemsize();
auto threshold_C = PAD_DOWN((nram_size - NFU_ALIGN_SIZE -
split_target_num * target_data_width) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
input.itemsize();
TORCH_CHECK(threshold_C >= input_C,
"input.size(1) should be in the range of [0, ", threshold_C,
"]. ", "But now input.size(1) is ", input_C, ".");
// return if zero-element
if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {
// return if zero-element
return;
}
@ -158,8 +149,8 @@ void SigmoidFocalLossForwardMLUKernelLauncher(Tensor input, Tensor target,
<< k_dim.z << ">>>";
// launch kernel
KernelFocalLossSigmoidForward(k_dim, k_type, queue, d_type, input_ptr,
target_ptr, weight_ptr, input_N, input_C, alpha,
gamma, output_ptr);
target_ptr, weight_ptr, input.size(0),
input.size(1), alpha, gamma, output_ptr);
}
void getDealNAndThresholdC(const int compute_data_bytes,


@ -19,6 +19,16 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const float iou_threshold, const float offset,
void *workspace_ptr, void *output_size_ptr, void *output_ptr);
int selectUnionType(uint32_t use_job, int box_num_per_core) {
// box_num_per_core should be at least 256; otherwise the effective IO
// bandwidth would be very low
while (box_num_per_core < 256 && use_job >= 4) {
box_num_per_core *= 2;
use_job /= 2;
}
return use_job;
}
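The launcher uses selectUnionType to shrink the launch until every core sees at least 256 boxes, so small inputs fall back from a multi-cluster UnionX launch to Union1 or even Block instead of starving each core of IO. A quick worked example in plain C++ under an assumed 4-cluster, 4-core-per-cluster device (the 16-core figure is an assumption, not a queried attribute):

#include <cstdint>
#include <cstdio>

// Same policy as selectUnionType above.
uint32_t select_union_type(uint32_t use_job, int box_num_per_core) {
  while (box_num_per_core < 256 && use_job >= 4) {
    box_num_per_core *= 2;
    use_job /= 2;
  }
  return use_job;
}

int main() {
  const int core_number = 16;  // assumed: 4 clusters * 4 cores
  const int tests[] = {500, 3000, 20000};
  for (int boxes : tests) {
    const int per_core = (boxes + core_number - 1) / core_number;
    // with the dispatch below: 500 boxes fall back to Block,
    // while 3000 and 20000 keep a multi-core launch (use_job 8 and 16)
    std::printf("%5d boxes -> use_job %u\n", boxes,
                select_union_type(core_number, per_core));
  }
}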
Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset) {
// dimension parameters check
@ -42,32 +52,57 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
}
int input_num_boxes = boxes.size(0);
int input_stride = boxes.size(1);
int input_stride = boxes.size(0);
int max_output_boxes = boxes.size(0);
cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
int core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t dim_x = core_dim;
cnrtDim3_t k_dim = {dim_x, 1, 1};
cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
cnrtDim3_t k_dim;
cnrtJobType_t k_type;
uint32_t union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t job_limit = union_number * core_dim;
uint32_t core_number = union_number * core_dim;
int box_num_per_core = (input_num_boxes + core_number - 1) / core_number;
// initialize k_type as Union1
k_dim.x = core_dim;
k_dim.y = 1;
k_dim.z = 1;
k_type = CNRT_FUNC_TYPE_UNION1;
int use_job = selectUnionType(job_limit, box_num_per_core);
if (use_job < 4) {
k_dim.x = 1;
k_type = CNRT_FUNC_TYPE_BLOCK;
} else if (use_job == 4) {
k_dim.x = core_dim;
k_type = CNRT_FUNC_TYPE_UNION1;
} else {
k_dim.x = use_job;
k_type = (cnrtFunctionType_t)use_job;
}
// transpose boxes (n, 4) to (4, n) for better performance
auto boxes_t = boxes.transpose(0, 1);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);
auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kLong));
auto output_size = at::empty({1}, scores.options().dtype(at::kInt));
// workspace
const int info_num = 5; // x1, x2, y1, y2 and score
size_t space_size = 0;
if (boxes.scalar_type() == at::kHalf) {
space_size = input_num_boxes * sizeof(int16_t);
space_size = input_num_boxes * sizeof(int16_t) * info_num + sizeof(float);
} else {
space_size = input_num_boxes * sizeof(float);
space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float);
}
auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
// get compute queue
auto queue = torch_mlu::getCurQueue();
auto boxes_impl = torch_mlu::getMluTensorImpl(boxes);
auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
auto boxes_ptr = boxes_impl->cnnlMalloc();
auto scores_impl = torch_mlu::getMluTensorImpl(scores);
auto scores_impl = torch_mlu::getMluTensorImpl(scores_);
auto scores_ptr = scores_impl->cnnlMalloc();
auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
auto workspace_ptr = workspace_impl->cnnlMalloc();
@ -76,20 +111,11 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
auto output_size_ptr = output_size_impl->cnnlMalloc();
switch (k_type) {
default: {
TORCH_CHECK(false, "[nms_mlu]:Failed to choose kernel to launch");
}
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1: {
CNLOG(INFO) << "Launch Kernel MLUUnion1 or Block NMS<<<Union"
<< k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y
<< ", " << k_dim.z << ">>>";
KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
input_num_boxes, input_stride, max_output_boxes, iou_threshold,
offset, workspace_ptr, output_size_ptr, output_ptr);
}; break;
}
CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<<Union" << k_type / core_dim
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
input_num_boxes, input_stride, max_output_boxes, iou_threshold,
offset, workspace_ptr, output_size_ptr, output_ptr);
int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
return output.slice(0, 0, output_num);
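Two launcher-side details worth noting: boxes are transposed to (4, n) so the kernel can read each coordinate column contiguously with stride input_num_boxes, and the workspace now reserves five columns of input_num_boxes elements (x1, y1, x2, y2, score) plus a trailing 32-bit loop-end flag that the UnionX kernel places right after them. A small sketch of that size in plain C++ (nms_workspace_bytes is an illustrative helper, not an existing API):

#include <cstddef>
#include <cstdint>

// Workspace bytes reserved by the launcher: info_num (5) columns of
// input_num_boxes elements plus one 32-bit loop-end flag after them.
size_t nms_workspace_bytes(int input_num_boxes, bool is_half) {
  const size_t elem = is_half ? sizeof(int16_t) : sizeof(float);
  const int info_num = 5;  // x1, x2, y1, y2 and score
  return static_cast<size_t>(input_num_boxes) * elem * info_num + sizeof(float);
}
// e.g. nms_workspace_bytes(1000, /*is_half=*/false) == 20004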


@ -122,11 +122,11 @@ void roi_align_forward(Tensor input, Tensor rois, Tensor output,
#endif
#ifdef MMCV_WITH_MLU
} else if (input.device().type() == at::kMLU) {
CHECK_MLU(input);
CHECK_MLU(rois);
CHECK_MLU(output);
CHECK_MLU(argmax_y);
CHECK_MLU(argmax_x);
CHECK_MLU_INPUT(input);
CHECK_MLU_INPUT(rois);
CHECK_MLU_INPUT(output);
CHECK_MLU_INPUT(argmax_y);
CHECK_MLU_INPUT(argmax_x);
roi_align_forward_mlu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
@ -164,11 +164,11 @@ void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
#endif
#ifdef MMCV_WITH_MLU
} else if (grad_output.device().type() == at::kMLU) {
CHECK_MLU(grad_output);
CHECK_MLU(rois);
CHECK_MLU(argmax_y);
CHECK_MLU(argmax_x);
CHECK_MLU(grad_input);
CHECK_MLU_INPUT(grad_output);
CHECK_MLU_INPUT(rois);
CHECK_MLU_INPUT(argmax_y);
CHECK_MLU_INPUT(argmax_x);
CHECK_MLU_INPUT(grad_input);
roi_align_backward_mlu(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,