mirror of https://github.com/open-mmlab/mmcv.git
[Refactor] Replace bbox_overlaps op of MLU backend with mlu-ops (#2880)
parent
eb1dad9ad8
commit
8cf8a5cdba
|
@ -1,322 +0,0 @@
|
|||
/*************************************************************************
|
||||
* Copyright (C) 2021 Cambricon.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*************************************************************************/
|
||||
#include <float.h>
|
||||
|
||||
#include "common_mlu_helper.hpp"
|
||||
|
||||
#define COORD_NUM 4
|
||||
|
||||
__nram__ char nmem_buf[MAX_NRAM_SIZE];
|
||||
|
||||
template <typename T>
|
||||
__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, void *nram_src1,
|
||||
void *nram_addition, const int32_t deal_num) {
|
||||
__bang_active_reciphp((T *)nram_dst, (T *)nram_src1, deal_num);
|
||||
__bang_mul((T *)nram_dst, (T *)nram_src0, (T *)nram_dst, deal_num);
|
||||
}
|
||||
|
||||
template <>
|
||||
__mlu_func__ void computeDiv<half>(void *nram_dst, void *nram_src0,
|
||||
void *nram_src1, void *nram_addition,
|
||||
const int32_t deal_num) {
|
||||
__bang_half2float((float *)nram_addition, (half *)nram_src1, deal_num);
|
||||
__bang_active_reciphp((float *)nram_addition, (float *)nram_addition,
|
||||
deal_num);
|
||||
__bang_float2half_rd((half *)nram_src1, (float *)nram_addition, deal_num);
|
||||
__bang_mul((half *)nram_dst, (half *)nram_src0, (half *)nram_src1, deal_num);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__mlu_func__ void bboxOverlapsWorkflow(
|
||||
T *vec_b1_x1, T *vec_b1_y1, T *vec_b1_x2, T *vec_b1_y2, T *vec_b2_x1,
|
||||
T *vec_b2_y1, T *vec_b2_x2, T *vec_b2_y2, T *vec_left, T *vec_right,
|
||||
T *vec_top, T *vec_bottom, const T *bbox1, const T *bbox2, void *ious,
|
||||
const int32_t offset, const int32_t mode, const int32_t batches_stride,
|
||||
const int32_t num_bbox1, const int32_t num_bbox2, const bool aligned) {
|
||||
int32_t task_batch_stride = (num_bbox1 + taskDim - 1) / taskDim;
|
||||
int32_t batch_start = taskId * task_batch_stride;
|
||||
int32_t batch_per_task = batch_start + task_batch_stride < num_bbox1
|
||||
? task_batch_stride
|
||||
: num_bbox1 - batch_start;
|
||||
batch_per_task = batch_per_task > 0 ? batch_per_task : (0);
|
||||
|
||||
if (aligned) {
|
||||
int32_t num_loop_cpy = batch_per_task / batches_stride;
|
||||
int32_t num_rem_cpy_batches = batch_per_task % batches_stride;
|
||||
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
|
||||
for (int32_t i = 0; i < num_loop_cpy; i++) {
|
||||
int32_t index = batch_start + i * batches_stride;
|
||||
int32_t handle_batches = index + batches_stride > num_bbox1
|
||||
? num_rem_cpy_batches
|
||||
: batches_stride;
|
||||
int32_t b1 = index;
|
||||
int32_t b2 = index;
|
||||
|
||||
int32_t base1 = b1 * COORD_NUM;
|
||||
__memcpy(vec_b1_x1, &bbox1[base1], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
__memcpy(vec_b1_y1, &bbox1[base1 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
__memcpy(vec_b1_x2, &bbox1[base1 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
__memcpy(vec_b1_y2, &bbox1[base1 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
|
||||
int32_t base2 = b2 * COORD_NUM;
|
||||
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
// get the width and height
|
||||
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
|
||||
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
|
||||
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
|
||||
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
|
||||
|
||||
// right - left + offset ---> left
|
||||
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
|
||||
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
|
||||
|
||||
// bottom - top + offset ---> right
|
||||
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
|
||||
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
|
||||
|
||||
// zero vector ---> bottom
|
||||
__bang_write_value(vec_bottom, batches_stride, 0.f);
|
||||
|
||||
// width --> vec_left
|
||||
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
|
||||
T *width = vec_left;
|
||||
// height --> vec_right
|
||||
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
|
||||
T *height = vec_right;
|
||||
|
||||
// get the b1_area
|
||||
// (b1_x2 - b1_x1 + offset) ---> vec_top
|
||||
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
|
||||
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
|
||||
|
||||
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
|
||||
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
|
||||
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
|
||||
|
||||
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
|
||||
// ---> vec_top;
|
||||
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
|
||||
T *b1_area = vec_top;
|
||||
|
||||
// get the b2_area
|
||||
// (b2_x2 - b2_x1 + offset) ---> b2_x1
|
||||
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
|
||||
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
|
||||
|
||||
// (b2_y2 - b2_y1 + offset) ---> b2_y1
|
||||
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
|
||||
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
|
||||
|
||||
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
|
||||
// ---> b2_x1;
|
||||
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
|
||||
T *b2_area = vec_b2_x1;
|
||||
|
||||
// inter_s = width * height
|
||||
__bang_mul(height, width, height, batches_stride);
|
||||
T *inter_s = height;
|
||||
|
||||
// offset vector ---> vec_b2_y1
|
||||
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
|
||||
T *vec_offset = vec_b2_y1;
|
||||
|
||||
if (mode == 0) {
|
||||
__bang_add(b1_area, b1_area, b2_area, batches_stride);
|
||||
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
|
||||
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
|
||||
} else {
|
||||
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
|
||||
}
|
||||
T *base_s = b1_area;
|
||||
|
||||
// ious = inter_s / base_s
|
||||
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
|
||||
__memcpy((T *)ious + index, width, handle_batches * sizeof(T),
|
||||
NRAM2GDRAM);
|
||||
}
|
||||
} else {
|
||||
int32_t num_loop_cpy = num_bbox2 / batches_stride;
|
||||
int32_t num_rem_cpy_batches = num_bbox2 % batches_stride;
|
||||
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
|
||||
for (int32_t i = 0; i < batch_per_task; i++) {
|
||||
int32_t index1 = batch_start + i;
|
||||
int32_t b1 = index1;
|
||||
int32_t base1 = b1 * COORD_NUM;
|
||||
|
||||
// set bbox1 and bbox2 to nram
|
||||
__bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]);
|
||||
__bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
|
||||
__bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
|
||||
__bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
|
||||
|
||||
for (int32_t j = 0; j < num_loop_cpy; j++) {
|
||||
int32_t index2 = j * batches_stride;
|
||||
int32_t handle_batches = index2 + batches_stride > num_bbox2
|
||||
? num_rem_cpy_batches
|
||||
: batches_stride;
|
||||
int32_t b2 = index2;
|
||||
int32_t base2 = b2 * COORD_NUM;
|
||||
|
||||
// copy bbox2 to nram
|
||||
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
|
||||
COORD_NUM * sizeof(T), handle_batches - 1);
|
||||
|
||||
// get the width and height
|
||||
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
|
||||
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
|
||||
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
|
||||
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
|
||||
|
||||
// right - left + offset ---> left
|
||||
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
|
||||
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
|
||||
// bottom - top + offset ---> right
|
||||
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
|
||||
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
|
||||
|
||||
// zero vector ---> bottom
|
||||
__bang_write_value(vec_bottom, batches_stride, (T)0);
|
||||
|
||||
// width --> vec_left
|
||||
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
|
||||
T *width = vec_left;
|
||||
// height --> vec_right
|
||||
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
|
||||
T *height = vec_right;
|
||||
|
||||
// get the b1_area
|
||||
// (b1_x2 - b1_x1 + offset) ---> vec_top
|
||||
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
|
||||
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
|
||||
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
|
||||
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
|
||||
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
|
||||
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
|
||||
// ---> vec_top;
|
||||
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
|
||||
T *b1_area = vec_top;
|
||||
|
||||
// get the b2_area
|
||||
// (b2_x2 - b2_x1 + offset) ---> b2_x1
|
||||
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
|
||||
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
|
||||
// (b2_y2 - b2_y1 + offset) ---> b2_y1
|
||||
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
|
||||
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
|
||||
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
|
||||
// ---> b2_x1;
|
||||
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
|
||||
T *b2_area = vec_b2_x1;
|
||||
|
||||
// inter_s = width * height
|
||||
__bang_mul(height, width, height, batches_stride);
|
||||
T *inter_s = height;
|
||||
|
||||
// offset vector ---> vec_b2_y1
|
||||
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
|
||||
T *vec_offset = vec_b2_y1;
|
||||
|
||||
if (mode == 0) {
|
||||
__bang_add(b1_area, b1_area, b2_area, batches_stride);
|
||||
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
|
||||
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
|
||||
} else {
|
||||
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
|
||||
}
|
||||
T *base_s = b1_area;
|
||||
|
||||
// ious = inter_s / base_s
|
||||
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
|
||||
int32_t gdram_offset = index1 * num_bbox2 + index2;
|
||||
__memcpy((T *)ious + gdram_offset, width, handle_batches * sizeof(T),
|
||||
NRAM2GDRAM);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__mlu_global__ void MLUUnion1KernelBBoxOverlaps(
|
||||
const void *bbox1, const void *bbox2, void *ious, const int32_t num_bbox1,
|
||||
const int32_t num_bbox2, const int32_t mode, const bool aligned,
|
||||
const int32_t offset) {
|
||||
/*
|
||||
* NRAM partition
|
||||
* |-------------------------------------------------------------|
|
||||
* | vec_b1_x1 | vec_b1_y1 | vec_b1_x2 | vec_b1_y2 |
|
||||
* |-------------------------------------------------------------|
|
||||
* | vec_b2_x1 | vec_b2_y1 | vec_b2_x2 | vec_b2_y2 |
|
||||
* |-------------------------------------------------------------|
|
||||
* | vec_left | vec_right | vec_top | vec_bottom |
|
||||
* |-------------------------------------------------------------|
|
||||
*
|
||||
*/
|
||||
const int32_t align_bytes = PAD_DOWN(MAX_NRAM_SIZE, NFU_ALIGN_SIZE);
|
||||
const int32_t split_nram_num = 12;
|
||||
const int32_t nram_stride =
|
||||
align_bytes / NFU_ALIGN_SIZE / split_nram_num * NFU_ALIGN_SIZE;
|
||||
|
||||
void *vec_b1_x1 = nmem_buf;
|
||||
void *vec_b1_y1 = nmem_buf + nram_stride;
|
||||
void *vec_b1_x2 = nmem_buf + 2 * nram_stride;
|
||||
void *vec_b1_y2 = nmem_buf + 3 * nram_stride;
|
||||
|
||||
void *vec_b2_x1 = nmem_buf + 4 * nram_stride;
|
||||
void *vec_b2_y1 = nmem_buf + 5 * nram_stride;
|
||||
void *vec_b2_x2 = nmem_buf + 6 * nram_stride;
|
||||
void *vec_b2_y2 = nmem_buf + 7 * nram_stride;
|
||||
|
||||
void *vec_left = nmem_buf + 8 * nram_stride;
|
||||
void *vec_right = nmem_buf + 9 * nram_stride;
|
||||
void *vec_top = nmem_buf + 10 * nram_stride;
|
||||
void *vec_bottom = nmem_buf + 11 * nram_stride;
|
||||
|
||||
const int32_t vec_length = nram_stride / sizeof(T);
|
||||
bboxOverlapsWorkflow((T *)vec_b1_x1, (T *)vec_b1_y1, (T *)vec_b1_x2,
|
||||
(T *)vec_b1_y2, (T *)vec_b2_x1, (T *)vec_b2_y1,
|
||||
(T *)vec_b2_x2, (T *)vec_b2_y2, (T *)vec_left,
|
||||
(T *)vec_right, (T *)vec_top, (T *)vec_bottom,
|
||||
(T *)bbox1, (T *)bbox2, (T *)ious, offset, mode,
|
||||
vec_length, num_bbox1, num_bbox2, aligned);
|
||||
}
|
||||
|
||||
void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
|
||||
cnrtQueue_t queue, const cnrtDataType_t d_type,
|
||||
const void *bbox1, const void *bbox2, void *ious,
|
||||
const int32_t num_bbox1, const int32_t num_bbox2,
|
||||
const int32_t mode, const bool aligned,
|
||||
const int32_t offset) {
|
||||
if (d_type == CNRT_FLOAT16) {
|
||||
MLUUnion1KernelBBoxOverlaps<half><<<k_dim, k_type, queue>>>(
|
||||
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
|
||||
} else {
|
||||
MLUUnion1KernelBBoxOverlaps<float><<<k_dim, k_type, queue>>>(
|
||||
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
|
||||
}
|
||||
}
|
|
@ -10,36 +10,11 @@
|
|||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*************************************************************************/
|
||||
|
||||
#include "pytorch_device_registry.hpp"
|
||||
#include "pytorch_mlu_helper.hpp"
|
||||
#include "mlu_common_helper.h"
|
||||
|
||||
void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
|
||||
cnrtQueue_t queue, const cnrtDataType_t d_type,
|
||||
const void *bbox1, const void *bbox2, void *ious,
|
||||
const int32_t num_bbox1, const int32_t num_bbox2,
|
||||
const int32_t mode, const bool aligned,
|
||||
const int32_t offset);
|
||||
|
||||
static void policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
|
||||
const int32_t batch_num_all) {
|
||||
auto union_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
|
||||
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
|
||||
auto core_num = union_num * core_dim;
|
||||
|
||||
// Union1 policyFunc
|
||||
*k_type = CNRT_FUNC_TYPE_UNION1;
|
||||
k_dim->x = core_dim;
|
||||
auto need_core_num = PAD_UP(batch_num_all, core_dim);
|
||||
k_dim->y =
|
||||
(need_core_num < core_num) ? (need_core_num / core_dim) : union_num;
|
||||
k_dim->z = 1;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void BBoxOverlapsMLUKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
|
||||
Tensor ious, const int32_t mode,
|
||||
const bool aligned, const int32_t offset) {
|
||||
void bbox_overlaps_mlu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
|
||||
const int32_t mode, const bool aligned,
|
||||
const int32_t offset) {
|
||||
// check dtype
|
||||
TORCH_CHECK(
|
||||
bboxes1.scalar_type() == at::kFloat || bboxes1.scalar_type() == at::kHalf,
|
||||
|
@ -63,38 +38,19 @@ void BBoxOverlapsMLUKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
|
|||
return;
|
||||
}
|
||||
|
||||
// calculate task dimension
|
||||
cnrtDim3_t k_dim;
|
||||
cnrtFunctionType_t k_type;
|
||||
policyFunc(&k_dim, &k_type, batch_num_all);
|
||||
INITIAL_MLU_PARAM_WITH_TENSOR(bboxes1);
|
||||
INITIAL_MLU_PARAM_WITH_TENSOR(bboxes2);
|
||||
INITIAL_MLU_PARAM_WITH_TENSOR(ious);
|
||||
|
||||
// get compute queue
|
||||
cnrtQueue_t queue = torch_mlu::getCurQueue();
|
||||
// get compute handle
|
||||
auto handle = mluOpGetCurrentHandle();
|
||||
|
||||
// get dtype of input
|
||||
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(bboxes1.dtype());
|
||||
|
||||
// get ptr of tensors
|
||||
auto bboxes1_impl = torch_mlu::getMluTensorImpl(bboxes1);
|
||||
auto bboxes1_ptr = bboxes1_impl->cnnlMalloc();
|
||||
auto bboxes2_impl = torch_mlu::getMluTensorImpl(bboxes2);
|
||||
auto bboxes2_ptr = bboxes2_impl->cnnlMalloc();
|
||||
auto ious_impl = torch_mlu::getMluTensorImpl(ious);
|
||||
auto ious_ptr = ious_impl->cnnlMalloc();
|
||||
|
||||
// launch kernel
|
||||
CNLOG(INFO) << "Launch Kernel MLUUnion1BboxOverlapsKernel";
|
||||
CNLOG(INFO) << "kDim :[ " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z
|
||||
<< " ]";
|
||||
KernelBBoxOverlaps(k_dim, k_type, queue, d_type, bboxes1_ptr, bboxes2_ptr,
|
||||
ious_ptr, rows, cols, mode, aligned, offset);
|
||||
}
|
||||
|
||||
void bbox_overlaps_mlu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
|
||||
const int mode, const bool aligned, const int offset) {
|
||||
BBoxOverlapsMLUKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
|
||||
TORCH_MLUOP_CHECK(mluOpBboxOverlaps(
|
||||
handle, mode, aligned, offset, bboxes1_desc.desc(), bboxes1_ptr,
|
||||
bboxes2_desc.desc(), bboxes2_ptr, ious_desc.desc(), ious_ptr));
|
||||
}
|
||||
|
||||
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
|
||||
const int mode, const bool aligned, const int offset);
|
||||
|
||||
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MLU, bbox_overlaps_mlu);
|
||||
|
|
Loading…
Reference in New Issue