mirror of https://github.com/open-mmlab/mmcv.git

[Feature] Support ThreeNN with cambricon MLU backend (#2215)

parent e663670a74 · commit a08517790d

Changed paths: mmcv/ops, csrc, common/mlu, pytorch/mlu, tests/test_ops
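For reviewers new to the op: `three_nn(unknown, known)` finds, for every query point, its three nearest neighbours among the reference points of the same batch, returning their Euclidean distances and indices. A brute-force PyTorch sketch of that contract (an illustration for this review, not the shipped kernel; tie-breaking between equidistant points may differ):

```python
import torch

def three_nn_reference(unknown: torch.Tensor, known: torch.Tensor):
    """Brute-force reference: unknown (B, N, 3), known (B, M, 3).

    Returns the Euclidean distances to, and the indices of, the three
    nearest `known` points for every `unknown` point.
    """
    dist = torch.cdist(unknown, known)  # (B, N, M) pairwise L2 distances
    top = torch.topk(dist, k=3, dim=-1, largest=False)  # 3 smallest per row
    return top.values, top.indices.int()
```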
@@ -53,7 +53,7 @@ We implement common ops used in detection, segmentation, etc.
 | Sparse Convolution     |   | √ |   |   |
 | Synchronized BatchNorm |   | √ |   |   |
 | ThreeInterpolate       |   | √ |   |   |
-| ThreeNN                |   | √ |   |   |
+| ThreeNN                |   | √ | √ |   |
 | TINShift               |   | √ | √ |   |
 | UpFirDn2d              |   | √ |   |   |
 | Voxelization           | √ | √ |   |   |
@@ -53,7 +53,7 @@ MMCV provides the operators commonly used in detection, segmentation, etc. (zh_cn docs)
 | Sparse Convolution     |   | √ |   |   |
 | Synchronized BatchNorm |   | √ |   |   |
 | ThreeInterpolate       |   | √ |   |   |
-| ThreeNN                |   | √ |   |   |
+| ThreeNN                |   | √ | √ |   |
 | TINShift               |   | √ | √ |   |
 | UpFirDn2d              |   | √ |   |   |
 | Voxelization           | √ | √ |   |   |
@@ -0,0 +1,466 @@
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "common_mlu_helper.hpp"

#include <algorithm>

__nram__ char nram_buffer[MAX_NRAM_SIZE];

#if __BANG_ARCH__ >= 322
/**
 * Returns the index produced by __bang_argmin: the minimum value sits at
 * ret[0] and its index is packed immediately after it.
 */
__mlu_func__ uint32_t getIndice(half *ret) {
  uint32_t indice = *((uint32_t *)((uint16_t *)ret + 1));
  return indice;
}

/**
 * Returns the index produced by __bang_argmin: the minimum value sits at
 * ret[0] and its index is packed immediately after it.
 */
__mlu_func__ uint32_t getIndice(float *ret) {
  uint32_t indice = ((uint32_t *)ret)[1];
  return indice;
}
#endif

template <typename T>
__mlu_func__ void auxArgmin(T *nram_dst, T *nram_src, const int num_deal,
                            T *value, int *index) {
  // Fallback argmin for arch < 322: take the minimum, then locate its first
  // occurrence via an equality mask.
  __bang_min(nram_dst, nram_src, num_deal);
  *value = nram_dst[0];
  __bang_write_value(nram_dst, num_deal, *value);
  __bang_eq(nram_dst, nram_src, nram_dst, num_deal);
  __bang_findfirst1((uint32_t *)nram_dst, nram_dst, num_deal);
  *index = *((int *)nram_dst);
}

template <typename T>
__mlu_func__ void auxFuncFind3Min(T *nram_aux_a, const int auxa_offset,
                                  int *nram_aux_b, const int auxb_offset,
                                  T *nram_dest, T *nram_aux_sort_a,
                                  int *nram_aux_sort_b, const int deal_offset) {
  __bang_write_value(nram_aux_sort_a, auxa_offset, (T)(INFINITY));
  __bang_write_value(nram_aux_sort_b, auxb_offset, (int)0);
  int index = 0;
  // Select the 3 smallest candidates; after each pick, overwrite the chosen
  // slot with INFINITY so the next iteration finds the next-smallest value.
  for (int i = 0; i < 3; i++) {
#if __BANG_ARCH__ >= 322
    __bang_argmin(nram_dest, nram_aux_a, auxa_offset);
    nram_aux_sort_a[i] = nram_dest[0];
    index = getIndice(nram_dest);
#else
    T value = 0;
    auxArgmin(nram_dest, nram_aux_a, auxa_offset, &value, &index);
    nram_aux_sort_a[i] = value;
#endif
    nram_aux_sort_b[i] = nram_aux_b[index];
    __memset_nram(nram_aux_a + index, 1, (T)(INFINITY));
  }
  __memcpy((char *)nram_aux_a, (char *)nram_aux_sort_a, auxa_offset * sizeof(T),
           NRAM2NRAM);
  __memcpy((char *)nram_aux_b, (char *)nram_aux_sort_b,
           auxb_offset * sizeof(int), NRAM2NRAM);
}

template <typename T>
__mlu_func__ void auxFuncSort(T *nram_aux_a, const int auxa_offset,
                              int *nram_aux_b, const int auxb_offset,
                              T *nram_dest, T *nram_help_value,
                              int *nram_help_idx, const int num_deal,
                              const int deal_offset) {
  for (int k = 0; k < num_deal; ++k) {
    auxFuncFind3Min(nram_aux_a + k * auxa_offset, auxa_offset,
                    nram_aux_b + k * auxb_offset, auxb_offset, nram_dest,
                    nram_help_value, nram_help_idx, deal_offset);
  }
}

template <typename T>
__mlu_func__ void auxFuncNN(
    size_t *output_aux_sort_a_gap, size_t *output_aux_sort_b_gap,
    size_t *output_aux_dest_gap, size_t *output_unknown_gap,
    size_t *output_known_gap, size_t *output_dist_gap, size_t *auxillary_a_gap,
    size_t *auxillary_b_gap, size_t *known_num_deal, size_t *unknown_num_deal,
    size_t *align_num, size_t *auxa_offset, size_t *auxb_offset) {
  /*
   * nram partition:
   *  |-NFU_ALIGN_SIZE-|-2*NFU_ALIGN_SIZE-|--X*3*sizeof(T)--|
   *  |   aux_sort_a   |    aux_sort_b    |   nram_unknown  |
   *
   *  |---------- Y * 7 * sizeof(T) ----------|
   *  | nram_known | nram_dist |  nram_dest   |
   *
   *  |--X * NFU_ALIGN_SIZE--|--X * 2 * NFU_ALIGN_SIZE--|
   *  |        aux_a         |          aux_b           |
   *
   * 200 series:
   *   X = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (2/3)
   *       / (3 * sizeof(T) + 3 * NFU_ALIGN_SIZE)
   *   Y = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (1/3) / (7 * sizeof(T))
   * 300 series:
   *   X = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (4/5)
   *       / (3 * sizeof(T) + 3 * NFU_ALIGN_SIZE)
   *   Y = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (1/5) / (7 * sizeof(T))
   */

  *align_num = NFU_ALIGN_SIZE / sizeof(T);
  *auxa_offset = NFU_ALIGN_SIZE / sizeof(T);
  *auxb_offset = 2 * NFU_ALIGN_SIZE / sizeof(int);
#if __BANG_ARCH__ >= 322
  *known_num_deal = PAD_DOWN(
      (MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 / (7 * sizeof(T)), *align_num);
  *unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 * 4 /
                                   (3 * sizeof(T) + 3 * NFU_ALIGN_SIZE),
                               *align_num);
#else
  *known_num_deal = PAD_DOWN(
      (MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 3 / (7 * sizeof(T)), *align_num);
  *unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 3 * 2 /
                                   (3 * sizeof(T) + 3 * NFU_ALIGN_SIZE),
                               *align_num);
#endif

  *output_aux_sort_a_gap = 0;
  *output_aux_sort_b_gap = *output_aux_sort_a_gap + NFU_ALIGN_SIZE;
  *output_aux_dest_gap = *output_aux_sort_b_gap + 2 * NFU_ALIGN_SIZE;

  *output_unknown_gap = *output_aux_dest_gap + *known_num_deal * sizeof(T);
  *output_known_gap = *output_unknown_gap + *unknown_num_deal * 3 * sizeof(T);
  *output_dist_gap = *output_known_gap + *known_num_deal * 3 * sizeof(T);
  *auxillary_a_gap = *output_dist_gap + *known_num_deal * 3 * sizeof(T);
  *auxillary_b_gap = *auxillary_a_gap + *unknown_num_deal * NFU_ALIGN_SIZE;
}
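
// Illustrative sizing for the partition above (assumptions, not measured
// values: MAX_NRAM_SIZE = 384 * 1024, NFU_ALIGN_SIZE = 128, T = float,
// 300-series branch):
//   align_num        = 128 / 4 = 32
//   known_num_deal   = PAD_DOWN((393216 - 384) / 5 / 28, 32)      = 2784
//   unknown_num_deal = PAD_DOWN((393216 - 384) / 5 * 4 / 396, 32) = 768
// i.e. each pass keeps 2784 known points and 768 unknown points resident
// in NRAM next to the auxiliary sort buffers.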

#if __BANG_ARCH__ >= 322
template <typename T>
__mlu_func__ bool containNanInf(T *nram_unknown) {
  return std::isnan(nram_unknown[0]) || std::isnan(nram_unknown[1]) ||
         std::isnan(nram_unknown[2]) || std::isinf(nram_unknown[0]) ||
         std::isinf(nram_unknown[1]) || std::isinf(nram_unknown[2]);
}
#endif

template <typename T>
__mlu_func__ void computeThreeNN(T *nram_unknown, T *nram_known, T *nram_dist,
                                 T *nram_dest, T *nram_aux_a,
                                 T *nram_aux_sort_a, int *nram_aux_b,
                                 int *nram_aux_sort_b, const int known_num_deal,
                                 const int known_seg_num, const int deal_offset,
                                 const int known_count,
                                 const int known_count_align) {
  __bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY));
#if __BANG_ARCH__ >= 322
  if (!containNanInf(nram_unknown)) {
#endif
    // x1 - x2
    __bang_sub_scalar(nram_dist, nram_known, nram_unknown[0],
                      known_count_align);
    // y1 - y2
    __bang_sub_scalar(nram_dist + known_count_align,
                      nram_known + known_count_align, nram_unknown[1],
                      known_count_align);
    // z1 - z2
    __bang_sub_scalar(nram_dist + 2 * known_count_align,
                      nram_known + 2 * known_count_align, nram_unknown[2],
                      known_count_align);
    // dist^2 = dx^2 + dy^2 + dz^2
    __bang_square(nram_dist, nram_dist, 3 * known_count_align);
    __bang_add(nram_dist, nram_dist, nram_dist + known_count_align,
               known_count_align);
    __bang_add(nram_dist, nram_dist, nram_dist + 2 * known_count_align,
               known_count_align);
#if __BANG_ARCH__ >= 322
  }
#endif

  // Pick the 3 nearest candidates of this known segment; indices are offset
  // by the segment base so they refer to the full known tensor.
  int index = 0;
  for (int i = 0; i < 3; i++) {
#if __BANG_ARCH__ >= 322
    __bang_argmin(nram_dest, nram_dist, known_count_align);
    nram_aux_a[i + deal_offset] = nram_dest[0];
    index = getIndice(nram_dest);
#else
    T value = 0;
    auxArgmin(nram_dest, nram_dist, known_count_align, &value, &index);
    nram_aux_a[i + deal_offset] = value;
#endif
    nram_aux_b[i + deal_offset] = index + known_seg_num * known_num_deal;
    __memset_nram(nram_dist + index, 1, (T)(INFINITY));
  }
}

template <typename T>
__mlu_func__ void loadTransposedKnownTensor(
    char *nram_known, char *nram_dist, const char *known_gdram,
    const int known_num_deal, const int batch_id, const int m,
    const int known_seg_num, const int count, const int count_align_num) {
  __bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
#if __BANG_ARCH__ >= 322
  __bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY));
  __memcpy(nram_dist,
           known_gdram +
               (batch_id * m * 3 + known_seg_num * known_num_deal) * sizeof(T),
           count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T),
           m * sizeof(T), 2);
  __bang_minequal((T *)nram_known, (T *)nram_known, (T *)nram_dist,
                  3 * count_align_num);
#else
  __memcpy(nram_known,
           known_gdram +
               (batch_id * m * 3 + known_seg_num * known_num_deal) * sizeof(T),
           count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T),
           m * sizeof(T), 2);
#endif
}

template <typename T>
__mlu_func__ void loadUnknownTensor(char *nram_unknown,
                                    const char *unknown_gdram,
                                    const int unknown_num_deal,
                                    const int unknown_seg_num, const int count,
                                    const int count_align_num) {
  __memcpy(nram_unknown,
           unknown_gdram + unknown_seg_num * unknown_num_deal * 3 * sizeof(T),
           count * 3 * sizeof(T), GDRAM2NRAM);
}

template <typename T>
__mlu_func__ void auxProcessSegment(
    const int m, const int n, T *nram_unknown, T *nram_known, T *nram_dist,
    T *nram_dest, T *known_gdram, T *nram_aux_a, const int auxa_offset,
    int *nram_aux_b, const int auxb_offset, T *nram_aux_sort_a,
    int *nram_aux_sort_b, const int unknown_num_deal, const int known_num_deal,
    const int known_seg_num, const int unknown_seg_num, const int unknown_count,
    const int known_count, const int known_count_align, const int start_idx,
    int *deal_offset) {
  int pre_batch_id = -1;
  int cur_batch_id = -1;
  pre_batch_id = start_idx / n;

  // If the aux_a space is exhausted, reduce it to the current first 3 minima
  // and reset the write offset.
  if (*deal_offset >= PAD_DOWN(auxa_offset, 3)) {
    auxFuncSort(nram_aux_a, auxa_offset, nram_aux_b, auxb_offset, nram_dest,
                nram_aux_sort_a, nram_aux_sort_b, unknown_count, *deal_offset);
    *deal_offset = 3;
  }

  // Load the i-th segment of the known batch data.
  loadTransposedKnownTensor<T>((char *)nram_known, (char *)nram_dist,
                               (char *)known_gdram, known_num_deal,
                               pre_batch_id, m, known_seg_num, known_count,
                               known_count_align);

  for (int k = 0; k < unknown_count; ++k) {
    cur_batch_id = (start_idx + k) / n;
    if (cur_batch_id != pre_batch_id) {
      // The batch id of the unknown data changed; load the corresponding
      // known batch data.
      pre_batch_id = cur_batch_id;
      loadTransposedKnownTensor<T>((char *)nram_known, (char *)nram_dist,
                                   (char *)known_gdram, known_num_deal,
                                   pre_batch_id, m, known_seg_num, known_count,
                                   known_count_align);
    }
    computeThreeNN(nram_unknown + 3 * k, nram_known, nram_dist, nram_dest,
                   nram_aux_a + k * auxa_offset, nram_aux_sort_a,
                   nram_aux_b + k * auxb_offset, nram_aux_sort_b,
                   known_num_deal, known_seg_num, *deal_offset, known_count,
                   known_count_align);
  }
}

template <typename T>
__mlu_global__ void MLUUnion1KernelThreeNN(const int b, const int n,
                                           const int m, char *unknown_gdram,
                                           char *known_gdram, char *dist2_gdram,
                                           int *idx_gdram) {
  // Skip the MPU core of each cluster; only the compute cores proceed.
  if (coreId == 0x80) {
    return;
  }

  size_t output_aux_sort_a_gap = 0, output_aux_sort_b_gap = 0,
         output_dest_gap = 0, output_unknown_gap = 0, output_known_gap = 0,
         output_dist_gap = 0, auxillary_a_gap = 0, auxillary_b_gap = 0,
         known_num_deal = 0, unknown_num_deal = 0, align_num = 0,
         auxa_offset = 0, auxb_offset = 0;
  auxFuncNN<T>(&output_aux_sort_a_gap, &output_aux_sort_b_gap, &output_dest_gap,
               &output_unknown_gap, &output_known_gap, &output_dist_gap,
               &auxillary_a_gap, &auxillary_b_gap, &known_num_deal,
               &unknown_num_deal, &align_num, &auxa_offset, &auxb_offset);

  int num_per_core = b * n / taskDim;
  const int core_offset = num_per_core;

  char *unknown_gdram_start =
      unknown_gdram + taskId * 3 * core_offset * sizeof(T);
  char *known_gdram_start = known_gdram;
  char *output_dist_start = dist2_gdram + taskId * 3 * core_offset * sizeof(T);
  int *output_idx_start = idx_gdram + taskId * 3 * core_offset;

  // The last task picks up the remainder of the b * n points.
  const int rem = (b * n) % taskDim;
  if (taskId == taskDim - 1) {
    num_per_core += rem;
  }

  // If the unknown points do not fit in NRAM, process them in unknown_repeat
  // full passes plus one remainder pass.
  const int unknown_repeat = num_per_core / unknown_num_deal;
  const int unknown_rem = num_per_core % unknown_num_deal;
  const int unknown_rem_align = PAD_UP(unknown_rem, align_num);

  // Likewise, the known points are processed in known_repeat full passes
  // plus one remainder pass.
  const int known_repeat = m / known_num_deal;
  const int known_rem = m % known_num_deal;
  const int known_rem_align = PAD_UP(known_rem, align_num);

  char *nram_aux_sort_a = nram_buffer;
  int *nram_aux_sort_b = (int *)(nram_buffer + output_aux_sort_b_gap);
  char *nram_dest = nram_buffer + output_dest_gap;
  char *nram_unknown = nram_buffer + output_unknown_gap;
  char *nram_known = nram_buffer + output_known_gap;
  char *nram_dist = nram_buffer + output_dist_gap;
  char *nram_aux_a = nram_buffer + auxillary_a_gap;
  int *nram_aux_b = (int *)(nram_buffer + auxillary_b_gap);
  int deal_offset = 0;
  int start_idx = -1;

  for (int j = 0; j < unknown_repeat; ++j) {
    // Process one full unknown segment. When the known data is segmented as
    // well, aux_a and aux_b accumulate per-segment candidates from which the
    // first 3 minimum distances are selected.
    __bang_write_value(nram_aux_a, unknown_num_deal * auxa_offset,
                       (T)(INFINITY));
    __bang_write_value(nram_aux_b, unknown_num_deal * auxb_offset, (int)0);
    loadUnknownTensor<T>(nram_unknown, unknown_gdram_start, unknown_num_deal, j,
                         unknown_num_deal, unknown_num_deal);

    deal_offset = 0;
    start_idx = taskId * core_offset + j * unknown_num_deal;

    // Process the known data segment by segment.
    for (int i = 0; i < known_repeat; ++i) {
      auxProcessSegment<T>(
          m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
          (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
          nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
          unknown_num_deal, known_num_deal, i, j, unknown_num_deal,
          known_num_deal, known_num_deal, start_idx, &deal_offset);
      deal_offset += 3;
    }

    if (known_rem > 0) {  // process the known remainder
      __bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
      auxProcessSegment<T>(
          m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
          (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
          nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
          unknown_num_deal, known_num_deal, known_repeat, j, unknown_num_deal,
          known_rem, known_rem_align, start_idx, &deal_offset);
    }

    deal_offset += 3;

    if (deal_offset > 3) {
      auxFuncSort((T *)nram_aux_a, auxa_offset, nram_aux_b, auxb_offset,
                  (T *)nram_dest, (T *)nram_aux_sort_a, nram_aux_sort_b,
                  unknown_num_deal, deal_offset);
      deal_offset = 0;
    }

    __memcpy((char *)output_dist_start + j * unknown_num_deal * 3 * sizeof(T),
             (char *)nram_aux_a, 3 * sizeof(T), NRAM2GDRAM, 3 * sizeof(T),
             auxa_offset * sizeof(T), unknown_num_deal - 1);
    __memcpy((char *)output_idx_start + j * unknown_num_deal * 3 * sizeof(int),
             (char *)nram_aux_b, 3 * sizeof(int), NRAM2GDRAM, 3 * sizeof(int),
             auxb_offset * sizeof(int), unknown_num_deal - 1);
  }

  if (unknown_rem > 0) {  // process the unknown remainder
    deal_offset = 0;
    __bang_write_value(nram_aux_a, unknown_num_deal * auxa_offset,
                       (T)(INFINITY));
    __bang_write_value(nram_aux_b, unknown_num_deal * auxb_offset, (int)0);
    loadUnknownTensor<T>(nram_unknown, unknown_gdram_start, unknown_num_deal,
                         unknown_repeat, unknown_rem, unknown_rem_align);
    start_idx = taskId * core_offset + unknown_repeat * unknown_num_deal;

    for (int i = 0; i < known_repeat; ++i) {
      auxProcessSegment<T>(
          m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
          (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
          nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
          unknown_num_deal, known_num_deal, i, unknown_repeat, unknown_rem,
          known_num_deal, known_num_deal, start_idx, &deal_offset);
      deal_offset += 3;
    }

    if (known_rem > 0) {
      __bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
      start_idx = taskId * core_offset + unknown_repeat * unknown_num_deal;

      auxProcessSegment<T>(
          m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
          (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
          nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
          unknown_num_deal, known_num_deal, known_repeat, unknown_repeat,
          unknown_rem, known_rem, known_rem_align, start_idx, &deal_offset);

      deal_offset += 3;
    }
    if (deal_offset > 3) {
      auxFuncSort((T *)nram_aux_a, auxa_offset, nram_aux_b, auxb_offset,
                  (T *)nram_dest, (T *)nram_aux_sort_a, nram_aux_sort_b,
                  unknown_rem, deal_offset);
      deal_offset = 0;
    }

    __memcpy((char *)output_dist_start +
                 unknown_repeat * unknown_num_deal * 3 * sizeof(T),
             (char *)nram_aux_a, 3 * sizeof(T), NRAM2GDRAM, 3 * sizeof(T),
             auxa_offset * sizeof(T), unknown_rem - 1);
    __memcpy((char *)output_idx_start +
                 unknown_repeat * unknown_num_deal * 3 * sizeof(int),
             (char *)nram_aux_b, 3 * sizeof(int), NRAM2GDRAM, 3 * sizeof(int),
             auxb_offset * sizeof(int), unknown_rem - 1);
  }
}

template __mlu_global__ void MLUUnion1KernelThreeNN<float>(
    const int b, const int n, const int m, char *unknown_gdram,
    char *known_gdram, char *dist2_gdram, int *idx_gdram);

template __mlu_global__ void MLUUnion1KernelThreeNN<half>(
    const int b, const int n, const int m, char *unknown_gdram,
    char *known_gdram, char *dist2_gdram, int *idx_gdram);

void KernelThreeNNForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                          cnrtQueue_t queue, cnrtDataType_t data_type,
                          const void *unknown, const void *known, void *dist2,
                          int *idx, const int b, const int n, const int m) {
  switch (data_type) {
    case CNRT_FLOAT16: {
      MLUUnion1KernelThreeNN<half><<<k_dim, k_type, queue>>>(
          b, n, m, (char *)unknown, (char *)known, (char *)dist2, idx);
    }; break;
    case CNRT_FLOAT32: {
      MLUUnion1KernelThreeNN<float><<<k_dim, k_type, queue>>>(
          b, n, m, (char *)unknown, (char *)known, (char *)dist2, idx);
    }; break;
    default: {
      break;
    }
  }
}
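
At its core the kernel is a two-level tiling loop: unknown points are consumed in chunks of unknown_num_deal and, inside each chunk, known points in chunks of known_num_deal, each followed by a remainder pass. A minimal Python sketch of that loop shape (illustrative only; the chunk sizes here are arbitrary):

```python
def tiled_passes(num_points: int, deal: int):
    """Yield (start, count) chunks: full passes plus one remainder pass."""
    repeat, rem = divmod(num_points, deal)
    for j in range(repeat):
        yield j * deal, deal        # full segment
    if rem > 0:
        yield repeat * deal, rem    # remainder segment

# e.g. 10 unknown points with deal = 4 -> [(0, 4), (4, 4), (8, 2)]
print(list(tiled_passes(10, 4)))
```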
@@ -0,0 +1,100 @@
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

void KernelThreeNNForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                          cnrtQueue_t queue, cnrtDataType_t data_type,
                          const void *unknown, const void *known, void *dist2,
                          int *idx, const int b, const int n, const int m);

void ThreeNNMLUKernelLauncher(int b, int n, int m, const Tensor unknown,
                              const Tensor known, Tensor dist2, Tensor idx) {
  // Check dtype.
  TORCH_CHECK(
      unknown.scalar_type() == at::kFloat || unknown.scalar_type() == at::kHalf,
      "unknown type should be Float or Half, got ", unknown.scalar_type(), ".");
  TORCH_CHECK(unknown.scalar_type() == known.scalar_type(),
              "known should have the same type as unknown.");
  TORCH_CHECK(unknown.scalar_type() == dist2.scalar_type(),
              "dist2 should have the same type as unknown.");
  TORCH_CHECK(idx.scalar_type() == at::kInt, "idx type should be Int.");

  // Check shape.
  TORCH_CHECK(unknown.dim() == 3, "unknown should be a 3d tensor, got ",
              unknown.dim(), "D.");
  TORCH_CHECK(known.dim() == 3, "known should be a 3d tensor, got ",
              known.dim(), "D.");
  TORCH_CHECK(unknown.size(0) == known.size(0),
              "known.dim0 should be equal to unknown.dim0, got ", known.size(0),
              ".");
  TORCH_CHECK(unknown.size(2) == 3, "unknown dim2 should be 3, got ",
              unknown.size(2), ".");
  TORCH_CHECK(known.size(2) == 3, "known dim2 should be 3, got ", known.size(2),
              ".");

  // Zero-element check.
  TORCH_CHECK(unknown.numel() > 0,
              "unknown.numel should be greater than zero, got ",
              unknown.numel(), ".");
  if (known.numel() == 0) {
    // Return if known has zero elements.
    return;
  }

  // Large tensor check.
  const size_t max_input_num = 2147483648;  // 2^31, 2G num
  TORCH_CHECK(unknown.numel() < max_input_num,
              "unknown.numel() should be less than 2147483648, got ",
              unknown.numel(), ".");
  TORCH_CHECK(known.numel() < max_input_num,
              "known.numel() should be less than 2147483648, got ",
              known.numel(), ".");

  // Get the compute queue.
  auto queue = torch_mlu::getCurQueue();

  // Get pointers of the tensors; known is transposed to (B, 3, M) so the
  // kernel can load each coordinate as a contiguous row.
  auto unknown_impl = torch_mlu::getMluTensorImpl(unknown);
  auto unknown_ptr = unknown_impl->cnnlMalloc();
  auto known_t = known.permute({0, 2, 1}).contiguous();
  auto known_impl = torch_mlu::getMluTensorImpl(known_t);
  auto known_ptr = known_impl->cnnlMalloc();
  auto dist2_impl = torch_mlu::getMluTensorImpl(dist2);
  auto dist2_ptr = dist2_impl->cnnlMalloc();
  auto idx_impl = torch_mlu::getMluTensorImpl(idx);
  auto idx_ptr = idx_impl->cnnlMalloc();

  cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
  cnrtDim3_t k_dim;
  k_dim.x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  k_dim.y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  k_dim.z = 1;
  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(unknown.dtype());

  // Launch the kernel.
  CNLOG(INFO) << "Launch Kernel MLUKernelThreeNNForward<<<" << k_dim.x << ", "
              << k_dim.y << ", " << k_dim.z << ">>>.";

  KernelThreeNNForward(k_dim, k_type, queue, data_type, unknown_ptr, known_ptr,
                       dist2_ptr, (int *)idx_ptr, b, n, m);
}

void three_nn_forward_mlu(int b, int n, int m, const Tensor unknown,
                          const Tensor known, Tensor dist2, Tensor idx) {
  ThreeNNMLUKernelLauncher(b, n, m, unknown, known, dist2, idx);
}

void three_nn_forward_impl(int b, int n, int m, const Tensor unknown,
                           const Tensor known, Tensor dist2, Tensor idx);

REGISTER_DEVICE_IMPL(three_nn_forward_impl, MLU, three_nn_forward_mlu);
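
REGISTER_DEVICE_IMPL wires three_nn_forward_impl to the MLU launcher, so the existing Python-level op takes the MLU path automatically whenever its inputs live on an MLU device. A usage sketch (assuming an MLU-enabled mmcv build with the torch_mlu runtime installed; the 'mlu' device string follows the test below):

```python
import torch
from mmcv.ops import three_nn

# Assumption: torch_mlu is installed and registers the 'mlu' device type.
unknown = torch.randn(2, 10, 3, device='mlu')  # query points (B, N, 3)
known = torch.randn(2, 5, 3, device='mlu')     # reference points (B, M, 3)
dist, idx = three_nn(unknown, known)           # both outputs are (B, N, 3)
print(dist.shape, idx.dtype)                   # torch.Size([2, 10, 3]) torch.int32
```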
@@ -34,8 +34,8 @@ class ThreeNN(Function):
 
         B, N, _ = target.size()
         m = source.size(1)
-        dist2 = torch.cuda.FloatTensor(B, N, 3)
-        idx = torch.cuda.IntTensor(B, N, 3)
+        dist2 = torch.FloatTensor(B, N, 3).to(target.device)
+        idx = torch.IntTensor(B, N, 3).to(target.device)
 
         ext_module.three_nn_forward(target, source, dist2, idx, b=B, n=N, m=m)
         if torch.__version__ != 'parrots':
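
The allocation change is what lets the same autograd Function serve CUDA and MLU: the output tensors are created generically and moved to the input's device instead of being hard-wired to torch.cuda. An equivalent, arguably more direct spelling (a sketch, not the committed code; alloc_outputs is a hypothetical helper):

```python
import torch

def alloc_outputs(target: torch.Tensor, B: int, N: int):
    # Allocate the op outputs on whatever device the input lives on
    # (CUDA, MLU, ...), rather than hard-coding torch.cuda.*Tensor.
    dist2 = torch.zeros(B, N, 3, dtype=torch.float32, device=target.device)
    idx = torch.zeros(B, N, 3, dtype=torch.int32, device=target.device)
    return dist2, idx
```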
@@ -3,70 +3,61 @@ import pytest
 import torch
 
 from mmcv.ops import three_nn
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
 
 
-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_three_nn():
-    known = torch.tensor([[[-1.8373, 3.5605,
-                            -0.7867], [0.7615, 2.9420, 0.2314],
-                           [-0.6503, 3.6637, -1.0622],
-                           [-1.8373, 3.5605, -0.7867],
-                           [-1.8373, 3.5605, -0.7867]],
-                          [[-1.3399, 1.9991, -0.3698],
-                           [-0.0799, 0.9698,
-                            -0.8457], [0.0858, 2.4721, -0.1928],
-                           [-1.3399, 1.9991, -0.3698],
-                           [-1.3399, 1.9991, -0.3698]]]).cuda()
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'mlu',
+        marks=pytest.mark.skipif(
+            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+])
+def test_three_nn(device):
+    known = torch.tensor(
+        [[[-1.8373, 3.5605, -0.7867], [0.7615, 2.9420, 0.2314],
+          [-0.6503, 3.6637, -1.0622], [-1.8373, 3.5605, -0.7867],
+          [-1.8373, 3.5605, -0.7867]],
+         [[-1.3399, 1.9991, -0.3698], [-0.0799, 0.9698, -0.8457],
+          [0.0858, 2.4721, -0.1928], [-1.3399, 1.9991, -0.3698],
+          [-1.3399, 1.9991, -0.3698]]],
+        device=device)
 
-    unknown = torch.tensor([[[-1.8373, 3.5605, -0.7867],
-                             [0.7615, 2.9420, 0.2314],
-                             [-0.6503, 3.6637, -1.0622],
-                             [-1.5237, 2.3976, -0.8097],
-                             [-0.0722, 3.4017, -0.2880],
-                             [0.5198, 3.0661, -0.4605],
-                             [-2.0185, 3.5019, -0.3236],
-                             [0.5098, 3.1020, 0.5799],
-                             [-1.6137, 3.8443, -0.5269],
-                             [0.7341, 2.9626, -0.3189]],
-                            [[-1.3399, 1.9991, -0.3698],
-                             [-0.0799, 0.9698, -0.8457],
-                             [0.0858, 2.4721, -0.1928],
-                             [-0.9022, 1.6560, -1.3090],
-                             [0.1156, 1.6901, -0.4366],
-                             [-0.6477, 2.3576, -0.1563],
-                             [-0.8482, 1.1466, -1.2704],
-                             [-0.8753, 2.0845, -0.3460],
-                             [-0.5621, 1.4233, -1.2858],
-                             [-0.5883, 1.3114, -1.2899]]]).cuda()
+    unknown = torch.tensor(
+        [[[-1.8373, 3.5605, -0.7867], [0.7615, 2.9420, 0.2314],
+          [-0.6503, 3.6637, -1.0622], [-1.5237, 2.3976, -0.8097],
+          [-0.0722, 3.4017, -0.2880], [0.5198, 3.0661, -0.4605],
+          [-2.0185, 3.5019, -0.3236], [0.5098, 3.1020, 0.5799],
+          [-1.6137, 3.8443, -0.5269], [0.7341, 2.9626, -0.3189]],
+         [[-1.3399, 1.9991, -0.3698], [-0.0799, 0.9698, -0.8457],
+          [0.0858, 2.4721, -0.1928], [-0.9022, 1.6560, -1.3090],
+          [0.1156, 1.6901, -0.4366], [-0.6477, 2.3576, -0.1563],
+          [-0.8482, 1.1466, -1.2704], [-0.8753, 2.0845, -0.3460],
+          [-0.5621, 1.4233, -1.2858], [-0.5883, 1.3114, -1.2899]]],
+        device=device)
 
     dist, idx = three_nn(unknown, known)
-    expected_dist = torch.tensor([[[0.0000, 0.0000, 0.0000],
-                                   [0.0000, 2.0463, 2.8588],
-                                   [0.0000, 1.2229, 1.2229],
-                                   [1.2047, 1.2047, 1.2047],
-                                   [1.0011, 1.0845, 1.8411],
-                                   [0.7433, 1.4451, 2.4304],
-                                   [0.5007, 0.5007, 0.5007],
-                                   [0.4587, 2.0875, 2.7544],
-                                   [0.4450, 0.4450, 0.4450],
-                                   [0.5514, 1.7206, 2.6811]],
-                                  [[0.0000, 0.0000, 0.0000],
-                                   [0.0000, 1.6464, 1.6952],
-                                   [0.0000, 1.5125, 1.5125],
-                                   [1.0915, 1.0915, 1.0915],
-                                   [0.8197, 0.8511, 1.4894],
-                                   [0.7433, 0.8082, 0.8082],
-                                   [0.8955, 1.3340, 1.3340],
-                                   [0.4730, 0.4730, 0.4730],
-                                   [0.7949, 1.3325, 1.3325],
-                                   [0.7566, 1.3727, 1.3727]]]).cuda()
-    expected_idx = torch.tensor([[[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4],
-                                  [2, 1, 0], [1, 2, 0], [0, 3, 4], [1, 2, 0],
-                                  [0, 3, 4], [1, 2, 0]],
-                                 [[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4],
-                                  [2, 1, 0], [2, 0, 3], [1, 0, 3], [0, 3, 4],
-                                  [1, 0, 3], [1, 0, 3]]]).cuda()
+    expected_dist = torch.tensor(
+        [[[0.0000, 0.0000, 0.0000], [0.0000, 2.0463, 2.8588],
+          [0.0000, 1.2229, 1.2229], [1.2047, 1.2047, 1.2047],
+          [1.0011, 1.0845, 1.8411], [0.7433, 1.4451, 2.4304],
+          [0.5007, 0.5007, 0.5007], [0.4587, 2.0875, 2.7544],
+          [0.4450, 0.4450, 0.4450], [0.5514, 1.7206, 2.6811]],
+         [[0.0000, 0.0000, 0.0000], [0.0000, 1.6464, 1.6952],
+          [0.0000, 1.5125, 1.5125], [1.0915, 1.0915, 1.0915],
+          [0.8197, 0.8511, 1.4894], [0.7433, 0.8082, 0.8082],
+          [0.8955, 1.3340, 1.3340], [0.4730, 0.4730, 0.4730],
+          [0.7949, 1.3325, 1.3325], [0.7566, 1.3727, 1.3727]]],
+        device=device)
+    expected_idx = torch.tensor(
+        [[[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4], [2, 1, 0], [1, 2, 0],
+          [0, 3, 4], [1, 2, 0], [0, 3, 4], [1, 2, 0]],
+         [[0, 3, 4], [1, 2, 0], [2, 0, 3], [0, 3, 4], [2, 1, 0], [2, 0, 3],
+          [1, 0, 3], [0, 3, 4], [1, 0, 3], [1, 0, 3]]],
+        device=device)
 
-    assert torch.allclose(dist, expected_dist, 1e-4)
+    assert torch.allclose(dist, expected_dist, atol=1e-4)
     assert torch.all(idx == expected_idx)
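
With the parametrization in place, the same expectations are checked on every available backend; skipped devices are reported as skips. A sketch for running it (the file path is an assumption; the scrape only shows the tests/test_ops directory):

```python
import sys

import pytest

# Assumed path for this diff's test module; adjust if the file differs.
sys.exit(pytest.main(['-v', 'tests/test_ops/test_three_nn.py']))
```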