From 48ea88ab9f3f8eb1e799d1d711da0181d95e574d Mon Sep 17 00:00:00 2001 From: ZShaopeng <108382403+ZShaopeng@users.noreply.github.com> Date: Tue, 10 Jan 2023 19:43:45 +0800 Subject: [PATCH] [Feature] Support Voxelization with cambricon MLU device (#2500) * [Feature] Support hard_voxelize with cambricon MLU backend * [Feature](bangc-ops): add voxelization op * [Feature](bangc-ops): add voxelization op * [Feature](bangc-ops): add voxelization op * [Feature](bangc-ops): add voxelization op * [Feature](bangc-ops): add voxelization op * [Feature](bangc-ops): add voxelization op * [Feature](bangc-ops): add voxelization op * [Feature](bangc-ops): add voxelization op --- docs/en/understand_mmcv/ops.md | 2 +- docs/zh_cn/understand_mmcv/ops.md | 2 +- .../common/mlu/voxelization_mlu_kernel.mlu | 532 ++++++++++++++++++ .../ops/csrc/pytorch/mlu/voxelization_mlu.cpp | 268 +++++++++ tests/test_ops/test_voxelization.py | 41 +- 5 files changed, 840 insertions(+), 5 deletions(-) create mode 100644 mmcv/ops/csrc/common/mlu/voxelization_mlu_kernel.mlu create mode 100644 mmcv/ops/csrc/pytorch/mlu/voxelization_mlu.cpp diff --git a/docs/en/understand_mmcv/ops.md b/docs/en/understand_mmcv/ops.md index 6967467c9..b1e3ea28f 100644 --- a/docs/en/understand_mmcv/ops.md +++ b/docs/en/understand_mmcv/ops.md @@ -58,5 +58,5 @@ We implement common ops used in detection, segmentation, etc. | ThreeNN | | √ | √ | | | | TINShift | | √ | √ | | | | UpFirDn2d | | √ | | | | -| Voxelization | √ | √ | | | | +| Voxelization | √ | √ | √ | | | | PrRoIPool | | √ | | | | diff --git a/docs/zh_cn/understand_mmcv/ops.md b/docs/zh_cn/understand_mmcv/ops.md index cbfb39d3d..650cd537a 100644 --- a/docs/zh_cn/understand_mmcv/ops.md +++ b/docs/zh_cn/understand_mmcv/ops.md @@ -58,5 +58,5 @@ MMCV 提供了检测、分割等任务中常用的算子 | ThreeNN | | √ | √ | | | | TINShift | | √ | √ | | | | UpFirDn2d | | √ | | | | -| Voxelization | √ | √ | | | | +| Voxelization | √ | √ | √ | | | | PrRoIPool | | √ | | | | diff --git a/mmcv/ops/csrc/common/mlu/voxelization_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/voxelization_mlu_kernel.mlu new file mode 100644 index 000000000..d7c57da4f --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/voxelization_mlu_kernel.mlu @@ -0,0 +1,532 @@ +/************************************************************************* + * Copyright (C) 2022 by Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ + +#include "common_mlu_helper.hpp" + +__nram__ char nram_buffer[MAX_NRAM_SIZE]; + +#if __BANG_ARCH__ >= 322 +__mlu_func__ void computeDynamicVoxelize( + char *points_x, char *points_y, char *points_z, char *auxiliary_a, + char *auxiliary_b, char *auxiliary_c, const float coors_x_min, + const float coors_y_min, const float coors_z_min, const float voxel_x, + const float voxel_y, const float voxel_z, const int32_t grid_x, + const int32_t grid_y, const int32_t grid_z, const int32_t deal_num) { + // x - coors_x_min + __bang_sub_scalar((float *)points_x, (float *)points_x, coors_x_min, + deal_num); + // y - coors_y_min + __bang_sub_scalar((float *)points_y, (float *)points_y, coors_y_min, + deal_num); + // z - coors_z_min + __bang_sub_scalar((float *)points_z, (float *)points_z, coors_z_min, + deal_num); + // (x - coors_x_min) / voxel_x + __bang_mul_scalar((float *)points_x, (float *)points_x, 1.0 / voxel_x, + deal_num); + // (y - coors_y_min) / voxel_y + __bang_mul_scalar((float *)points_y, (float *)points_y, 1.0 / voxel_y, + deal_num); + // (z - coors_z_min) / voxel_z + __bang_mul_scalar((float *)points_z, (float *)points_z, 1.0 / voxel_z, + deal_num); + // c_x = floor((x - coors_x_min) / voxel_x) + __bang_floor((float *)auxiliary_a, (float *)points_x, deal_num); + __bang_float2int32((int32_t *)points_x, (float *)auxiliary_a, deal_num, 0); + // c_y = floor((y - coors_y_min) / voxel_y) + __bang_floor((float *)auxiliary_a, (float *)points_y, deal_num); + __bang_float2int32((int32_t *)points_y, (float *)auxiliary_a, deal_num, 0); + // c_z = floor((z - coors_z_min) / voxel_z) + __bang_floor((float *)auxiliary_a, (float *)points_z, deal_num); + __bang_float2int32((int32_t *)points_z, (float *)auxiliary_a, deal_num, 0); + // c_x >= 0 + __bang_ge_scalar((int32_t *)auxiliary_b, (int32_t *)points_x, (int32_t)0, + deal_num); + // c_x < grid_x + __bang_lt_scalar((int32_t *)auxiliary_c, (int32_t *)points_x, grid_x, + deal_num); + // 0 <= c_x < grid_x + __bang_mul((int32_t *)auxiliary_a, (int32_t *)auxiliary_b, + (int32_t *)auxiliary_c, deal_num); + // c_y >= 0 + __bang_ge_scalar((int32_t *)auxiliary_b, (int32_t *)points_y, (int32_t)0, + deal_num); + // c_y < grid_y + __bang_lt_scalar((int32_t *)auxiliary_c, (int32_t *)points_y, grid_y, + deal_num); + // 0 <= c_y < grid_y + __bang_mul((int32_t *)auxiliary_b, (int32_t *)auxiliary_b, + (int32_t *)auxiliary_c, deal_num); + // c_x >= 0 && c_x < grid_x && c_y >= 0 && c_y < grid_y + __bang_mul((int32_t *)auxiliary_a, (int32_t *)auxiliary_a, + (int32_t *)auxiliary_b, deal_num); + // c_z >= 0 + __bang_ge_scalar((int32_t *)auxiliary_b, (int32_t *)points_z, (int32_t)0, + deal_num); + // c_z < grid_z + __bang_lt_scalar((int32_t *)auxiliary_c, (int32_t *)points_z, grid_z, + deal_num); + // 0 <= c_z < grid_z + __bang_mul((int32_t *)auxiliary_b, (int32_t *)auxiliary_b, + (int32_t *)auxiliary_c, deal_num); + // 0 <= c_x < grid_x && 0 <= c_y < grid_y && 0 <= c_z < grid_z + __bang_mul((int32_t *)auxiliary_a, (int32_t *)auxiliary_a, + (int32_t *)auxiliary_b, deal_num); + __bang_not((int32_t *)auxiliary_c, (int32_t *)auxiliary_a, deal_num); + + __bang_mul((int32_t *)points_x, (int32_t *)points_x, (int32_t *)auxiliary_a, + deal_num); + __bang_mul_scalar((int32_t *)auxiliary_b, (int32_t *)auxiliary_c, + (int32_t)(-1), deal_num); + __bang_add((int32_t *)points_x, (int32_t *)points_x, (int32_t *)auxiliary_b, + deal_num); + __bang_mul((int32_t *)points_y, (int32_t *)points_y, (int32_t *)auxiliary_a, + 
deal_num); + __bang_add((int32_t *)points_y, (int32_t *)points_y, (int32_t *)auxiliary_b, + deal_num); + __bang_mul((int32_t *)points_z, (int32_t *)points_z, (int32_t *)auxiliary_a, + deal_num); + __bang_add((int32_t *)points_z, (int32_t *)points_z, (int32_t *)auxiliary_b, + deal_num); +} + +__mlu_func__ void computePoint2Voxel(char *coors_x, char *coors_y, + char *coors_z, const int32_t c_x, + const int32_t c_y, const int32_t c_z, + const int32_t max_points, int32_t *num, + int32_t *first_point, + const int32_t deal_idx, + const int32_t deal_num) { + __bang_eq_scalar((int32_t *)coors_x, (int32_t *)coors_x, c_x, deal_num); + __bang_eq_scalar((int32_t *)coors_y, (int32_t *)coors_y, c_y, deal_num); + __bang_eq_scalar((int32_t *)coors_z, (int32_t *)coors_z, c_z, deal_num); + __bang_mul((int32_t *)coors_x, (int32_t *)coors_x, (int32_t *)coors_y, + deal_num); + __bang_mul((int32_t *)coors_x, (int32_t *)coors_x, (int32_t *)coors_z, + deal_num); + if (*num == 0) { + *num = (int32_t)__bang_count((float *)coors_x, deal_num); + if (*num > 0) { + *first_point = + (int32_t)__bang_findfirst1((float *)coors_x, deal_num) + deal_idx; + } + } else { + *num += (int32_t)__bang_count((float *)coors_x, deal_num); + } +} +#endif + +__mlu_global__ void MLUUnion1KernelDynamicVoxelize( + const float *points, int32_t *coors, const float voxel_x, + const float voxel_y, const float voxel_z, const float coors_x_min, + const float coors_y_min, const float coors_z_min, const float coors_x_max, + const float coors_y_max, const float coors_z_max, const int32_t grid_x, + const int32_t grid_y, const int32_t grid_z, const int32_t num_points, + const int32_t num_features) { +#if __BANG_ARCH__ >= 322 + if (coreId == 0x80) { + return; + } + + const int32_t points_rem = num_points % taskDim; + const int32_t points_per_core = + taskId < points_rem ? num_points / taskDim + 1 : num_points / taskDim; + const int32_t points_start = taskId < points_rem + ? 
taskId * points_per_core + : taskId * points_per_core + points_rem; + + const int32_t split_num = 9; + const int32_t deal_num = + PAD_DOWN(MAX_NRAM_SIZE / split_num / sizeof(float), NFU_ALIGN_SIZE); + const int32_t repeat = points_per_core / deal_num; + const int32_t rem = points_per_core % deal_num; + const int32_t ping_pong_gap = 3 * deal_num * sizeof(float); + + char *points_x = nram_buffer; + char *points_y = points_x + deal_num * sizeof(float); + char *points_z = points_y + deal_num * sizeof(float); + char *auxiliary_a = points_x + 2 * ping_pong_gap; + char *auxiliary_b = auxiliary_a + deal_num * sizeof(float); + char *auxiliary_c = auxiliary_b + deal_num * sizeof(float); + + int32_t *coors_z_start = coors + points_start; + int32_t *coors_y_start = coors + num_points + points_start; + int32_t *coors_x_start = coors + num_points * 2 + points_start; + + if (repeat > 0) { + __memcpy_async(points_x, points + points_start * num_features, + sizeof(float), GDRAM2NRAM, sizeof(float), + num_features * sizeof(float), deal_num - 1); + __memcpy_async(points_y, points + points_start * num_features + 1, + sizeof(float), GDRAM2NRAM, sizeof(float), + num_features * sizeof(float), deal_num - 1); + __memcpy_async(points_z, points + points_start * num_features + 2, + sizeof(float), GDRAM2NRAM, sizeof(float), + num_features * sizeof(float), deal_num - 1); + __asm__ volatile("sync;"); + } + if (repeat > 1) { + __memcpy_async(points_x + ping_pong_gap, + points + (points_start + deal_num) * num_features, + sizeof(float), GDRAM2NRAM, sizeof(float), + num_features * sizeof(float), deal_num - 1); + __memcpy_async(points_y + ping_pong_gap, + points + (points_start + deal_num) * num_features + 1, + sizeof(float), GDRAM2NRAM, sizeof(float), + num_features * sizeof(float), deal_num - 1); + __memcpy_async(points_z + ping_pong_gap, + points + (points_start + deal_num) * num_features + 2, + sizeof(float), GDRAM2NRAM, sizeof(float), + num_features * sizeof(float), deal_num - 1); + computeDynamicVoxelize(points_x, points_y, points_z, auxiliary_a, + auxiliary_b, auxiliary_c, coors_x_min, coors_y_min, + coors_z_min, voxel_x, voxel_y, voxel_z, grid_x, + grid_y, grid_z, deal_num); + __asm__ volatile("sync;"); + } + + for (int32_t i = 0; i < repeat - 2; ++i) { + __memcpy_async(coors_x_start + i * deal_num, + points_x + (i % 2) * ping_pong_gap, + deal_num * sizeof(int32_t), NRAM2GDRAM); + __memcpy_async(coors_y_start + i * deal_num, + points_y + (i % 2) * ping_pong_gap, + deal_num * sizeof(int32_t), NRAM2GDRAM); + __memcpy_async(coors_z_start + i * deal_num, + points_z + (i % 2) * ping_pong_gap, + deal_num * sizeof(int32_t), NRAM2GDRAM); + __memcpy_async(points_x + (i % 2) * ping_pong_gap, + points + (points_start + (i + 2) * deal_num) * num_features, + sizeof(float), GDRAM2NRAM, sizeof(float), + num_features * sizeof(float), deal_num - 1); + __memcpy_async( + points_y + (i % 2) * ping_pong_gap, + points + (points_start + (i + 2) * deal_num) * num_features + 1, + sizeof(float), GDRAM2NRAM, sizeof(float), num_features * sizeof(float), + deal_num - 1); + __memcpy_async( + points_z + (i % 2) * ping_pong_gap, + points + (points_start + (i + 2) * deal_num) * num_features + 2, + sizeof(float), GDRAM2NRAM, sizeof(float), num_features * sizeof(float), + deal_num - 1); + computeDynamicVoxelize(points_x + ((i + 1) % 2) * ping_pong_gap, + points_y + ((i + 1) % 2) * ping_pong_gap, + points_z + ((i + 1) % 2) * ping_pong_gap, + auxiliary_a, auxiliary_b, auxiliary_c, coors_x_min, + coors_y_min, coors_z_min, voxel_x, voxel_y, voxel_z, + 
grid_x, grid_y, grid_z, deal_num); + __asm__ volatile("sync;"); + } + + if (repeat >= 2) { + __memcpy_async(coors_x_start + (repeat - 2) * deal_num, + points_x + (repeat % 2) * ping_pong_gap, + deal_num * sizeof(int32_t), NRAM2GDRAM); + __memcpy_async(coors_y_start + (repeat - 2) * deal_num, + points_y + (repeat % 2) * ping_pong_gap, + deal_num * sizeof(int32_t), NRAM2GDRAM); + __memcpy_async(coors_z_start + (repeat - 2) * deal_num, + points_z + (repeat % 2) * ping_pong_gap, + deal_num * sizeof(int32_t), NRAM2GDRAM); + } + if (rem > 0) { + __memcpy_async(points_x + (repeat % 2) * ping_pong_gap, + points + (points_start + repeat * deal_num) * num_features, + sizeof(float), GDRAM2NRAM, sizeof(float), + num_features * sizeof(float), rem - 1); + __memcpy_async( + points_y + (repeat % 2) * ping_pong_gap, + points + (points_start + repeat * deal_num) * num_features + 1, + sizeof(float), GDRAM2NRAM, sizeof(float), num_features * sizeof(float), + rem - 1); + __memcpy_async( + points_z + (repeat % 2) * ping_pong_gap, + points + (points_start + repeat * deal_num) * num_features + 2, + sizeof(float), GDRAM2NRAM, sizeof(float), num_features * sizeof(float), + rem - 1); + } + if (repeat > 0) { + computeDynamicVoxelize(points_x + ((repeat - 1) % 2) * ping_pong_gap, + points_y + ((repeat - 1) % 2) * ping_pong_gap, + points_z + ((repeat - 1) % 2) * ping_pong_gap, + auxiliary_a, auxiliary_b, auxiliary_c, coors_x_min, + coors_y_min, coors_z_min, voxel_x, voxel_y, voxel_z, + grid_x, grid_y, grid_z, deal_num); + } + __asm__ volatile("sync;"); + + if (repeat > 0) { + __memcpy_async(coors_x_start + (repeat - 1) * deal_num, + points_x + ((repeat - 1) % 2) * ping_pong_gap, + deal_num * sizeof(int32_t), NRAM2GDRAM); + __memcpy_async(coors_y_start + (repeat - 1) * deal_num, + points_y + ((repeat - 1) % 2) * ping_pong_gap, + deal_num * sizeof(int32_t), NRAM2GDRAM); + __memcpy_async(coors_z_start + (repeat - 1) * deal_num, + points_z + ((repeat - 1) % 2) * ping_pong_gap, + deal_num * sizeof(int32_t), NRAM2GDRAM); + } + if (rem > 0) { + computeDynamicVoxelize(points_x + (repeat % 2) * ping_pong_gap, + points_y + (repeat % 2) * ping_pong_gap, + points_z + (repeat % 2) * ping_pong_gap, auxiliary_a, + auxiliary_b, auxiliary_c, coors_x_min, coors_y_min, + coors_z_min, voxel_x, voxel_y, voxel_z, grid_x, + grid_y, grid_z, rem); + __asm__ volatile("sync;"); + __memcpy_async(coors_x_start + repeat * deal_num, + points_x + (repeat % 2) * ping_pong_gap, + rem * sizeof(int32_t), NRAM2GDRAM); + __memcpy_async(coors_y_start + repeat * deal_num, + points_y + (repeat % 2) * ping_pong_gap, + rem * sizeof(int32_t), NRAM2GDRAM); + __memcpy_async(coors_z_start + repeat * deal_num, + points_z + (repeat % 2) * ping_pong_gap, + rem * sizeof(int32_t), NRAM2GDRAM); + } +#endif +} + +__mlu_global__ void MLUUnion1KernelPoint2Voxel(int32_t *coors, + int32_t *point_to_pointidx, + int32_t *point_to_voxelidx, + const int32_t num_points, + const int32_t max_points) { +#if __BANG_ARCH__ >= 322 + if (coreId == 0x80) { + return; + } + + const int32_t split_num = 6; + const int32_t deal_num = + PAD_DOWN(MAX_NRAM_SIZE / split_num / sizeof(int32_t), NFU_ALIGN_SIZE); + const int32_t ping_pong_gap = 3 * deal_num * sizeof(int32_t); + + char *coors_x = nram_buffer; + char *coors_y = coors_x + deal_num * sizeof(int32_t); + char *coors_z = coors_y + deal_num * sizeof(int32_t); + + int32_t *coors_z_start = coors; + int32_t *coors_y_start = coors + num_points; + int32_t *coors_x_start = coors + num_points * 2; + + for (int32_t point_idx = taskId; point_idx < 
num_points; + point_idx += taskDim) { + if (coors_x_start[point_idx] == -1) { + point_to_pointidx[point_idx] = -1; + point_to_voxelidx[point_idx] = -1; + continue; + } + + int32_t c_x = coors_x_start[point_idx]; + int32_t c_y = coors_y_start[point_idx]; + int32_t c_z = coors_z_start[point_idx]; + + int32_t deal_total_num = point_idx; + int32_t repeat = deal_total_num / deal_num; + int32_t rem = deal_total_num % deal_num; + int32_t num = 0; + int32_t first_point = -1; + + if (repeat > 0) { + __memcpy_async(coors_x, coors_x_start, deal_num * sizeof(int32_t), + GDRAM2NRAM); + __memcpy_async(coors_y, coors_y_start, deal_num * sizeof(int32_t), + GDRAM2NRAM); + __memcpy_async(coors_z, coors_z_start, deal_num * sizeof(int32_t), + GDRAM2NRAM); + __asm__ volatile("sync;"); + } + + for (int32_t i = 0; i < repeat - 1; ++i) { + __memcpy_async(coors_x + ((i + 1) % 2) * ping_pong_gap, + coors_x_start + (i + 1) * deal_num, + deal_num * sizeof(int32_t), GDRAM2NRAM); + __memcpy_async(coors_y + ((i + 1) % 2) * ping_pong_gap, + coors_y_start + (i + 1) * deal_num, + deal_num * sizeof(int32_t), GDRAM2NRAM); + __memcpy_async(coors_z + ((i + 1) % 2) * ping_pong_gap, + coors_z_start + (i + 1) * deal_num, + deal_num * sizeof(int32_t), GDRAM2NRAM); + computePoint2Voxel( + coors_x + (i % 2) * ping_pong_gap, coors_y + (i % 2) * ping_pong_gap, + coors_z + (i % 2) * ping_pong_gap, c_x, c_y, c_z, max_points, &num, + &first_point, i * deal_num, deal_num); + __asm__ volatile("sync;"); + } + + if (rem > 0) { + __memcpy_async(coors_x + (repeat % 2) * ping_pong_gap, + coors_x_start + repeat * deal_num, rem * sizeof(int32_t), + GDRAM2NRAM); + __memcpy_async(coors_y + (repeat % 2) * ping_pong_gap, + coors_y_start + repeat * deal_num, rem * sizeof(int32_t), + GDRAM2NRAM); + __memcpy_async(coors_z + (repeat % 2) * ping_pong_gap, + coors_z_start + repeat * deal_num, rem * sizeof(int32_t), + GDRAM2NRAM); + } + if (repeat > 0) { + computePoint2Voxel(coors_x + ((repeat - 1) % 2) * ping_pong_gap, + coors_y + ((repeat - 1) % 2) * ping_pong_gap, + coors_z + ((repeat - 1) % 2) * ping_pong_gap, c_x, c_y, + c_z, max_points, &num, &first_point, + (repeat - 1) * deal_num, deal_num); + } + __asm__ volatile("sync;"); + + if (rem > 0) { + computePoint2Voxel(coors_x + (repeat % 2) * ping_pong_gap, + coors_y + (repeat % 2) * ping_pong_gap, + coors_z + (repeat % 2) * ping_pong_gap, c_x, c_y, c_z, + max_points, &num, &first_point, repeat * deal_num, + rem); + __asm__ volatile("sync;"); + } + + if (num == 0) { + point_to_pointidx[point_idx] = point_idx; + } else if (num > 0) { + point_to_pointidx[point_idx] = first_point; + } + + if (num < max_points) { + point_to_voxelidx[point_idx] = num; + } else { + point_to_voxelidx[point_idx] = -1; + } + } +#endif +} + +__mlu_global__ void MLUUnion1KernelCalcPointsPerVoxel( + int32_t *point_to_pointidx, int32_t *point_to_voxelidx, + int32_t *coor_to_voxelidx, int32_t *num_points_per_voxel, + int32_t *voxel_num, const int32_t max_voxels, const int32_t num_points) { +#if __BANG_ARCH__ >= 322 + if (coreId == 0) { + int32_t voxel_num_temp = 0; + for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) { + int32_t point_pos_in_voxel = point_to_voxelidx[point_idx]; + coor_to_voxelidx[point_idx] = -1; + if (point_pos_in_voxel == -1) { + continue; + } else if (point_pos_in_voxel == 0) { + int32_t voxel_idx = voxel_num_temp; + if (voxel_num_temp >= max_voxels) { + continue; + } + voxel_num_temp += 1; + coor_to_voxelidx[point_idx] = voxel_idx; + num_points_per_voxel[voxel_idx] = 1; + } else { + int32_t 
point_idx_temp = point_to_pointidx[point_idx];
+        int32_t voxel_idx = coor_to_voxelidx[point_idx_temp];
+        if (voxel_idx != -1) {
+          coor_to_voxelidx[point_idx] = voxel_idx;
+          num_points_per_voxel[voxel_idx] += 1;
+        }
+      }
+    }
+    *voxel_num = voxel_num_temp;
+  }
+#endif
+}
+
+__mlu_global__ void MLUUnion1KernelAssignVoxelsCoors(
+    const float *points, int32_t *temp_coors, int32_t *point_to_voxelidx,
+    int32_t *coor_to_voxelidx, float *voxels, int32_t *coors,
+    const int32_t max_points, const int32_t num_points,
+    const int32_t num_features) {
+#if __BANG_ARCH__ >= 322
+  if (coreId == 0x80) {
+    return;
+  }
+
+  int32_t points_per_core = num_points / taskDim;
+  int32_t points_rem = num_points % taskDim;
+  int32_t points_start = taskId < points_rem
+                             ? taskId * (points_per_core + 1)
+                             : taskId * points_per_core + points_rem;
+  int32_t points_end = taskId < points_rem ? points_start + points_per_core + 1
+                                           : points_start + points_per_core;
+
+  for (int32_t point_idx = points_start; point_idx < points_end; ++point_idx) {
+    int32_t num = point_to_voxelidx[point_idx];
+    int32_t voxel_idx = coor_to_voxelidx[point_idx];
+    if (num > -1 && voxel_idx > -1) {
+      float *voxels_offset =
+          voxels + voxel_idx * max_points * num_features + num * num_features;
+      const float *points_offset = points + point_idx * num_features;
+      __memcpy_async(voxels_offset, points_offset,
+                     num_features * sizeof(float), GDRAM2GDRAM);
+
+      if (num == 0) {
+        int32_t *coors_offset = coors + voxel_idx * 3;
+        __memcpy_async(coors_offset, temp_coors + point_idx, sizeof(int32_t),
+                       GDRAM2GDRAM, sizeof(int32_t),
+                       num_points * sizeof(int32_t), 2);
+      }
+    }
+  }
+  __asm__ volatile("sync;");
+#endif
+}
+
+void KernelDynamicVoxelize(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
+                           cnrtQueue_t queue, const void *points, void *coors,
+                           const float voxel_x, const float voxel_y,
+                           const float voxel_z, const float coors_x_min,
+                           const float coors_y_min, const float coors_z_min,
+                           const float coors_x_max, const float coors_y_max,
+                           const float coors_z_max, const int32_t grid_x,
+                           const int32_t grid_y, const int32_t grid_z,
+                           const int32_t num_points,
+                           const int32_t num_features) {
+  MLUUnion1KernelDynamicVoxelize<<<k_dim, k_type, queue>>>(
+      (float *)points, (int32_t *)coors, voxel_x, voxel_y, voxel_z,
+      coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
+      coors_z_max, grid_x, grid_y, grid_z, num_points, num_features);
+}
+
+void KernelPoint2Voxel(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
+                       cnrtQueue_t queue, void *coors, void *point_to_pointidx,
+                       void *point_to_voxelidx, const int32_t num_points,
+                       const int32_t max_points) {
+  MLUUnion1KernelPoint2Voxel<<<k_dim, k_type, queue>>>(
+      (int32_t *)coors, (int32_t *)point_to_pointidx,
+      (int32_t *)point_to_voxelidx, num_points, max_points);
+}
+
+void KernelCalcPointsPerVoxel(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
+                              cnrtQueue_t queue, void *point_to_pointidx,
+                              void *point_to_voxelidx, void *coor_to_voxelidx,
+                              void *num_points_per_voxel, void *voxel_num,
+                              const int32_t max_voxels,
+                              const int32_t num_points) {
+  MLUUnion1KernelCalcPointsPerVoxel<<<k_dim, k_type, queue>>>(
+      (int32_t *)point_to_pointidx, (int32_t *)point_to_voxelidx,
+      (int32_t *)coor_to_voxelidx, (int32_t *)num_points_per_voxel,
+      (int32_t *)voxel_num, max_voxels, num_points);
+}
+
+void KernelAssignVoxelsCoors(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
+                             cnrtQueue_t queue, const void *points,
+                             void *temp_coors, void *point_to_voxelidx,
+                             void *coor_to_voxelidx, void *voxels, void *coors,
+                             const int32_t max_points,
+                             const int32_t num_points,
+                             const int32_t num_features) {
+  MLUUnion1KernelAssignVoxelsCoors<<<k_dim, k_type, queue>>>(
+      (float *)points, (int32_t *)temp_coors, (int32_t *)point_to_voxelidx,
+      (int32_t *)coor_to_voxelidx, (float *)voxels, (int32_t *)coors,
+      max_points, num_points, num_features);
+}
diff --git a/mmcv/ops/csrc/pytorch/mlu/voxelization_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/voxelization_mlu.cpp
new file mode 100644
index 000000000..c3d31bc0e
--- /dev/null
+++ b/mmcv/ops/csrc/pytorch/mlu/voxelization_mlu.cpp
@@ -0,0 +1,268 @@
+/*************************************************************************
+ * Copyright (C) 2022 by Cambricon.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "pytorch_device_registry.hpp"
+#include "pytorch_mlu_helper.hpp"
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+void KernelDynamicVoxelize(
+    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
+    const void *points, void *coors, const float voxel_x, const float voxel_y,
+    const float voxel_z, const float coors_x_min, const float coors_y_min,
+    const float coors_z_min, const float coors_x_max, const float coors_y_max,
+    const float coors_z_max, const int32_t grid_x, const int32_t grid_y,
+    const int32_t grid_z, const int32_t num_points,
+    const int32_t num_features);
+
+void KernelPoint2Voxel(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
+                       cnrtQueue_t queue, void *coors, void *point_to_pointidx,
+                       void *point_to_voxelidx, const int32_t num_points,
+                       const int32_t max_points);
+
+void KernelCalcPointsPerVoxel(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
+                              cnrtQueue_t queue, void *point_to_pointidx,
+                              void *point_to_voxelidx, void *coor_to_voxelidx,
+                              void *num_points_per_voxel, void *voxel_num,
+                              const int32_t max_voxels,
+                              const int32_t num_points);
+
+void KernelAssignVoxelsCoors(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
+                             cnrtQueue_t queue, const void *points,
+                             void *temp_coors, void *point_to_voxelidx,
+                             void *coor_to_voxelidx, void *voxels, void *coors,
+                             const int32_t max_points,
+                             const int32_t num_points,
+                             const int32_t num_features);
+
+// policy function
+static void policyFuncDefault(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
+                              const int num_points) {
+  k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
+  k_dim->y = MIN((num_points + k_dim->x - 1) / k_dim->x,
+                 torch_mlu::getDeviceAttr(cnrtAttrClusterCount));
+  k_dim->z = 1;
+  *k_type = CNRT_FUNC_TYPE_UNION1;
+}
+
+// policy function
+static void policyFuncCalcPointsPerVoxel(cnrtDim3_t *k_dim,
+                                         cnrtFunctionType_t *k_type,
+                                         const int num_points) {
+  k_dim->x = 1;
+  k_dim->y = 1;
+  k_dim->z = 1;
+  *k_type = CNRT_FUNC_TYPE_BLOCK;
+}
+
+int HardVoxelizeForwardMLUKernelLauncher(
+    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
+    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
+    const std::vector<float> coors_range, const int max_points,
+    const int max_voxels, const int NDim = 3) {
+  // check datatype
+  TORCH_CHECK(points.scalar_type() == at::kFloat,
+              "points type should be Float, got ", points.scalar_type(), ".");
+  TORCH_CHECK(voxels.scalar_type() == at::kFloat,
+              "voxels type should be Float, got ", voxels.scalar_type(), ".");
+  TORCH_CHECK(coors.scalar_type() == at::kInt,
+              "coors type should be Int, got ", coors.scalar_type(), ".");
+  TORCH_CHECK(num_points_per_voxel.scalar_type() == at::kInt,
+              "num_points_per_voxel type should be Int, got ",
+              num_points_per_voxel.scalar_type(), ".");
+
+  // check shape
+  TORCH_CHECK(points.dim() == 2, "points should be a 2d tensor, got ",
+              points.dim(), "D.");
+  TORCH_CHECK(voxels.dim() == 3, "voxels should be a 3d tensor, got ",
+              voxels.dim(), "D.");
+  TORCH_CHECK(coors.dim() == 2, "coors should be a 2d tensor, got ",
+              coors.dim(), "D.");
+  TORCH_CHECK(num_points_per_voxel.dim() == 1,
+              "num_points_per_voxel should be a 1d tensor, got ",
+              num_points_per_voxel.dim(), "D.");
+
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  TORCH_CHECK(points.size(0) == num_points,
+              "the 1st dimension of points should be num_points, got ",
+              points.size(0), ".");
+  TORCH_CHECK(points.size(1) == num_features,
+              "the 2nd dimension of points should be num_features, got ",
+              points.size(1), ".");
+  TORCH_CHECK(voxels.size(0) == max_voxels,
+              "the 1st dimension of voxels should be max_voxels, got ",
+              voxels.size(0), ".");
+  TORCH_CHECK(voxels.size(1) == max_points,
+              "the 2nd dimension of voxels should be max_points, got ",
+              voxels.size(1), ".");
+  TORCH_CHECK(voxels.size(2) == num_features,
+              "the 3rd dimension of voxels should be num_features, got ",
+              voxels.size(2), ".");
+  TORCH_CHECK(coors.size(0) == max_voxels,
+              "the 1st dimension of coors should be max_voxels, got ",
+              coors.size(0), ".");
+  TORCH_CHECK(coors.size(1) == 3,
+              "the 2nd dimension of coors should be 3, got ", coors.size(1),
+              ".");
+  TORCH_CHECK(num_points_per_voxel.size(0) == max_voxels,
+              "the 1st dimension of num_points_per_voxel should be "
+              "max_voxels, got ", num_points_per_voxel.size(0), ".");
+
+  // large tensor check
+  const size_t max_input_size = 2147483648;
+  TORCH_CHECK(points.numel() < max_input_size,
+              "points element num should be less than 2^31, got ",
+              points.numel(), ".");
+  TORCH_CHECK(voxels.numel() < max_input_size,
+              "voxels element num should be less than 2^31, got ",
+              voxels.numel(), ".");
+  TORCH_CHECK(coors.numel() < max_input_size,
+              "coors element num should be less than 2^31, got ",
+              coors.numel(), ".");
+
+  // check zero element
+  if (max_points == 0 || max_voxels == 0) {
+    return 0;
+  }
+
+  // get compute queue
+  auto queue = torch_mlu::getCurQueue();
+
+  // get ptr of tensors
+  auto points_ = points.contiguous();
+  auto points_impl = torch_mlu::getMluTensorImpl(points_);
+  auto points_ptr = points_impl->cnnlMalloc();
+  auto voxels_ = voxels.contiguous();
+  auto voxels_impl = torch_mlu::getMluTensorImpl(voxels_);
+  auto voxels_ptr = voxels_impl->cnnlMalloc();
+  auto coors_ = coors.contiguous();
+  auto coors_impl = torch_mlu::getMluTensorImpl(coors_);
+  auto coors_ptr = coors_impl->cnnlMalloc();
+  auto num_points_per_voxel_ = num_points_per_voxel.contiguous();
+  auto num_points_per_voxel_impl =
+      torch_mlu::getMluTensorImpl(num_points_per_voxel_);
+  auto num_points_per_voxel_ptr = num_points_per_voxel_impl->cnnlMalloc();
+
+  // calculate task dimension
+  cnrtDim3_t k_dim;
+  cnrtFunctionType_t k_type;
+  policyFuncDefault(&k_dim, &k_type, num_points);
+
+  // 1.
link point to corresponding voxel coors + const float voxel_x = voxel_size[0]; + const float voxel_y = voxel_size[1]; + const float voxel_z = voxel_size[2]; + const float coors_x_min = coors_range[0]; + const float coors_y_min = coors_range[1]; + const float coors_z_min = coors_range[2]; + const float coors_x_max = coors_range[3]; + const float coors_y_max = coors_range[4]; + const float coors_z_max = coors_range[5]; + + const int grid_x = round((coors_x_max - coors_x_min) / voxel_x); + const int grid_y = round((coors_y_max - coors_y_min) / voxel_y); + const int grid_z = round((coors_z_max - coors_z_min) / voxel_z); + + auto temp_coors = + at::zeros({NDim, num_points}, points.options().dtype(at::kInt)) + .contiguous(); + auto temp_coors_impl = torch_mlu::getMluTensorImpl(temp_coors); + auto temp_coors_ptr = temp_coors_impl->cnnlMalloc(); + + KernelDynamicVoxelize(k_dim, k_type, queue, points_ptr, temp_coors_ptr, + voxel_x, voxel_y, voxel_z, coors_x_min, coors_y_min, + coors_z_min, coors_x_max, coors_y_max, coors_z_max, + grid_x, grid_y, grid_z, num_points, num_features); + + // 2. map point to the idx of the corresponding voxel, find duplicate coor + auto point_to_pointidx = at::zeros( + { + num_points, + }, + points.options().dtype(at::kInt)) + .contiguous(); + auto point_to_pointidx_impl = torch_mlu::getMluTensorImpl(point_to_pointidx); + auto point_to_pointidx_ptr = point_to_pointidx_impl->cnnlMalloc(); + auto point_to_voxelidx = at::zeros( + { + num_points, + }, + points.options().dtype(at::kInt)) + .contiguous(); + auto point_to_voxelidx_impl = torch_mlu::getMluTensorImpl(point_to_voxelidx); + auto point_to_voxelidx_ptr = point_to_voxelidx_impl->cnnlMalloc(); + + KernelPoint2Voxel(k_dim, k_type, queue, temp_coors_ptr, point_to_pointidx_ptr, + point_to_voxelidx_ptr, num_points, max_points); + + // calculate task dimension + cnrtDim3_t k_dim_calc_points_per_voxel; + cnrtFunctionType_t k_type_calc_points_per_voxel; + policyFuncCalcPointsPerVoxel(&k_dim_calc_points_per_voxel, + &k_type_calc_points_per_voxel, num_points); + + // 3. determine voxel num and voxel's coor index + auto coor_to_voxelidx = at::zeros( + { + num_points, + }, + points.options().dtype(at::kInt)) + .contiguous(); + auto coor_to_voxelidx_impl = torch_mlu::getMluTensorImpl(coor_to_voxelidx); + auto coor_to_voxelidx_ptr = coor_to_voxelidx_impl->cnnlMalloc(); + auto voxel_num = at::zeros( + { + 1, + }, + points.options().dtype(at::kInt)) + .contiguous(); + auto voxel_num_impl = torch_mlu::getMluTensorImpl(voxel_num); + auto voxel_num_ptr = voxel_num_impl->cnnlMalloc(); + + KernelCalcPointsPerVoxel( + k_dim_calc_points_per_voxel, k_type_calc_points_per_voxel, queue, + point_to_pointidx_ptr, point_to_voxelidx_ptr, coor_to_voxelidx_ptr, + num_points_per_voxel_ptr, voxel_num_ptr, max_voxels, num_points); + + // 4. 
copy point features and coors of each voxel to voxels
+  KernelAssignVoxelsCoors(k_dim, k_type, queue, points_ptr, temp_coors_ptr,
+                          point_to_voxelidx_ptr, coor_to_voxelidx_ptr,
+                          voxels_ptr, coors_ptr, max_points, num_points,
+                          num_features);
+
+  auto voxel_num_cpu = voxel_num.to(at::kCPU);
+  int voxel_num_int = voxel_num_cpu.data_ptr<int>()[0];
+
+  return voxel_num_int;
+}
+
+int hard_voxelize_forward_mlu(const at::Tensor &points, at::Tensor &voxels,
+                              at::Tensor &coors,
+                              at::Tensor &num_points_per_voxel,
+                              const std::vector<float> voxel_size,
+                              const std::vector<float> coors_range,
+                              const int max_points, const int max_voxels,
+                              const int NDim) {
+  return HardVoxelizeForwardMLUKernelLauncher(
+      points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
+      max_points, max_voxels, NDim);
+};
+
+int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
+                               at::Tensor &coors,
+                               at::Tensor &num_points_per_voxel,
+                               const std::vector<float> voxel_size,
+                               const std::vector<float> coors_range,
+                               const int max_points, const int max_voxels,
+                               const int NDim);
+
+REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, MLU,
+                     hard_voxelize_forward_mlu);
diff --git a/tests/test_ops/test_voxelization.py b/tests/test_ops/test_voxelization.py
index d3555ac69..d34797caf 100644
--- a/tests/test_ops/test_voxelization.py
+++ b/tests/test_ops/test_voxelization.py
@@ -4,6 +4,7 @@ import pytest
 import torch
 
 from mmcv.ops import Voxelization
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
 
 
 def _get_voxel_points_indices(points, coors, voxel):
@@ -16,7 +17,7 @@ def _get_voxel_points_indices(points, coors, voxel):
     pytest.param(
         'cuda:0',
         marks=pytest.mark.skipif(
-            not torch.cuda.is_available(), reason='requires CUDA support'))
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support'))
 ])
 def test_voxelization(device_type):
     voxel_size = [0.5, 0.5, 0.5]
@@ -62,8 +63,7 @@ def test_voxelization(device_type):
         assert num_points_current_voxel == expected_num_points_per_voxel[i]
 
 
-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
+@pytest.mark.skipif(not IS_CUDA_AVAILABLE, reason='requires CUDA support')
 def test_voxelization_nondeterministic():
     voxel_size = [0.5, 0.5, 0.5]
     point_cloud_range = [0, -40, -3, 70.4, 40, 1]
@@ -137,3 +137,38 @@ def test_voxelization_nondeterministic():
     coors_all_set = {tuple(c) for c in coors_all}
 
     assert len(coors_set) == len(coors) == len(coors_all_set)
+
+
+@pytest.mark.parametrize('device_type', [
+    pytest.param(
+        'mlu',
+        marks=pytest.mark.skipif(
+            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+])
+def test_voxelization_mlu(device_type):
+    voxel_size = [0.5, 0.5, 0.5]
+    point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+    voxel_dict = np.load(
+        'tests/data/for_3d_ops/test_voxel.npy', allow_pickle=True).item()
+    expected_coors = voxel_dict['coors']
+    expected_voxels = voxel_dict['voxels']
+    expected_num_points_per_voxel = voxel_dict['num_points_per_voxel']
+    points = voxel_dict['points']
+
+    points = torch.tensor(points)
+    max_num_points = 1000
+    hard_voxelization = Voxelization(voxel_size, point_cloud_range,
+                                     max_num_points)
+
+    device = torch.device(device_type)
+
+    # test hard_voxelization on mlu
+    points = points.contiguous().to(device)
+    coors, voxels, num_points_per_voxel = hard_voxelization.forward(points)
+    coors = coors.cpu().detach().numpy()
+    voxels = voxels.cpu().detach().numpy()
+    num_points_per_voxel = num_points_per_voxel.cpu().detach().numpy()
+    assert np.all(coors == expected_coors)
+    assert np.all(voxels == expected_voxels)
+    assert np.all(num_points_per_voxel == expected_num_points_per_voxel)
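
Note (not part of the patch): a minimal NumPy sketch of the per-point step that MLUUnion1KernelDynamicVoxelize performs may help reviewers check the expected output layout. The helper name dynamic_voxelize_reference and the sample points below are illustrative assumptions, not code from this PR. Each point is mapped to integer voxel coordinates via floor((p - coors_min) / voxel_size), points outside the grid are marked with -1, and coordinates are emitted in (z, y, x) order, matching the expected_coors layout used by the test above.

import numpy as np


def dynamic_voxelize_reference(points, voxel_size, coors_range):
    """NumPy reference for the per-point voxel-coordinate computation."""
    voxel = np.asarray(voxel_size, dtype=np.float32)
    cmin = np.asarray(coors_range[:3], dtype=np.float32)
    cmax = np.asarray(coors_range[3:], dtype=np.float32)
    # grid size, as in the launcher: round((max - min) / voxel_size)
    grid = np.round((cmax - cmin) / voxel).astype(np.int32)
    # c = floor((p - coors_min) / voxel_size), computed on x, y, z only
    c = np.floor((points[:, :3] - cmin) / voxel).astype(np.int32)
    # points outside [0, grid) in any dimension are marked invalid (-1)
    valid = np.all((c >= 0) & (c < grid), axis=1)
    c[~valid] = -1
    # the kernel stores coordinates in (z, y, x) order
    return c[:, ::-1]


pts = np.array([[0.2, -39.5, -2.9, 0.0], [100.0, 0.0, 0.0, 0.0]],
               dtype=np.float32)
print(dynamic_voxelize_reference(pts, [0.5, 0.5, 0.5],
                                 [0, -40, -3, 70.4, 40, 1]))
# expected: [[ 0  1  0]
#            [-1 -1 -1]]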