[NPU] add npu ops and check (#3267)

* add npu ops and check

* lint fix

* lint fix 2

* delete and fix

* fix 3
Wang Yixuan 2025-04-01 22:20:31 +08:00 committed by GitHub
parent cf1c168fcb
commit 6d33b9f650
4 changed files with 60 additions and 66 deletions


@@ -21,3 +21,24 @@ void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
REGISTER_NPU_IMPL(assign_score_withk_forward_impl,
assign_score_withk_forward_npu);
void assign_score_withk_backward_npu(
int B, int N0, int N1, int M, int K, int O, int aggregate,
const Tensor& grad_out, const Tensor& points, const Tensor& centers,
const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
Tensor& grad_centers, Tensor& grad_scores) {
  // The aclnn kernel consumes grad_out with the channel dimension moved last.
  at::Tensor grad_out_trans = grad_out.permute({0, 2, 3, 1});
  EXEC_NPU_CMD(aclnnAssignScoreWithkGrad, grad_out_trans, points, centers,
               scores, knn_idx, B, N0, N1, M, K, O, aggregate, grad_scores,
               grad_points, grad_centers);
}
void assign_score_withk_backward_impl(
int B, int N0, int N1, int M, int K, int O, int aggregate,
const Tensor& grad_out, const Tensor& points, const Tensor& centers,
const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
Tensor& grad_centers, Tensor& grad_scores);
REGISTER_NPU_IMPL(assign_score_withk_backward_impl,
assign_score_withk_backward_npu);
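REGISTER_NPU_IMPL binds the device-agnostic *_impl symbol to the NPU kernel through MMCV's device-dispatch registry, so the same Python-level op routes here when its tensors live on an Ascend device. A minimal, self-contained sketch of that registration idea (the registry, Registrar, and stub names are illustrative, not MMCV's actual implementation):

#include <functional>
#include <iostream>
#include <map>
#include <string>

using Kernel = std::function<void()>;

// One global name -> kernel table, populated during static initialization.
std::map<std::string, Kernel>& registry() {
  static std::map<std::string, Kernel> table;
  return table;
}

struct Registrar {
  Registrar(const std::string& name, Kernel k) {
    registry()[name] = std::move(k);
  }
};

void assign_score_withk_backward_npu_stub() {
  std::cout << "dispatched to the NPU backward kernel\n";
}

// Roughly what a REGISTER_NPU_IMPL-style macro would expand to.
static Registrar reg("assign_score_withk_backward_impl",
                     assign_score_withk_backward_npu_stub);

int main() {
  // The device-agnostic entry point runs whichever backend registered itself.
  registry().at("assign_score_withk_backward_impl")();
}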


@@ -1,66 +0,0 @@
#include "pytorch_npu_helper.hpp"
using namespace NPU_NAME_SPACE;
using namespace std;
vector<vector<float>> pixel_group_npu(Tensor score, Tensor mask,
Tensor embedding, Tensor kernel_label,
Tensor kernel_contour,
int kernel_region_num,
float distance_threshold) {
TORCH_CHECK(score.dim() == 2,
"score.dim() must be 2, but got: ", score.dim());
TORCH_CHECK(mask.dim() == 2, "mask.dim() must be 2, but got: ", mask.dim());
TORCH_CHECK(embedding.dim() == 3,
"embedding.dim() must be 3, but got: ", embedding.dim());
TORCH_CHECK(kernel_label.dim() == 2,
"kernel_label.dim() must be 2, but got: ", kernel_label.dim());
TORCH_CHECK(
kernel_contour.dim() == 2,
"kernel_contour.dim() must be 2, but got: ", kernel_contour.dim());
auto label_size = kernel_label.sizes();
auto height = label_size[0];
auto width = label_size[1];
c10::SmallVector<int64_t, 8> point_vector_size = {kernel_region_num, 2};
c10::SmallVector<int64_t, 8> label_updated_size = {height, width};
at::Tensor point_vector = at::zeros(point_vector_size, score.options());
at::Tensor label_updated =
at::empty(label_updated_size, kernel_label.options());
EXEC_NPU_CMD(aclnnPixelGroup, score, mask, embedding, kernel_label,
kernel_contour, kernel_region_num, distance_threshold,
point_vector, label_updated);
std::vector<std::vector<float>> pixel_assignment(kernel_region_num);
at::Tensor point_vector_cpu = point_vector.to(at::kCPU);
at::Tensor label_updated_cpu = label_updated.to(at::kCPU);
for (int32_t l = 0; l < kernel_region_num; l++) {
pixel_assignment[l].push_back(point_vector_cpu[l][0].item<float>());
pixel_assignment[l].push_back(point_vector_cpu[l][1].item<float>());
if (pixel_assignment[l][1] > 0) {
pixel_assignment[l][0] /= pixel_assignment[l][1];
}
if (l > 0) {
at::Tensor valid_mask = (label_updated_cpu == l);
at::Tensor indices = at::nonzero(valid_mask);
for (int32_t i = 0; i < indices.size(0); i++) {
auto x = indices[i][0].item<int32_t>();
auto y = indices[i][1].item<int32_t>();
pixel_assignment[l].push_back(y);
pixel_assignment[l].push_back(x);
}
}
}
return pixel_assignment;
}
vector<vector<float>> pixel_group_impl(Tensor score, Tensor mask,
Tensor embedding, Tensor kernel_label,
Tensor kernel_contour,
int kernel_region_num,
float distance_threshold);
REGISTER_NPU_IMPL(pixel_group_impl, pixel_group_npu);
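For reference, the kernel deleted above packed its result so that entry l held the region's averaged confidence, its pixel count, and then interleaved (y, x) coordinates; region 0 carried no coordinates, matching the `if (l > 0)` branch. A hedged consumer sketch of that layout (print_regions is a hypothetical helper, and reading region 0 as background is an assumption drawn from the loop above):

#include <cstdio>
#include <vector>

// Decodes one pixel_group-style result:
// assignment[l] = {mean_confidence, pixel_count, y0, x0, y1, x1, ...}.
void print_regions(const std::vector<std::vector<float>>& assignment) {
  for (std::size_t l = 1; l < assignment.size(); ++l) {  // l == 0: background
    const std::vector<float>& region = assignment[l];
    if (region.size() < 2) continue;
    std::printf("region %zu: conf=%.3f, %d pixels\n", l, region[0],
                static_cast<int>(region[1]));
    for (std::size_t i = 2; i + 1 < region.size(); i += 2) {
      std::printf("  (y=%g, x=%g)\n", region[i], region[i + 1]);
    }
  }
}

int main() {
  print_regions({{0.f, 0.f}, {0.9f, 2.f, 3.f, 4.f, 3.f, 5.f}});
}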


@@ -8,6 +8,8 @@ void roi_align_forward_npu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
TORCH_CHECK(input.scalar_type() == at::kFloat,
"input should be a float tensor");
int64_t roi_end_mode = 2;
if (!aligned) {
LOG(WARNING) << "The [aligned] attr in roi_align op is false";
@@ -34,6 +36,8 @@ void roi_align_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax_y,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
  TORCH_CHECK(grad_output.scalar_type() == at::kFloat,
              "grad_output should be a float tensor");
int64_t aligned_height_64 = aligned_height;
int64_t aligned_width_64 = aligned_width;
int64_t sampling_ratio_64 = sampling_ratio;
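The added checks only guard the dtype; the [aligned] flag mentioned in the warning above controls the usual RoIAlign half-pixel convention. A small sketch of that coordinate math under standard RoIAlign semantics (values illustrative; this is not the NPU kernel itself):

#include <cstdio>
#include <initializer_list>

int main() {
  float roi_x1 = 10.0f;         // RoI left edge in input coordinates
  float spatial_scale = 0.25f;  // feature-map stride of 4
  for (bool aligned : {true, false}) {
    // aligned=true shifts by half a pixel so sample points sit on bin
    // centers; aligned=false keeps the legacy behavior the warning refers to.
    float offset = aligned ? 0.5f : 0.0f;
    float start_x = roi_x1 * spatial_scale - offset;
    std::printf("aligned=%d -> start_x=%.2f\n", aligned, start_x);
  }
}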


@@ -11,6 +11,11 @@ int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels,
const int max_points, const int max_voxels,
const int NDim = 3);
void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3);
int hard_voxelize_forward_npu(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors,
at::Tensor &num_points_per_voxel,
@@ -53,4 +58,34 @@ int hard_voxelize_forward_npu(const at::Tensor &points, at::Tensor &voxels,
return voxel_num_int;
}
void dynamic_voxelize_forward_npu(const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim = 3) {
uint32_t ptsNum = points.size(0);
uint32_t ptsFeature = points.size(1);
at::Tensor ptsTrans = at::transpose(points, 0, 1);
double coors_min_x = coors_range[0];
double coors_min_y = coors_range[1];
double coors_min_z = coors_range[2];
double coors_max_x = coors_range[3];
double coors_max_y = coors_range[4];
double coors_max_z = coors_range[5];
double voxel_x = voxel_size[0];
double voxel_y = voxel_size[1];
double voxel_z = voxel_size[2];
int grid_x = std::round((coors_max_x - coors_min_x) / voxel_x);
int grid_y = std::round((coors_max_y - coors_min_y) / voxel_y);
int grid_z = std::round((coors_max_z - coors_min_z) / voxel_z);
  at::Tensor tmp_coors =
      at::zeros({3, ptsNum}, points.options().dtype(at::kInt));
  EXEC_NPU_CMD(aclnnDynamicVoxelization, ptsTrans, coors_min_x, coors_min_y,
               coors_min_z, voxel_x, voxel_y, voxel_z, grid_x, grid_y, grid_z,
               tmp_coors);
  // The kernel writes per-axis voxel coordinates as a 3 x N tensor; transpose
  // back to the N x 3 layout expected by the caller before copying out.
  tmp_coors.transpose_(0, 1);
  coors.copy_(tmp_coors);
}
REGISTER_NPU_IMPL(hard_voxelize_forward_impl, hard_voxelize_forward_npu);
REGISTER_NPU_IMPL(dynamic_voxelize_forward_impl, dynamic_voxelize_forward_npu);
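A hedged worked example of the grid computation above, using a common KITTI-style voxel configuration (the numbers are illustrative, not mandated by the op):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> voxel_size = {0.05f, 0.05f, 0.1f};
  std::vector<float> coors_range = {0.0f, -40.0f, -3.0f, 70.4f, 40.0f, 1.0f};
  // Same rounding as the grid computation in dynamic_voxelize_forward_npu.
  int grid_x = static_cast<int>(
      std::round((coors_range[3] - coors_range[0]) / voxel_size[0]));
  int grid_y = static_cast<int>(
      std::round((coors_range[4] - coors_range[1]) / voxel_size[1]));
  int grid_z = static_cast<int>(
      std::round((coors_range[5] - coors_range[2]) / voxel_size[2]));
  std::printf("grid = %d x %d x %d\n", grid_x, grid_y, grid_z);  // 1408 x 1600 x 40
  // Per-point voxel coordinate a dynamic voxelizer conceptually assigns:
  float px = 12.3f;
  int cx = static_cast<int>(std::floor((px - coors_range[0]) / voxel_size[0]));
  std::printf("cx = %d\n", cx);  // 246
}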