diff --git a/configs/deploy/detection_tensorrt-fp16_dynamic-320x320-640x640.py b/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py similarity index 58% rename from configs/deploy/detection_tensorrt-fp16_dynamic-320x320-640x640.py rename to configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py index 92ca431e..da565b6c 100644 --- a/configs/deploy/detection_tensorrt-fp16_dynamic-320x320-640x640.py +++ b/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py @@ -6,7 +6,8 @@ backend_config = dict( dict( input_shapes=dict( input=dict( - min_shape=[1, 3, 320, 320], + min_shape=[1, 3, 192, 192], opt_shape=[1, 3, 640, 640], - max_shape=[1, 3, 640, 640]))) + max_shape=[1, 3, 960, 960]))) ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/configs/deploy/detection_tensorrt-fp16_static-640x640.py b/configs/deploy/detection_tensorrt-fp16_static-640x640.py index 06644ae8..24d2a00d 100644 --- a/configs/deploy/detection_tensorrt-fp16_static-640x640.py +++ b/configs/deploy/detection_tensorrt-fp16_static-640x640.py @@ -11,3 +11,4 @@ backend_config = dict( opt_shape=[1, 3, 640, 640], max_shape=[1, 3, 640, 640]))) ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/configs/deploy/detection_tensorrt-int8_dynamic-320x320-640x640.py b/configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py similarity index 65% rename from configs/deploy/detection_tensorrt-int8_dynamic-320x320-640x640.py rename to configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py index d01761f1..b0ba7d70 100644 --- a/configs/deploy/detection_tensorrt-int8_dynamic-320x320-640x640.py +++ b/configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py @@ -7,8 +7,9 @@ backend_config = dict( dict( input_shapes=dict( input=dict( - min_shape=[1, 3, 320, 320], + min_shape=[1, 3, 192, 192], opt_shape=[1, 3, 640, 640], - 
max_shape=[1, 3, 640, 640]))) + max_shape=[1, 3, 960, 960]))) ], calib_config=dict(create_calib=True, calib_file='calib_data.h5')) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/configs/deploy/detection_tensorrt-int8_static-640x640.py b/configs/deploy/detection_tensorrt-int8_static-640x640.py index eeb330ea..f439c7b1 100644 --- a/configs/deploy/detection_tensorrt-int8_static-640x640.py +++ b/configs/deploy/detection_tensorrt-int8_static-640x640.py @@ -13,3 +13,4 @@ backend_config = dict( max_shape=[1, 3, 640, 640]))) ], calib_config=dict(create_calib=True, calib_file='calib_data.h5')) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/configs/deploy/detection_tensorrt_dynamic-320x320-640x640.py b/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py similarity index 58% rename from configs/deploy/detection_tensorrt_dynamic-320x320-640x640.py rename to configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py index c02e397b..17047d73 100644 --- a/configs/deploy/detection_tensorrt_dynamic-320x320-640x640.py +++ b/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py @@ -6,7 +6,8 @@ backend_config = dict( dict( input_shapes=dict( input=dict( - min_shape=[1, 3, 320, 320], + min_shape=[1, 3, 192, 192], opt_shape=[1, 3, 640, 640], - max_shape=[1, 3, 640, 640]))) + max_shape=[1, 3, 960, 960]))) ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/configs/deploy/detection_tensorrt_static-640x640.py b/configs/deploy/detection_tensorrt_static-640x640.py index a0830dee..9ec49cc1 100644 --- a/configs/deploy/detection_tensorrt_static-640x640.py +++ b/configs/deploy/detection_tensorrt_static-640x640.py @@ -11,3 +11,4 @@ backend_config = dict( opt_shape=[1, 3, 640, 640], max_shape=[1, 3, 640, 640]))) ]) +use_efficientnms = False # whether to replace 
TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/mmyolo/deploy/models/dense_heads/yolov5_head.py b/mmyolo/deploy/models/dense_heads/yolov5_head.py index f8f2b5bf..cf61fb3c 100644 --- a/mmyolo/deploy/models/dense_heads/yolov5_head.py +++ b/mmyolo/deploy/models/dense_heads/yolov5_head.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy +from functools import partial from typing import List, Optional, Tuple import torch @@ -10,6 +11,28 @@ from mmengine.config import ConfigDict from mmengine.structures import InstanceData from torch import Tensor +from mmyolo.deploy.models.layers import efficient_nms +from mmyolo.models.dense_heads import YOLOv5Head + + +def yolov5_bbox_decoder(priors, bbox_preds, stride): + bbox_preds = bbox_preds.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (bbox_preds[..., 2] * 2)**2 * w + h_pred = (bbox_preds[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1) + + return decoded_bboxes + @FUNCTION_REWRITER.register_rewriter( func_name='mmyolo.models.dense_heads.yolov5_head.' @@ -18,7 +41,7 @@ def yolov5_head__predict_by_feat(ctx, self, cls_scores: List[Tensor], bbox_preds: List[Tensor], - objectnesses: Optional[List[Tensor]], + objectnesses: Optional[List[Tensor]] = None, batch_img_metas: Optional[List[dict]] = None, cfg: Optional[ConfigDict] = None, rescale: bool = False, @@ -51,6 +74,20 @@ def yolov5_head__predict_by_feat(ctx, tensor in the tuple is (N, num_box), and each element represents the class label of the corresponding box. 
""" + detector_type = type(self) + deploy_cfg = ctx.cfg + use_efficientnms = deploy_cfg.get('use_efficientnms', False) + dtype = cls_scores[0].dtype + device = cls_scores[0].device + bbox_decoder = self.bbox_coder.decode + nms_func = multiclass_nms + if use_efficientnms: + if detector_type is YOLOv5Head: + nms_func = partial(efficient_nms, box_coding=0) + bbox_decoder = yolov5_bbox_decoder + else: + nms_func = efficient_nms + assert len(cls_scores) == len(bbox_preds) cfg = self.test_cfg if cfg is None else cfg cfg = copy.deepcopy(cfg) @@ -59,7 +96,8 @@ def yolov5_head__predict_by_feat(ctx, featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] mlvl_priors = self.prior_generator.grid_priors( - featmap_sizes, dtype=cls_scores[0].dtype, device=cls_scores[0].device) + featmap_sizes, dtype=dtype, device=device) + flatten_priors = torch.cat(mlvl_priors) mlvl_strides = [ @@ -69,33 +107,36 @@ def yolov5_head__predict_by_feat(ctx, for featmap_size, stride in zip(featmap_sizes, self.featmap_strides) ] flatten_stride = torch.cat(mlvl_strides) + # flatten cls_scores, bbox_preds and objectness flatten_cls_scores = [ cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.num_classes) for cls_score in cls_scores ] + cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = [ bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) for bbox_pred in bbox_preds ] - - flatten_objectness = [ - objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) - for objectness in objectnesses - ] - - cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) - flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() - bboxes = self.bbox_coder.decode(flatten_priors[None], flatten_bbox_preds, - flatten_stride) - # directly multiply score factor and feed to nms - scores = cls_scores * (flatten_objectness.unsqueeze(-1)) + if objectnesses is not None: + flatten_objectness = [ + objectness.permute(0, 
2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1)) + + scores = cls_scores + + bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds, + flatten_stride) if not with_nms: return bboxes, scores - deploy_cfg = ctx.cfg + post_params = get_post_processing_params(deploy_cfg) max_output_boxes_per_class = post_params.max_output_boxes_per_class iou_threshold = cfg.nms.get('iou_threshold', post_params.iou_threshold) @@ -103,6 +144,5 @@ def yolov5_head__predict_by_feat(ctx, pre_top_k = post_params.pre_top_k keep_top_k = cfg.get('max_per_img', post_params.keep_top_k) - return multiclass_nms(bboxes, scores, max_output_boxes_per_class, - iou_threshold, score_threshold, pre_top_k, - keep_top_k) + return nms_func(bboxes, scores, max_output_boxes_per_class, iou_threshold, + score_threshold, pre_top_k, keep_top_k) diff --git a/mmyolo/deploy/models/layers/__init__.py b/mmyolo/deploy/models/layers/__init__.py new file mode 100644 index 00000000..6017cf83 --- /dev/null +++ b/mmyolo/deploy/models/layers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_nms import efficient_nms + +__all__ = ['efficient_nms'] diff --git a/mmyolo/deploy/models/layers/bbox_nms.py b/mmyolo/deploy/models/layers/bbox_nms.py new file mode 100644 index 00000000..55257a3a --- /dev/null +++ b/mmyolo/deploy/models/layers/bbox_nms.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdeploy.core import mark +from torch import Tensor + + +def _efficient_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. 
+ + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to 100. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x1, y1, x2, y2]. + Set to 1 means [x, y, w, h]. + + Returns: + tuple[Tensor, Tensor]: (dets, labels), `dets` of shape [N, num_det, 5] + and `labels` of shape [N, num_det]. + """ + boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2) + _, det_boxes, det_scores, labels = TRTEfficientNMSop.apply( + boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, + score_threshold) + dets = torch.cat([det_boxes, det_scores.unsqueeze(2)], -1) + + # retain shape info + batch_size = boxes.size(0) + + dets_shape = dets.shape + label_shape = labels.shape + dets = dets.reshape([batch_size, *dets_shape[1:]]) + labels = labels.reshape([batch_size, *label_shape[1:]]) + return dets, labels + + +@mark('efficient_nms', inputs=['boxes', 'scores'], outputs=['dets', 'labels']) +def efficient_nms(*args, **kwargs): + """Wrapper function for `_efficient_nms`.""" + return _efficient_nms(*args, **kwargs) + + +class TRTEfficientNMSop(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + boxes, + scores, + background_class=-1, + box_coding=0, + iou_threshold=0.45, + max_output_boxes=100, + plugin_version='1', + score_activation=0, + score_threshold=0.25, + ): + batch_size, num_boxes, num_classes = scores.shape + num_det = torch.randint( + 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, max_output_boxes, 4) + 
det_scores = torch.randn(batch_size, max_output_boxes) + det_classes = torch.randint( + 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic(g, + boxes, + scores, + background_class=-1, + box_coding=0, + iou_threshold=0.45, + max_output_boxes=100, + plugin_version='1', + score_activation=0, + score_threshold=0.25): + out = g.op( + 'TRT::EfficientNMS_TRT', + boxes, + scores, + background_class_i=background_class, + box_coding_i=box_coding, + iou_threshold_f=iou_threshold, + max_output_boxes_i=max_output_boxes, + plugin_version_s=plugin_version, + score_activation_i=score_activation, + score_threshold_f=score_threshold, + outputs=4) + nums, boxes, scores, classes = out + return nums, boxes, scores, classes