From 600343eb083e148e9c8da1301a1559ef105d2e0f Mon Sep 17 00:00:00 2001 From: JosonChan <57584090+JosonChan1998@users.noreply.github.com> Date: Thu, 27 Apr 2023 14:47:52 +0800 Subject: [PATCH] [Feature] Support YOLOv5 instance segmentation (#735) * add * reproduce map * add typehint and doc * format code * replace key * add ut * format * format * format code * fix ut * fix ut * fix comment * fix comment * fix comment * [WIP][Feature] Support yolov5-Ins training * fix comment * change data flow and fix loss_mask compute * align the data pipeline * remove albu gt mask key * support yolov5 ins inference * fix multi gpu test * align the post_process with v8 * support training * support training * code formatting * code formatting * Support pad_param type (#672) * add half_pad_param * fix default fast_test * fix loss weight compute * fix mask rescale, add segment merge, fix segment2bbox * fix clip and fix mask init * code formatting * code formatting * code formatting * code formatting * [Fix] fix load image from file * [Add] Add docs and more config * [Fix] config type and test_formatting * [Fix] fix yolov5-ins_m packdetinputs * update --------- Co-authored-by: Nioolek <379319054@qq.com> Co-authored-by: Nioolek <40284075+Nioolek@users.noreply.github.com> Co-authored-by: huanghaian --- configs/yolov5/README.md | 14 + ...61_syncbn_fast_8xb16-300e_coco_instance.py | 15 + ...61_syncbn_fast_8xb16-300e_coco_instance.py | 88 +++ ...61_syncbn_fast_8xb16-300e_coco_instance.py | 15 + ...61_syncbn_fast_8xb16-300e_coco_instance.py | 126 +++ ...st_non_overlap_8xb16-300e_coco_instance.py | 49 ++ ...61_syncbn_fast_8xb16-300e_coco_instance.py | 15 + configs/yolov5/metafile.yml | 64 ++ mmyolo/datasets/transforms/__init__.py | 14 +- mmyolo/datasets/transforms/formatting.py | 102 +++ .../datasets/transforms/mix_img_transforms.py | 2 +- mmyolo/datasets/transforms/transforms.py | 361 ++++++++- mmyolo/datasets/utils.py | 12 +- mmyolo/models/dense_heads/__init__.py | 3 +- mmyolo/models/dense_heads/yolov5_head.py | 7 +- mmyolo/models/dense_heads/yolov5_ins_head.py | 740 ++++++++++++++++++ .../test_transforms/test_formatting.py | 119 +++ .../test_transforms/test_transforms.py | 27 +- .../test_dense_heads/test_yolov5_head.py | 177 ++++- tools/model_converters/yolov5_to_mmyolo.py | 6 + tools/model_converters/yolov8_to_mmyolo.py | 13 + 21 files changed, 1922 insertions(+), 47 deletions(-) create mode 100644 configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py create mode 100644 configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py create mode 100644 configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py create mode 100644 configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py create mode 100644 configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py create mode 100644 configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py create mode 100644 mmyolo/datasets/transforms/formatting.py create mode 100644 mmyolo/models/dense_heads/yolov5_ins_head.py create mode 100644 tests/test_datasets/test_transforms/test_formatting.py diff --git a/configs/yolov5/README.md b/configs/yolov5/README.md index cc6eff2b..c5980658 100644 --- a/configs/yolov5/README.md +++ b/configs/yolov5/README.md @@ -53,6 +53,20 @@ YOLOv5-l-P6 model structure 7. The performance of `Mask Refine` training is for the weight performance officially released by YOLOv5. 
`Mask Refine` means refining bbox by mask while loading annotations and transforming after `YOLOv5RandomAffine`, `Copy Paste` means using `YOLOv5CopyPaste`. 8. `YOLOv5u` models use the same loss functions and split Detect head as `YOLOv8` models for improved performance, but only requires 300 epochs. +### COCO Instance segmentation + +| Backbone | Arch | size | SyncBN | AMP | Mem (GB) | Box AP | Mask AP | Config | Download | +| :-------------------: | :--: | :--: | :----: | :-: | :------: | :----: | :-----: | :--------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv5-n | P5 | 640 | Yes | Yes | 3.3 | 27.9 | 23.7 | [config](./ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807-84cc9240.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807.log.json) | +| YOLOv5-s | P5 | 640 | Yes | Yes | 4.8 | 38.1 | 32.0 | [config](./ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542-3e570436.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542.log.json) | +| YOLOv5-s(non-overlap) | P5 | 640 | Yes | Yes | 4.8 | 38.0 | 32.1 | [config](./ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642-6780d34e.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642.log.json) | +| YOLOv5-m | P5 | 640 | Yes | Yes | 7.3 | 45.1 | 37.3 | [config](./ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529-ef5ba1a9.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529.log.json) | + +**Note**: + +1. `Non-overlap` refers to the instance-level masks being stored in the format (num_instances, h, w) instead of (h, w). Storing masks in overlap format consumes less memory and GPU memory. +2. 
We found that the mAP of the N/S/M model is higher than the official version, but the L/X model is lower than the official version. We will resolve this issue as soon as possible. + ### VOC | Backbone | size | Batchsize | AMP | Mem (GB) | box AP(COCO metric) | Config | Download | diff --git a/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 00000000..dd15b1bf --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +deepen_factor = 1.0 +widen_factor = 1.0 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 00000000..2951c9e3 --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,88 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +affine_scale = 0.9 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=_base_.use_mask2refine), +] + +# enable mixup +train_pipeline = [ + *pre_transform, + *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now. 
+ dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=_base_.mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 00000000..e06130bd --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 00000000..bd73139e --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,126 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' # noqa + +# ========================modified parameters====================== +# YOLOv5RandomAffine +use_mask2refine = True +max_aspect_ratio = 100 +min_area_ratio = 0.01 +# Polygon2Mask +downsample_ratio = 4 +mask_overlap = True +# LeterResize +# half_pad_param: if set to True, left and right pad_param will +# be given by dividing padding_h by 2. If set to False, pad_param is +# in int format. We recommend setting this to False for object +# detection tasks, and True for instance segmentation tasks. +# Default to False. +half_pad_param = True + +# Testing take a long time due to model_test_cfg. +# If you want to speed it up, you can increase score_thr +# or decraese nms_pre and max_per_img +model_test_cfg = dict( + multi_label=True, + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=300, + mask_thr_binary=0.5, + # fast_test: Whether to use fast test methods. When set + # to False, the implementation here is the same as the + # official, with higher mAP. If set to True, mask will first + # be upsampled to origin image shape through Pytorch, and + # then use mask_thr_binary to determine which pixels belong + # to the object. If set to False, will first use + # mask_thr_binary to determine which pixels belong to the + # object , and then use opencv to upsample mask to origin + # image shape. Default to False. 
+ fast_test=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + bbox_head=dict( + type='YOLOv5InsHead', + head_module=dict( + type='YOLOv5InsHeadModule', mask_channels=32, proto_channels=256), + mask_overlap=mask_overlap, + loss_mask=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=True, reduction='none'), + loss_mask_weight=0.05), + test_cfg=model_test_cfg) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + max_aspect_ratio=max_aspect_ratio, + use_mask_refine=use_mask2refine), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now. + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes', + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=downsample_ratio, + mask_overlap=mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + half_pad_param=half_pad_param, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(metric=['bbox', 'segm']) +test_evaluator = val_evaluator diff --git a/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py new file mode 100644 index 00000000..83b48cab --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py @@ -0,0 +1,49 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# ========================modified parameters====================== +mask_overlap = False # Polygon2Mask + +# ===============================Unmodified in most cases==================== +model = dict(bbox_head=dict(mask_overlap=mask_overlap)) + +train_pipeline = [ + *_base_.pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + 
border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=True), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes', + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 00000000..e08d4304 --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +deepen_factor = 1.33 +widen_factor = 1.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/metafile.yml b/configs/yolov5/metafile.yml index bfa92bdb..97a5416b 100644 --- a/configs/yolov5/metafile.yml +++ b/configs/yolov5/metafile.yml @@ -248,3 +248,67 @@ Models: Metrics: box AP: 50.9 Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154321-07edeb62.pth + - Name: yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 3.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 27.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 23.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807-84cc9240.pth + - Name: yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542-3e570436.pth + - Name: yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.1 + 
  Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642-6780d34e.pth
+  - Name: yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance
+    In Collection: YOLOv5
+    Config: configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py
+    Metadata:
+      Training Memory (GB): 7.3
+      Epochs: 300
+    Results:
+      - Task: Object Detection
+        Dataset: COCO
+        Metrics:
+          box AP: 45.1
+      - Task: Instance Segmentation
+        Dataset: COCO
+        Metrics:
+          mask AP: 37.3
+    Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529-ef5ba1a9.pth
diff --git a/mmyolo/datasets/transforms/__init__.py b/mmyolo/datasets/transforms/__init__.py
index 58f4e6fd..6719ac33 100644
--- a/mmyolo/datasets/transforms/__init__.py
+++ b/mmyolo/datasets/transforms/__init__.py
@@ -1,14 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from .formatting import PackDetInputs
 from .mix_img_transforms import Mosaic, Mosaic9, YOLOv5MixUp, YOLOXMixUp
-from .transforms import (LetterResize, LoadAnnotations, PPYOLOERandomCrop,
-                         PPYOLOERandomDistort, RegularizeRotatedBox,
-                         RemoveDataElement, YOLOv5CopyPaste,
-                         YOLOv5HSVRandomAug, YOLOv5KeepRatioResize,
-                         YOLOv5RandomAffine)
+from .transforms import (LetterResize, LoadAnnotations, Polygon2Mask,
+                         PPYOLOERandomCrop, PPYOLOERandomDistort,
+                         RegularizeRotatedBox, RemoveDataElement,
+                         YOLOv5CopyPaste, YOLOv5HSVRandomAug,
+                         YOLOv5KeepRatioResize, YOLOv5RandomAffine)

 __all__ = [
     'YOLOv5KeepRatioResize', 'LetterResize', 'Mosaic', 'YOLOXMixUp',
     'YOLOv5MixUp', 'YOLOv5HSVRandomAug', 'LoadAnnotations',
     'YOLOv5RandomAffine', 'PPYOLOERandomDistort', 'PPYOLOERandomCrop',
-    'Mosaic9', 'YOLOv5CopyPaste', 'RemoveDataElement', 'RegularizeRotatedBox'
+    'Mosaic9', 'YOLOv5CopyPaste', 'RemoveDataElement', 'RegularizeRotatedBox',
+    'Polygon2Mask', 'PackDetInputs'
 ]
diff --git a/mmyolo/datasets/transforms/formatting.py b/mmyolo/datasets/transforms/formatting.py
new file mode 100644
index 00000000..0185d78c
--- /dev/null
+++ b/mmyolo/datasets/transforms/formatting.py
@@ -0,0 +1,102 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from mmcv.transforms import to_tensor
+from mmdet.datasets.transforms import PackDetInputs as MMDET_PackDetInputs
+from mmdet.structures import DetDataSample
+from mmdet.structures.bbox import BaseBoxes
+from mmengine.structures import InstanceData, PixelData
+
+from mmyolo.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class PackDetInputs(MMDET_PackDetInputs):
+    """Pack the inputs data for the detection / semantic segmentation /
+    panoptic segmentation.
+
+    Compared to mmdet, we just add the `gt_panoptic_seg` field and logic.
+    """
+
+    def transform(self, results: dict) -> dict:
+        """Method to pack the input data.
+        Args:
+            results (dict): Result dict from the data pipeline.
+        Returns:
+            dict:
+            - 'inputs' (obj:`torch.Tensor`): The forward data of models.
+            - 'data_sample' (obj:`DetDataSample`): The annotation info of the
+              sample.
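        Note: non-overlap instance masks produced by ``Polygon2Mask`` are
        already tensors and are stored directly in
        ``data_sample.gt_instances.masks``, while overlap-format masks arrive
        as ``results['gt_panoptic_seg']`` and are packed into
        ``data_sample.gt_panoptic_seg`` as ``PixelData``.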
+ """ + packed_results = dict() + if 'img' in results: + img = results['img'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + # To improve the computational speed by by 3-5 times, apply: + # If image is not contiguous, use + # `numpy.transpose()` followed by `numpy.ascontiguousarray()` + # If image is already contiguous, use + # `torch.permute()` followed by `torch.contiguous()` + # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 + # for more details + if not img.flags.c_contiguous: + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + img = to_tensor(img) + else: + img = to_tensor(img).permute(2, 0, 1).contiguous() + + packed_results['inputs'] = img + + if 'gt_ignore_flags' in results: + valid_idx = np.where(results['gt_ignore_flags'] == 0)[0] + ignore_idx = np.where(results['gt_ignore_flags'] == 1)[0] + + data_sample = DetDataSample() + instance_data = InstanceData() + ignore_instance_data = InstanceData() + + for key in self.mapping_table.keys(): + if key not in results: + continue + if key == 'gt_masks' or isinstance(results[key], BaseBoxes): + if 'gt_ignore_flags' in results: + instance_data[ + self.mapping_table[key]] = results[key][valid_idx] + ignore_instance_data[ + self.mapping_table[key]] = results[key][ignore_idx] + else: + instance_data[self.mapping_table[key]] = results[key] + else: + if 'gt_ignore_flags' in results: + instance_data[self.mapping_table[key]] = to_tensor( + results[key][valid_idx]) + ignore_instance_data[self.mapping_table[key]] = to_tensor( + results[key][ignore_idx]) + else: + instance_data[self.mapping_table[key]] = to_tensor( + results[key]) + data_sample.gt_instances = instance_data + data_sample.ignored_instances = ignore_instance_data + + if 'gt_seg_map' in results: + gt_sem_seg_data = dict( + sem_seg=to_tensor(results['gt_seg_map'][None, ...].copy())) + data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) + + # In order to unify the support for the overlap mask annotations + # i.e. mask overlap annotations in (h,w) format, + # we use the gt_panoptic_seg field to unify the modeling + if 'gt_panoptic_seg' in results: + data_sample.gt_panoptic_seg = PixelData( + pan_seg=results['gt_panoptic_seg']) + + img_meta = {} + for key in self.meta_keys: + assert key in results, f'`{key}` is not found in `results`, ' \ + f'the valid keys are {list(results)}.' 
+ img_meta[key] = results[key] + + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + + return packed_results diff --git a/mmyolo/datasets/transforms/mix_img_transforms.py b/mmyolo/datasets/transforms/mix_img_transforms.py index 4a25f6f7..4753ecc3 100644 --- a/mmyolo/datasets/transforms/mix_img_transforms.py +++ b/mmyolo/datasets/transforms/mix_img_transforms.py @@ -374,7 +374,7 @@ class Mosaic(BaseMixImageTransform): mosaic_ignore_flags.append(gt_ignore_flags_i) if with_mask and results_patch.get('gt_masks', None) is not None: gt_masks_i = results_patch['gt_masks'] - gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i)) + gt_masks_i = gt_masks_i.resize(img_i.shape[:2]) gt_masks_i = gt_masks_i.translate( out_shape=(int(self.img_scale[0] * 2), int(self.img_scale[1] * 2)), diff --git a/mmyolo/datasets/transforms/transforms.py b/mmyolo/datasets/transforms/transforms.py index 2cdc6a5f..30dfdb3f 100644 --- a/mmyolo/datasets/transforms/transforms.py +++ b/mmyolo/datasets/transforms/transforms.py @@ -13,7 +13,7 @@ from mmdet.datasets.transforms import LoadAnnotations as MMDET_LoadAnnotations from mmdet.datasets.transforms import Resize as MMDET_Resize from mmdet.structures.bbox import (HorizontalBoxes, autocast_box_type, get_box_type) -from mmdet.structures.mask import PolygonMasks +from mmdet.structures.mask import PolygonMasks, polygon_to_bitmap from numpy import random from mmyolo.registry import TRANSFORMS @@ -99,17 +99,21 @@ class YOLOv5KeepRatioResize(MMDET_Resize): self.scale) if ratio != 1: - # resize image according to the ratio - image = mmcv.imrescale( + # resize image according to the shape + # NOTE: We are currently testing on COCO that modifying + # this code will not affect the results. + # If you find that it has an effect on your results, + # please feel free to contact us. + image = mmcv.imresize( img=image, - scale=ratio, + size=(int(original_w * ratio), int(original_h * ratio)), interpolation='area' if ratio < 1 else 'bilinear', backend=self.backend) resized_h, resized_w = image.shape[:2] - scale_ratio = resized_h / original_h - - scale_factor = (scale_ratio, scale_ratio) + scale_ratio_h = resized_h / original_h + scale_ratio_w = resized_w / original_w + scale_factor = (scale_ratio_w, scale_ratio_h) results['img'] = image results['img_shape'] = image.shape[:2] @@ -142,6 +146,11 @@ class LetterResize(MMDET_Resize): stretch_only (bool): Whether stretch to the specified size directly. Defaults to False allow_scale_up (bool): Allow scale up when ratio > 1. Defaults to True + half_pad_param (bool): If set to True, left and right pad_param will + be given by dividing padding_h by 2. If set to False, pad_param is + in int format. We recommend setting this to False for object + detection tasks, and True for instance segmentation tasks. + Default to False. 
""" def __init__(self, @@ -150,6 +159,7 @@ class LetterResize(MMDET_Resize): use_mini_pad: bool = False, stretch_only: bool = False, allow_scale_up: bool = True, + half_pad_param: bool = False, **kwargs): super().__init__(scale=scale, keep_ratio=True, **kwargs) @@ -162,6 +172,7 @@ class LetterResize(MMDET_Resize): self.use_mini_pad = use_mini_pad self.stretch_only = stretch_only self.allow_scale_up = allow_scale_up + self.half_pad_param = half_pad_param def _resize_img(self, results: dict): """Resize images with ``results['scale']``.""" @@ -212,7 +223,8 @@ class LetterResize(MMDET_Resize): interpolation=self.interpolation, backend=self.backend) - scale_factor = (ratio[1], ratio[0]) # mmcv scale factor is (w, h) + scale_factor = (no_pad_shape[1] / image_shape[1], + no_pad_shape[0] / image_shape[0]) if 'scale_factor' in results: results['scale_factor_origin'] = results['scale_factor'] @@ -246,7 +258,15 @@ class LetterResize(MMDET_Resize): if 'pad_param' in results: results['pad_param_origin'] = results['pad_param'] * \ np.repeat(ratio, 2) - results['pad_param'] = np.array(padding_list, dtype=np.float32) + + if self.half_pad_param: + results['pad_param'] = np.array( + [padding_h / 2, padding_h / 2, padding_w / 2, padding_w / 2], + dtype=np.float32) + else: + # We found in object detection, using padding list with + # int type can get higher mAP. + results['pad_param'] = np.array(padding_list, dtype=np.float32) def _resize_masks(self, results: dict): """Resize masks with ``results['scale']``""" @@ -370,13 +390,26 @@ class YOLOv5HSVRandomAug(BaseTransform): class LoadAnnotations(MMDET_LoadAnnotations): """Because the yolo series does not need to consider ignore bboxes for the time being, in order to speed up the pipeline, it can be excluded in - advance.""" + advance. + + Args: + mask2bbox (bool): Whether to use mask annotation to get bbox. + Defaults to False. + poly2mask (bool): Whether to transform the polygons to bitmaps. + Defaults to False. + merge_polygons (bool): Whether to merge polygons into one polygon. + If merged, the storage structure is simpler and training is more + effcient, especially if the mask inside a bbox is divided into + multiple polygons. Defaults to True. + """ def __init__(self, mask2bbox: bool = False, poly2mask: bool = False, - **kwargs) -> None: + merge_polygons: bool = True, + **kwargs): self.mask2bbox = mask2bbox + self.merge_polygons = merge_polygons assert not poly2mask, 'Does not support BitmapMasks considering ' \ 'that bitmap consumes more memory.' super().__init__(poly2mask=poly2mask, **kwargs) @@ -485,6 +518,8 @@ class LoadAnnotations(MMDET_LoadAnnotations): # ignore self._mask_ignore_flag.append(0) else: + if len(gt_mask) > 1 and self.merge_polygons: + gt_mask = self.merge_multi_segment(gt_mask) gt_masks.append(gt_mask) gt_ignore_flags.append(instance['ignore_flag']) self._mask_ignore_flag.append(1) @@ -503,6 +538,79 @@ class LoadAnnotations(MMDET_LoadAnnotations): gt_masks = PolygonMasks([mask for mask in gt_masks], h, w) results['gt_masks'] = gt_masks + def merge_multi_segment(self, + gt_masks: List[np.ndarray]) -> List[np.ndarray]: + """Merge multi segments to one list. + + Find the coordinates with min distance between each segment, + then connect these coordinates with one thin line to merge all + segments into one. + Args: + gt_masks(List(np.array)): + original segmentations in coco's json file. + like [segmentation1, segmentation2,...], + each segmentation is a list of coordinates. 
+ Return: + gt_masks(List(np.array)): merged gt_masks + """ + s = [] + segments = [np.array(i).reshape(-1, 2) for i in gt_masks] + idx_list = [[] for _ in range(len(gt_masks))] + + # record the indexes with min distance between each segment + for i in range(1, len(segments)): + idx1, idx2 = self.min_index(segments[i - 1], segments[i]) + idx_list[i - 1].append(idx1) + idx_list[i].append(idx2) + + # use two round to connect all the segments + # first round: first to end, i.e. A->B(partial)->C + # second round: end to first, i.e. C->B(remaining)-A + for k in range(2): + # forward first round + if k == 0: + for i, idx in enumerate(idx_list): + # middle segments have two indexes + # reverse the index of middle segments + if len(idx) == 2 and idx[0] > idx[1]: + idx = idx[::-1] + segments[i] = segments[i][::-1, :] + # add the idx[0] point for connect next segment + segments[i] = np.roll(segments[i], -idx[0], axis=0) + segments[i] = np.concatenate( + [segments[i], segments[i][:1]]) + # deal with the first segment and the last one + if i in [0, len(idx_list) - 1]: + s.append(segments[i]) + # deal with the middle segment + # Note that in the first round, only partial segment + # are appended. + else: + idx = [0, idx[1] - idx[0]] + s.append(segments[i][idx[0]:idx[1] + 1]) + # forward second round + else: + for i in range(len(idx_list) - 1, -1, -1): + # deal with the middle segment + # append the remaining points + if i not in [0, len(idx_list) - 1]: + idx = idx_list[i] + nidx = abs(idx[1] - idx[0]) + s.append(segments[i][nidx:]) + return [np.concatenate(s).reshape(-1, )] + + def min_index(self, arr1: np.ndarray, arr2: np.ndarray) -> Tuple[int, int]: + """Find a pair of indexes with the shortest distance. + + Args: + arr1: (N, 2). + arr2: (M, 2). + Return: + tuple: a pair of indexes. + """ + dis = ((arr1[:, None, :] - arr2[None, :, :])**2).sum(-1) + return np.unravel_index(np.argmin(dis, axis=None), dis.shape) + def __repr__(self) -> str: repr_str = self.__class__.__name__ repr_str += f'(with_bbox={self.with_bbox}, ' @@ -571,7 +679,7 @@ class YOLOv5RandomAffine(BaseTransform): min_area_ratio (float): Threshold of area ratio between original bboxes and wrapped bboxes. If smaller than this value, the box will be removed. Defaults to 0.1. - use_mask_refine (bool): Whether to refine bbox by mask. + use_mask_refine (bool): Whether to refine bbox by mask. Deprecated. max_aspect_ratio (float): Aspect ratio of width and height threshold to filter bboxes. If max(h/w, w/h) larger than this value, the box will be removed. Defaults to 20. @@ -603,6 +711,7 @@ class YOLOv5RandomAffine(BaseTransform): self.bbox_clip_border = bbox_clip_border self.min_bbox_size = min_bbox_size self.min_area_ratio = min_area_ratio + # The use_mask_refine parameter has been deprecated. self.use_mask_refine = use_mask_refine self.max_aspect_ratio = max_aspect_ratio self.resample_num = resample_num @@ -644,7 +753,7 @@ class YOLOv5RandomAffine(BaseTransform): num_bboxes = len(bboxes) if num_bboxes: orig_bboxes = bboxes.clone() - if self.use_mask_refine and 'gt_masks' in results: + if 'gt_masks' in results: # If the dataset has annotations of mask, # the mask will be used to refine bbox. 
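                # The refine flow below: polygons are warped with warp_matrix,
                # horizontal boxes are re-derived from the warped polygons by
                # `segment2box` (which only keeps points falling inside the
                # image), degenerate boxes are filtered against the original
                # ones by `filter_gt_bboxes`, and finally both boxes and
                # polygons are clipped to the image when `bbox_clip_border`
                # is enabled.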
gt_masks = results['gt_masks'] @@ -654,10 +763,13 @@ class YOLOv5RandomAffine(BaseTransform): img_h, img_w) # refine bboxes by masks - bboxes = gt_masks.get_bboxes(dst_type='hbox') + bboxes = self.segment2box(gt_masks, height, width) # filter bboxes outside image valid_index = self.filter_gt_bboxes(orig_bboxes, bboxes).numpy() + if self.bbox_clip_border: + bboxes.clip_([height - 1e-3, width - 1e-3]) + gt_masks = self.clip_polygons(gt_masks, height, width) results['gt_masks'] = gt_masks[valid_index] else: bboxes.project_(warp_matrix) @@ -671,18 +783,84 @@ class YOLOv5RandomAffine(BaseTransform): # otherwise it will raise out of bounds when len(valid_index)=1 valid_index = self.filter_gt_bboxes(orig_bboxes, bboxes).numpy() - if 'gt_masks' in results: - results['gt_masks'] = PolygonMasks( - results['gt_masks'].masks, img_h, img_w) results['gt_bboxes'] = bboxes[valid_index] results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ valid_index] results['gt_ignore_flags'] = results['gt_ignore_flags'][ valid_index] + else: + if 'gt_masks' in results: + results['gt_masks'] = PolygonMasks([], img_h, img_w) return results + def segment2box(self, gt_masks: PolygonMasks, height: int, + width: int) -> HorizontalBoxes: + """ + Convert 1 segment label to 1 box label, applying inside-image + constraint i.e. (xy1, xy2, ...) to (xyxy) + Args: + gt_masks (torch.Tensor): the segment label + width (int): the width of the image. Defaults to 640 + height (int): The height of the image. Defaults to 640 + Returns: + HorizontalBoxes: the clip bboxes from gt_masks. + """ + bboxes = [] + for _, poly_per_obj in enumerate(gt_masks): + # simply use a number that is big enough for comparison with + # coordinates + xy_min = np.array([width * 2, height * 2], dtype=np.float32) + xy_max = np.zeros(2, dtype=np.float32) - 1 + + for p in poly_per_obj: + xy = np.array(p).reshape(-1, 2).astype(np.float32) + x, y = xy.T + inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height) + x, y = x[inside], y[inside] + if not any(x): + continue + xy = np.stack([x, y], axis=0).T + + xy_min = np.minimum(xy_min, np.min(xy, axis=0)) + xy_max = np.maximum(xy_max, np.max(xy, axis=0)) + if xy_max[0] == -1: + bbox = np.zeros(4, dtype=np.float32) + else: + bbox = np.concatenate([xy_min, xy_max], axis=0) + bboxes.append(bbox) + + return HorizontalBoxes(np.stack(bboxes, axis=0)) + + # TODO: Move to mmdet + def clip_polygons(self, gt_masks: PolygonMasks, height: int, + width: int) -> PolygonMasks: + """Function to clip points of polygons with height and width. + + Args: + gt_masks (PolygonMasks): Annotations of instance segmentation. + height (int): height of clip border. + width (int): width of clip border. + Return: + clipped_masks (PolygonMasks): + Clip annotations of instance segmentation. 
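        Note: clipping is a per-vertex clamp: each polygon point is clipped
        to the border independently and edges that cross the border are not
        re-intersected, so regions outside the image are only approximately
        removed.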
+ """ + if len(gt_masks) == 0: + clipped_masks = PolygonMasks([], height, width) + else: + clipped_masks = [] + for poly_per_obj in gt_masks: + clipped_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + p[0::2] = p[0::2].clip(0, width) + p[1::2] = p[1::2].clip(0, height) + clipped_poly_per_obj.append(p) + clipped_masks.append(clipped_poly_per_obj) + clipped_masks = PolygonMasks(clipped_masks, height, width) + return clipped_masks + @staticmethod def warp_poly(poly: np.ndarray, warp_matrix: np.ndarray, img_w: int, img_h: int) -> np.ndarray: @@ -707,10 +885,7 @@ class YOLOv5RandomAffine(BaseTransform): poly = poly @ warp_matrix.T poly = poly[:, :2] / poly[:, 2:3] - # filter point outside image - x, y = poly.T - valid_ind_point = (x >= 0) & (y >= 0) & (x <= img_w) & (y <= img_h) - return poly[valid_ind_point].reshape(-1) + return poly.reshape(-1) def warp_mask(self, gt_masks: PolygonMasks, warp_matrix: np.ndarray, img_w: int, img_h: int) -> PolygonMasks: @@ -1374,7 +1549,7 @@ class YOLOv5CopyPaste(BaseTransform): if len(results.get('gt_masks', [])) == 0: return results gt_masks = results['gt_masks'] - assert isinstance(gt_masks, PolygonMasks),\ + assert isinstance(gt_masks, PolygonMasks), \ 'only support type of PolygonMasks,' \ ' but get type: %s' % type(gt_masks) gt_bboxes = results['gt_bboxes'] @@ -1555,3 +1730,145 @@ class RegularizeRotatedBox(BaseTransform): results['gt_bboxes'] = self.box_type( results['gt_bboxes'].regularize_boxes(self.angle_version)) return results + + +@TRANSFORMS.register_module() +class Polygon2Mask(BaseTransform): + """Polygons to bitmaps in YOLOv5. + + Args: + downsample_ratio (int): Downsample ratio of mask. + mask_overlap (bool): Whether to use maskoverlap in mask process. + When set to True, the implementation here is the same as the + official, with higher training speed. If set to True, all gt masks + will compress into one overlap mask, the value of mask indicates + the index of gt masks. If set to False, one mask is a binary mask. + Default to True. + coco_style (bool): Whether to use coco_style to convert the polygons to + bitmaps. Note that this option is only used to test if there is an + improvement in training speed and we recommend setting it to False. + """ + + def __init__(self, + downsample_ratio: int = 4, + mask_overlap: bool = True, + coco_style: bool = False): + self.downsample_ratio = downsample_ratio + self.mask_overlap = mask_overlap + self.coco_style = coco_style + + def polygon2mask(self, + img_shape: Tuple[int, int], + polygons: np.ndarray, + color: int = 1) -> np.ndarray: + """ + Args: + img_shape (tuple): The image size. + polygons (np.ndarray): [N, M], N is the number of polygons, + M is the number of points(Be divided by 2). + color (int): color in fillPoly. + Return: + np.ndarray: the overlap mask. 
+ """ + nh, nw = (img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio) + if self.coco_style: + # This practice can lead to the loss of small objects + # polygons = polygons.resize((nh, nw)).masks + # polygons = np.asarray(polygons).reshape(-1) + # mask = polygon_to_bitmap([polygons], nh, nw) + + polygons = np.asarray(polygons).reshape(-1) + mask = polygon_to_bitmap([polygons], img_shape[0], + img_shape[1]).astype(np.uint8) + mask = mmcv.imresize(mask, (nw, nh)) + else: + mask = np.zeros(img_shape, dtype=np.uint8) + polygons = np.asarray(polygons) + polygons = polygons.astype(np.int32) + shape = polygons.shape + polygons = polygons.reshape(shape[0], -1, 2) + cv2.fillPoly(mask, polygons, color=color) + # NOTE: fillPoly firstly then resize is trying the keep the same + # way of loss calculation when mask-ratio=1. + mask = mmcv.imresize(mask, (nw, nh)) + return mask + + def polygons2masks(self, + img_shape: Tuple[int, int], + polygons: PolygonMasks, + color: int = 1) -> np.ndarray: + """Return a list of bitmap masks. + + Args: + img_shape (tuple): The image size. + polygons (PolygonMasks): The mask annotations. + color (int): color in fillPoly. + Return: + List[np.ndarray]: the list of masks in bitmaps. + """ + if self.coco_style: + nh, nw = (img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio) + masks = polygons.resize((nh, nw)).to_ndarray() + return masks + else: + masks = [] + for si in range(len(polygons)): + mask = self.polygon2mask(img_shape, polygons[si], color) + masks.append(mask) + return np.array(masks) + + def polygons2masks_overlap( + self, img_shape: Tuple[int, int], + polygons: PolygonMasks) -> Tuple[np.ndarray, np.ndarray]: + """Return a overlap mask and the sorted idx of area. + + Args: + img_shape (tuple): The image size. + polygons (PolygonMasks): The mask annotations. + color (int): color in fillPoly. + Return: + Tuple[np.ndarray, np.ndarray]: + the overlap mask and the sorted idx of area. 
+ """ + masks = np.zeros((img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio), + dtype=np.int32 if len(polygons) > 255 else np.uint8) + areas = [] + ms = [] + for si in range(len(polygons)): + mask = self.polygon2mask(img_shape, polygons[si], color=1) + ms.append(mask) + areas.append(mask.sum()) + areas = np.asarray(areas) + index = np.argsort(-areas) + ms = np.array(ms)[index] + for i in range(len(polygons)): + mask = ms[i] * (i + 1) + masks = masks + mask + masks = np.clip(masks, a_min=0, a_max=i + 1) + return masks, index + + def transform(self, results: dict) -> dict: + gt_masks = results['gt_masks'] + assert isinstance(gt_masks, PolygonMasks) + + if self.mask_overlap: + masks, sorted_idx = self.polygons2masks_overlap( + (gt_masks.height, gt_masks.width), gt_masks) + results['gt_bboxes'] = results['gt_bboxes'][sorted_idx] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + sorted_idx] + + # In this case we put gt_masks in gt_panoptic_seg + results.pop('gt_masks') + results['gt_panoptic_seg'] = torch.from_numpy(masks[None]) + else: + masks = self.polygons2masks((gt_masks.height, gt_masks.width), + gt_masks, + color=1) + masks = torch.from_numpy(masks) + # Consistent logic with mmdet + results['gt_masks'] = masks + return results diff --git a/mmyolo/datasets/utils.py b/mmyolo/datasets/utils.py index 62fe5484..d50207c8 100644 --- a/mmyolo/datasets/utils.py +++ b/mmyolo/datasets/utils.py @@ -4,6 +4,7 @@ from typing import List, Sequence import numpy as np import torch from mmengine.dataset import COLLATE_FUNCTIONS +from mmengine.dist import get_dist_info from ..registry import TASK_UTILS @@ -28,9 +29,10 @@ def yolov5_collate(data_batch: Sequence, gt_bboxes = datasamples.gt_instances.bboxes.tensor gt_labels = datasamples.gt_instances.labels if 'masks' in datasamples.gt_instances: - masks = datasamples.gt_instances.masks.to_tensor( - dtype=torch.bool, device=gt_bboxes.device) + masks = datasamples.gt_instances.masks batch_masks.append(masks) + if 'gt_panoptic_seg' in datasamples: + batch_masks.append(datasamples.gt_panoptic_seg.pan_seg) batch_idx = gt_labels.new_full((len(gt_labels), 1), i) bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes), dim=1) @@ -70,10 +72,14 @@ class BatchShapePolicy: img_size: int = 640, size_divisor: int = 32, extra_pad_ratio: float = 0.5): - self.batch_size = batch_size self.img_size = img_size self.size_divisor = size_divisor self.extra_pad_ratio = extra_pad_ratio + _, world_size = get_dist_info() + # During multi-gpu testing, the batchsize should be multiplied by + # worldsize, so that the number of batches can be calculated correctly. + # The index of batches will affect the calculation of batch shape. 
+ self.batch_size = batch_size * world_size def __call__(self, data_list: List[dict]) -> List[dict]: image_shapes = [] diff --git a/mmyolo/models/dense_heads/__init__.py b/mmyolo/models/dense_heads/__init__.py index a95abd61..ac65c42e 100644 --- a/mmyolo/models/dense_heads/__init__.py +++ b/mmyolo/models/dense_heads/__init__.py @@ -5,6 +5,7 @@ from .rtmdet_ins_head import RTMDetInsSepBNHead, RTMDetInsSepBNHeadModule from .rtmdet_rotated_head import (RTMDetRotatedHead, RTMDetRotatedSepBNHeadModule) from .yolov5_head import YOLOv5Head, YOLOv5HeadModule +from .yolov5_ins_head import YOLOv5InsHead, YOLOv5InsHeadModule from .yolov6_head import YOLOv6Head, YOLOv6HeadModule from .yolov7_head import YOLOv7Head, YOLOv7HeadModule, YOLOv7p6HeadModule from .yolov8_head import YOLOv8Head, YOLOv8HeadModule @@ -16,5 +17,5 @@ __all__ = [ 'RTMDetSepBNHeadModule', 'YOLOv7Head', 'PPYOLOEHead', 'PPYOLOEHeadModule', 'YOLOv7HeadModule', 'YOLOv7p6HeadModule', 'YOLOv8Head', 'YOLOv8HeadModule', 'RTMDetRotatedHead', 'RTMDetRotatedSepBNHeadModule', 'RTMDetInsSepBNHead', - 'RTMDetInsSepBNHeadModule' + 'RTMDetInsSepBNHeadModule', 'YOLOv5InsHead', 'YOLOv5InsHeadModule' ] diff --git a/mmyolo/models/dense_heads/yolov5_head.py b/mmyolo/models/dense_heads/yolov5_head.py index c49d0851..fb24617f 100644 --- a/mmyolo/models/dense_heads/yolov5_head.py +++ b/mmyolo/models/dense_heads/yolov5_head.py @@ -95,7 +95,12 @@ class YOLOv5HeadModule(BaseModule): b = mi.bias.data.view(self.num_base_priors, -1) # obj (8 objects per 640 image) b.data[:, 4] += math.log(8 / (640 / s)**2) - b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.999999)) + # NOTE: The following initialization can only be performed on the + # bias of the category, if the following initialization is + # performed on the bias of mask coefficient, + # there will be a significant decrease in mask AP. + b.data[:, 5:5 + self.num_classes] += math.log( + 0.6 / (self.num_classes - 0.999999)) mi.bias.data = b.view(-1) diff --git a/mmyolo/models/dense_heads/yolov5_ins_head.py b/mmyolo/models/dense_heads/yolov5_ins_head.py new file mode 100644 index 00000000..df94f422 --- /dev/null +++ b/mmyolo/models/dense_heads/yolov5_ins_head.py @@ -0,0 +1,740 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List, Optional, Sequence, Tuple, Union + +import mmcv +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmdet.models.utils import filter_scores_and_topk, multi_apply +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from mmdet.utils import ConfigType, OptInstanceList +from mmengine.config import ConfigDict +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..utils import make_divisible +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule + + +class ProtoModule(BaseModule): + """Mask Proto module for segmentation models of YOLOv5. + + Args: + in_channels (int): Number of channels in the input feature map. + middle_channels (int): Number of channels in the middle feature map. + mask_channels (int): Number of channels in the output mask feature + map. This is the channel count of the mask. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN', momentum=0.03, eps=0.001)``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). 
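    Note: the spatial size is kept by ``conv1``/``conv2``/``conv3`` and
    doubled once by the nearest-neighbour upsample, so with the default P5
    configs the stride-8 input feature produces stride-4 prototypes with
    ``mask_channels`` channels.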
+ """ + + def __init__(self, + *args, + in_channels: int = 32, + middle_channels: int = 256, + mask_channels: int = 32, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + super().__init__(*args, **kwargs) + self.conv1 = ConvModule( + in_channels, + middle_channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.conv2 = ConvModule( + middle_channels, + middle_channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + middle_channels, + mask_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + return self.conv3(self.conv2(self.upsample(self.conv1(x)))) + + +@MODELS.register_module() +class YOLOv5InsHeadModule(YOLOv5HeadModule): + """Detection and Instance Segmentation Head of YOLOv5. + + Args: + num_classes (int): Number of categories excluding the background + category. + mask_channels (int): Number of channels in the mask feature map. + This is the channel count of the mask. + proto_channels (int): Number of channels in the proto feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN', momentum=0.03, eps=0.001)``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). + """ + + def __init__(self, + *args, + num_classes: int, + mask_channels: int = 32, + proto_channels: int = 256, + widen_factor: float = 1.0, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + self.mask_channels = mask_channels + self.num_out_attrib_with_proto = 5 + num_classes + mask_channels + self.proto_channels = make_divisible(proto_channels, widen_factor) + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + super().__init__( + *args, + num_classes=num_classes, + widen_factor=widen_factor, + **kwargs) + + def _init_layers(self): + """initialize conv layers in YOLOv5 Ins head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Conv2d( + self.in_channels[i], + self.num_base_priors * self.num_out_attrib_with_proto, 1) + self.convs_pred.append(conv_pred) + + self.proto_pred = ProtoModule( + in_channels=self.in_channels[0], + middle_channels=self.proto_channels, + mask_channels=self.mask_channels, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, objectnesses, and mask predictions. 
+ """ + assert len(x) == self.num_levels + cls_scores, bbox_preds, objectnesses, coeff_preds = multi_apply( + self.forward_single, x, self.convs_pred) + mask_protos = self.proto_pred(x[0]) + return cls_scores, bbox_preds, objectnesses, coeff_preds, mask_protos + + def forward_single( + self, x: Tensor, + convs_pred: nn.Module) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + pred_map = convs_pred(x) + bs, _, ny, nx = pred_map.shape + pred_map = pred_map.view(bs, self.num_base_priors, + self.num_out_attrib_with_proto, ny, nx) + + cls_score = pred_map[:, :, 5:self.num_classes + 5, + ...].reshape(bs, -1, ny, nx) + bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx) + coeff_pred = pred_map[:, :, self.num_classes + 5:, + ...].reshape(bs, -1, ny, nx) + + return cls_score, bbox_pred, objectness, coeff_pred + + +@MODELS.register_module() +class YOLOv5InsHead(YOLOv5Head): + """YOLOv5 Instance Segmentation and Detection head. + + Args: + mask_overlap(bool): Defaults to True. + loss_mask (:obj:`ConfigDict` or dict): Config of mask loss. + loss_mask_weight (float): The weight of mask loss. + """ + + def __init__(self, + *args, + mask_overlap: bool = True, + loss_mask: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none'), + loss_mask_weight=0.05, + **kwargs): + super().__init__(*args, **kwargs) + self.mask_overlap = mask_overlap + self.loss_mask: nn.Module = MODELS.build(loss_mask) + self.loss_mask_weight = loss_mask_weight + + def loss(self, x: Tuple[Tensor], batch_data_samples: Union[list, + dict]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`], dict): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + + if isinstance(batch_data_samples, list): + # TODO: support non-fast version ins segmention + raise NotImplementedError + else: + outs = self(x) + # Fast version + loss_inputs = outs + (batch_data_samples['bboxes_labels'], + batch_data_samples['masks'], + batch_data_samples['img_metas']) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + coeff_preds: Sequence[Tensor], + proto_preds: Tensor, + batch_gt_instances: Sequence[InstanceData], + batch_gt_masks: Sequence[Tensor], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + coeff_preds (Sequence[Tensor]): Mask coefficient for each scale + level, each is a 4D-tensor, the channel number is + num_priors * mask_channels. 
+ proto_preds (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, mask_channels, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_gt_masks (Sequence[Tensor]): Batch of gt_mask. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + # 1. Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + loss_mask = torch.zeros(1, device=device) + scaled_factor = torch.ones(8, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + loss_mask += coeff_preds[i].sum() * 0 + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 8) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + loss_mask += coeff_preds[i].sum() * 0 + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. 
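            # With the default near_neighbor_thr of 0.5 a matched gt usually
            # gets three positive cells per level: the cell containing its
            # centre plus the nearest horizontal and vertical neighbour
            # cells, the same scheme used by `YOLOv5Head`.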
+            batch_targets_cxcy = batch_targets_scaled[:, 2:4]
+            grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy
+            left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) &
+                        (batch_targets_cxcy > 1)).T
+            right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) &
+                             (grid_xy > 1)).T
+            offset_inds = torch.stack(
+                (torch.ones_like(left), left, up, right, bottom))
+
+            batch_targets_scaled = batch_targets_scaled.repeat(
+                (5, 1, 1))[offset_inds]
+            retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1],
+                                                       1)[offset_inds]
+
+            # prepare pred results and positive sample indexes to
+            # calculate class loss and bbox loss
+            _chunk_targets = batch_targets_scaled.chunk(4, 1)
+            img_class_inds, grid_xy, grid_wh,\
+                priors_targets_inds = _chunk_targets
+            (priors_inds, targets_inds) = priors_targets_inds.long().T
+            (img_inds, class_inds) = img_class_inds.long().T
+
+            grid_xy_long = (grid_xy -
+                            retained_offsets * self.near_neighbor_thr).long()
+            grid_x_inds, grid_y_inds = grid_xy_long.T
+            bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1)
+
+            # 4. Calculate loss
+            # bbox loss
+            retained_bbox_pred = bbox_preds[i].reshape(
+                batch_size, self.num_base_priors, -1, h,
+                w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds]
+            priors_base_sizes_i = priors_base_sizes_i[priors_inds]
+            decoded_bbox_pred = self._decode_bbox_to_xywh(
+                retained_bbox_pred, priors_base_sizes_i)
+            loss_box_i, iou = self.loss_bbox(decoded_bbox_pred, bboxes_targets)
+            loss_box += loss_box_i
+
+            # obj loss
+            iou = iou.detach().clamp(0)
+            target_obj[img_inds, priors_inds, grid_y_inds,
+                       grid_x_inds] = iou.type(target_obj.dtype)
+            loss_obj += self.loss_obj(objectnesses[i],
+                                      target_obj) * self.obj_level_weights[i]
+
+            # cls loss
+            if self.num_classes > 1:
+                pred_cls_scores = cls_scores[i].reshape(
+                    batch_size, self.num_base_priors, -1, h,
+                    w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds]
+
+                target_class = torch.full_like(pred_cls_scores, 0.)
+                target_class[range(batch_targets_scaled.shape[0]),
+                             class_inds] = 1.
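+                # classification loss (BCE in the default config) over the
+                # retained positive predictions against the one-hot targets
+                # built above; unmatched locations contribute nothing here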
+ loss_cls += self.loss_cls(pred_cls_scores, target_class) + else: + loss_cls += cls_scores[i].sum() * 0 + + # mask regression + retained_coeff_preds = coeff_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + _, c, mask_h, mask_w = proto_preds.shape + if batch_gt_masks.shape[-2:] != (mask_h, mask_w): + batch_gt_masks = F.interpolate( + batch_gt_masks[None], (mask_h, mask_w), mode='nearest')[0] + + xywh_normed = batch_targets_scaled[:, 2:6] / scaled_factor[2:6] + area_normed = xywh_normed[:, 2:].prod(1) + xywh_scaled = xywh_normed * torch.tensor( + proto_preds.shape, device=device)[[3, 2, 3, 2]] + xyxy_scaled = bbox_cxcywh_to_xyxy(xywh_scaled) + + for bs in range(batch_size): + match_inds = (img_inds == bs) # matching index + if not match_inds.any(): + continue + + if self.mask_overlap: + mask_gti = torch.where( + batch_gt_masks[bs][None] == + targets_inds[match_inds].view(-1, 1, 1), 1.0, 0.0) + else: + mask_gti = batch_gt_masks[targets_inds][match_inds] + + mask_preds = (retained_coeff_preds[match_inds] + @ proto_preds[bs].view(c, -1)).view( + -1, mask_h, mask_w) + loss_mask_full = self.loss_mask(mask_preds, mask_gti) + loss_mask += ( + self.crop_mask(loss_mask_full[None], + xyxy_scaled[match_inds]).mean(dim=(2, 3)) / + area_normed[match_inds]).mean() + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size, + loss_mask=loss_mask * self.loss_mask_weight * world_size) + + def _convert_gt_to_norm_format(self, + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict]) -> Tensor: + """Add target_inds for instance segmentation.""" + batch_targets_normed = super()._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + if self.mask_overlap: + batch_size = len(batch_img_metas) + target_inds = [] + for i in range(batch_size): + # find number of targets of each image + num_gts = (batch_gt_instances[:, 0] == i).sum() + # (num_anchor, num_gts) + target_inds.append( + torch.arange(num_gts, device=batch_gt_instances.device). + float().view(1, num_gts).repeat(self.num_base_priors, 1) + + 1) + target_inds = torch.cat(target_inds, 1) + else: + num_gts = batch_gt_instances.shape[0] + target_inds = torch.arange( + num_gts, device=batch_gt_instances.device).float().view( + 1, num_gts).repeat(self.num_base_priors, 1) + batch_targets_normed = torch.cat( + [batch_targets_normed, target_inds[..., None]], 2) + return batch_targets_normed + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + coeff_preds: Optional[List[Tensor]] = None, + proto_preds: Optional[Tensor] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted from the head into + bbox results. + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). 
+            objectnesses (list[Tensor], Optional): Score factors for
+                all scale levels, each is a 4D-tensor, has shape
+                (batch_size, 1, H, W).
+            coeff_preds (list[Tensor]): Mask coefficient predictions
+                for all scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * mask_channels, H, W).
+            proto_preds (Tensor): Mask prototype features extracted from the
+                mask head, has shape (batch_size, mask_channels, H, W).
+            batch_img_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to True.
+            with_nms (bool): If True, do NMS before returning boxes.
+                Defaults to True.
+        Returns:
+            list[:obj:`InstanceData`]: Object detection and instance
+            segmentation results of each image after the post process.
+            Each item usually contains the following keys.
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arranged as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, h, w).
+        """
+        assert len(cls_scores) == len(bbox_preds) == len(coeff_preds)
+        if objectnesses is None:
+            with_objectnesses = False
+        else:
+            with_objectnesses = True
+            assert len(cls_scores) == len(objectnesses)
+
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+
+        multi_label = cfg.multi_label
+        multi_label &= self.num_classes > 1
+        cfg.multi_label = multi_label
+
+        num_imgs = len(batch_img_metas)
+        featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
+
+        # If the shape does not change, use the previous mlvl_priors
+        if featmap_sizes != self.featmap_sizes:
+            self.mlvl_priors = self.prior_generator.grid_priors(
+                featmap_sizes,
+                dtype=cls_scores[0].dtype,
+                device=cls_scores[0].device)
+            self.featmap_sizes = featmap_sizes
+        flatten_priors = torch.cat(self.mlvl_priors)
+
+        mlvl_strides = [
+            flatten_priors.new_full(
+                (featmap_size.numel() * self.num_base_priors, ), stride) for
+            featmap_size, stride in zip(featmap_sizes, self.featmap_strides)
+        ]
+        flatten_stride = torch.cat(mlvl_strides)
+
+        # flatten cls_scores, bbox_preds and objectness
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                  self.num_classes)
+            for cls_score in cls_scores
+        ]
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_coeff_preds = [
+            coeff_pred.permute(0, 2, 3,
+                               1).reshape(num_imgs, -1,
+                                          self.head_module.mask_channels)
+            for coeff_pred in coeff_preds
+        ]
+
+        flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
+        flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
+        flatten_decoded_bboxes = self.bbox_coder.decode(
+            flatten_priors.unsqueeze(0), flatten_bbox_preds, flatten_stride)
+
+        flatten_coeff_preds = torch.cat(flatten_coeff_preds, dim=1)
+
+        if with_objectnesses:
+            flatten_objectness = [
+                objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
+                for objectness in objectnesses
+            ]
+            flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
+        else:
+            flatten_objectness = [None for _ in range(len(featmap_sizes))]
+
+        results_list = []
+        for (bboxes, scores, objectness, coeffs, mask_proto,
+             img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores,
+                              flatten_objectness, flatten_coeff_preds,
+                              proto_preds,
batch_img_metas): + ori_shape = img_meta['ori_shape'] + batch_input_shape = img_meta['batch_input_shape'] + input_shape_h, input_shape_w = batch_input_shape + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + input_shape_withoutpad = (input_shape_h - pad_param[0] - + pad_param[1], input_shape_w - + pad_param[2] - pad_param[3]) + else: + pad_param = None + input_shape_withoutpad = batch_input_shape + scale_factor = (input_shape_withoutpad[1] / ori_shape[1], + input_shape_withoutpad[0] / ori_shape[0]) + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + coeffs = coeffs[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + # NOTE: Important + coeffs *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2] + empty_results.masks = torch.zeros( + size=(0, h, w), dtype=torch.bool, device=bboxes.device) + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0], coeffs=coeffs)) + labels = results['labels'] + coeffs = results['coeffs'] + else: + out = filter_scores_and_topk( + scores, score_thr, nms_pre, results=dict(coeffs=coeffs)) + scores, labels, keep_idxs, filtered_results = out + coeffs = filtered_results['coeffs'] + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes[keep_idxs], + coeffs=coeffs) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + + if len(results.bboxes): + masks = self.process_mask(mask_proto, results.coeffs, + results.bboxes, + (input_shape_h, input_shape_w), True) + if rescale: + if pad_param is not None: + # bbox minus pad param + top_pad, _, left_pad, _ = pad_param + results.bboxes -= results.bboxes.new_tensor( + [left_pad, top_pad, left_pad, top_pad]) + # mask crop pad param + top, left = int(top_pad), int(left_pad) + bottom, right = int(input_shape_h - + top_pad), int(input_shape_w - + left_pad) + masks = masks[:, :, top:bottom, left:right] + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + fast_test = cfg.get('fast_test', False) + if fast_test: + masks = F.interpolate( + masks, + size=ori_shape, + mode='bilinear', + align_corners=False) + masks = masks.squeeze(0) + masks = masks > cfg.mask_thr_binary + else: + masks.gt_(cfg.mask_thr_binary) + masks = torch.as_tensor(masks, dtype=torch.uint8) + masks = masks[0].permute(1, 2, + 0).contiguous().cpu().numpy() + masks = mmcv.imresize(masks, + (ori_shape[1], ori_shape[0])) + + if len(masks.shape) == 2: + masks = masks[:, :, None] + masks = torch.from_numpy(masks).permute(2, 0, 1) + + results.bboxes[:, 0::2].clamp_(0, ori_shape[1]) + results.bboxes[:, 1::2].clamp_(0, ori_shape[0]) + + results.masks = masks.bool() + 
results_list.append(results)
+            else:
+                h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2]
+                results.masks = torch.zeros(
+                    size=(0, h, w), dtype=torch.bool, device=bboxes.device)
+                results_list.append(results)
+        return results_list
+
+    def process_mask(self,
+                     mask_proto: Tensor,
+                     mask_coeff_pred: Tensor,
+                     bboxes: Tensor,
+                     shape: Tuple[int, int],
+                     upsample: bool = False) -> Tensor:
+        """Generate mask logits results.
+
+        Args:
+            mask_proto (Tensor): Mask prototype features for a single image.
+                Has shape (mask_channels, H, W).
+            mask_coeff_pred (Tensor): Mask coefficient predictions for a
+                single image. Has shape (num_instance, mask_channels).
+            bboxes (Tensor): Tensor of the bbox. Has shape (num_instance, 4).
+            shape (Tuple): Batch input shape of image.
+            upsample (bool): Whether to upsample mask results to the batch
+                input shape. Defaults to False.
+        Returns:
+            Tensor: Instance segmentation masks for each instance.
+                Has shape (num_instance, H, W).
+        """
+        c, mh, mw = mask_proto.shape  # CHW
+        masks = (
+            mask_coeff_pred @ mask_proto.float().view(c, -1)).sigmoid().view(
+                -1, mh, mw)[None]
+        if upsample:
+            masks = F.interpolate(
+                masks, shape, mode='bilinear', align_corners=False)  # 1CHW
+        masks = self.crop_mask(masks, bboxes)
+        return masks
+
+    def crop_mask(self, masks: Tensor, boxes: Tensor) -> Tensor:
+        """Crop mask by the bounding box.
+
+        Args:
+            masks (Tensor): Predicted mask results. Has shape
+                (1, num_instance, H, W).
+            boxes (Tensor): Tensor of the bbox. Has shape (num_instance, 4).
+        Returns:
+            (torch.Tensor): Masks with all regions outside the corresponding
+                bounding box set to zero.
+        """
+        _, n, h, w = masks.shape
+        x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1)
+        r = torch.arange(
+            w, device=masks.device,
+            dtype=x1.dtype)[None, None, None, :]  # x coords, shape(1, 1, 1, w)
+        c = torch.arange(
+            h, device=masks.device,
+            dtype=x1.dtype)[None, None, :, None]  # y coords, shape(1, 1, h, 1)
+
+        return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
diff --git a/tests/test_datasets/test_transforms/test_formatting.py b/tests/test_datasets/test_transforms/test_formatting.py
new file mode 100644
index 00000000..c75475df
--- /dev/null
+++ b/tests/test_datasets/test_transforms/test_formatting.py
@@ -0,0 +1,119 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+import unittest
+
+import numpy as np
+from mmdet.structures import DetDataSample
+from mmdet.structures.mask import BitmapMasks
+from mmengine.structures import InstanceData, PixelData
+
+from mmyolo.datasets.transforms import PackDetInputs
+
+
+class TestPackDetInputs(unittest.TestCase):
+
+    def setUp(self):
+        """Set up the test inputs used in every test method.
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + data_prefix = osp.join(osp.dirname(__file__), '../../data') + img_path = osp.join(data_prefix, 'color.jpg') + rng = np.random.RandomState(0) + self.results1 = { + 'img_id': 1, + 'img_path': img_path, + 'ori_shape': (300, 400), + 'img_shape': (600, 800), + 'scale_factor': 2.0, + 'flip': False, + 'img': rng.rand(300, 400), + 'gt_seg_map': rng.rand(300, 400), + 'gt_masks': + BitmapMasks(rng.rand(3, 300, 400), height=300, width=400), + 'gt_bboxes_labels': rng.rand(3, ), + 'gt_ignore_flags': np.array([0, 0, 1], dtype=bool), + 'proposals': rng.rand(2, 4), + 'proposals_scores': rng.rand(2, ) + } + self.results2 = { + 'img_id': 1, + 'img_path': img_path, + 'ori_shape': (300, 400), + 'img_shape': (600, 800), + 'scale_factor': 2.0, + 'flip': False, + 'img': rng.rand(300, 400), + 'gt_seg_map': rng.rand(300, 400), + 'gt_masks': + BitmapMasks(rng.rand(3, 300, 400), height=300, width=400), + 'gt_bboxes_labels': rng.rand(3, ), + 'proposals': rng.rand(2, 4), + 'proposals_scores': rng.rand(2, ) + } + self.results3 = { + 'img_id': 1, + 'img_path': img_path, + 'ori_shape': (300, 400), + 'img_shape': (600, 800), + 'scale_factor': 2.0, + 'flip': False, + 'img': rng.rand(300, 400), + 'gt_seg_map': rng.rand(300, 400), + 'gt_masks': + BitmapMasks(rng.rand(3, 300, 400), height=300, width=400), + 'gt_panoptic_seg': rng.rand(1, 300, 400), + 'gt_bboxes_labels': rng.rand(3, ), + 'proposals': rng.rand(2, 4), + 'proposals_scores': rng.rand(2, ) + } + self.meta_keys = ('img_id', 'img_path', 'ori_shape', 'scale_factor', + 'flip') + + def test_transform(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + results = transform(copy.deepcopy(self.results1)) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], DetDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].ignored_instances, + InstanceData) + self.assertEqual(len(results['data_samples'].gt_instances), 2) + self.assertEqual(len(results['data_samples'].ignored_instances), 1) + self.assertIsInstance(results['data_samples'].gt_sem_seg, PixelData) + + def test_transform_without_ignore(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + results = transform(copy.deepcopy(self.results2)) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], DetDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].ignored_instances, + InstanceData) + self.assertEqual(len(results['data_samples'].gt_instances), 3) + self.assertEqual(len(results['data_samples'].ignored_instances), 0) + self.assertIsInstance(results['data_samples'].gt_sem_seg, PixelData) + + def test_transform_with_panoptic_seg(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + results = transform(copy.deepcopy(self.results3)) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], DetDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].ignored_instances, + InstanceData) + self.assertEqual(len(results['data_samples'].gt_instances), 3) + self.assertEqual(len(results['data_samples'].ignored_instances), 0) + self.assertIsInstance(results['data_samples'].gt_sem_seg, PixelData) + self.assertIsInstance(results['data_samples'].gt_panoptic_seg, + 
PixelData) + + def test_repr(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + self.assertEqual( + repr(transform), f'PackDetInputs(meta_keys={self.meta_keys})') diff --git a/tests/test_datasets/test_transforms/test_transforms.py b/tests/test_datasets/test_transforms/test_transforms.py index fc46151d..a8b7ea49 100644 --- a/tests/test_datasets/test_transforms/test_transforms.py +++ b/tests/test_datasets/test_transforms/test_transforms.py @@ -148,18 +148,21 @@ class TestLetterResize(unittest.TestCase): self.assertIn('pad_param', data_info) pad_param = data_info['pad_param'].reshape(-1, 2).sum( 1) # (top, b, l, r) -> (h, w) - scale_factor = np.asarray( - data_info['scale_factor'])[::-1] # (w, h) -> (h, w) - scale_factor_keepratio = np.min( - np.asarray((32, 32)) / (input_h, input_w)) - validate_shape = np.floor( - np.asarray((input_h, input_w)) * scale_factor_keepratio + 0.5) - scale_factor_keepratio = np.floor(scale_factor_keepratio * - input_h + 0.5) / input_h - scale_factor_letter = (output_h, output_w) / validate_shape - scale_factor_letter = ( - scale_factor_letter - - (pad_param / validate_shape))[np.argmin(scale_factor_letter)] + scale_factor = np.asarray(data_info['scale_factor']) # (w, h) + + max_long_edge = max((32, 32)) + max_short_edge = min((32, 32)) + scale_factor_keepratio = min( + max_long_edge / max(input_h, input_w), + max_short_edge / min(input_h, input_w)) + validate_shape = np.asarray( + (int(input_h * scale_factor_keepratio), + int(input_w * scale_factor_keepratio))) + scale_factor_keepratio = np.asarray( + (validate_shape[1] / input_w, validate_shape[0] / input_h)) + + scale_factor_letter = ((np.asarray( + (output_h, output_w)) - pad_param) / validate_shape)[::-1] self.assertTrue(data_info['img_shape'][:2] == (output_h, output_w)) self.assertTrue((scale_factor == (scale_factor_keepratio * scale_factor_letter)).all()) diff --git a/tests/test_models/test_dense_heads/test_yolov5_head.py b/tests/test_models/test_dense_heads/test_yolov5_head.py index 31b399bf..974b9a98 100644 --- a/tests/test_models/test_dense_heads/test_yolov5_head.py +++ b/tests/test_models/test_dense_heads/test_yolov5_head.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
 from unittest import TestCase
 
+import numpy as np
 import torch
 from mmengine.config import Config
 from mmengine.structures import InstanceData
 
-from mmyolo.models.dense_heads import YOLOv5Head
+from mmyolo.models.dense_heads import YOLOv5Head, YOLOv5InsHead
 from mmyolo.utils import register_all_modules
 
 register_all_modules()
@@ -234,3 +235,177 @@ class TestYOLOv5Head(TestCase):
                              'box loss should be non-zero')
         self.assertGreater(onegt_obj_loss.item(), 0,
                            'obj loss should be non-zero')
+
+
+class TestYOLOv5InsHead(TestCase):
+
+    def setUp(self):
+        self.head_module = dict(
+            type='YOLOv5InsHeadModule',
+            num_classes=4,
+            in_channels=[32, 64, 128],
+            featmap_strides=[8, 16, 32],
+            mask_channels=32,
+            proto_channels=32,
+            widen_factor=1.0)
+
+    def test_init_weights(self):
+        head = YOLOv5InsHead(head_module=self.head_module)
+        head.head_module.init_weights()
+
+    def test_predict_by_feat(self):
+        s = 256
+        img_metas = [{
+            'img_shape': (s, s, 3),
+            'ori_shape': (s, s, 3),
+            'batch_input_shape': (s, s),
+            'scale_factor': (1.0, 1.0),
+        }]
+        test_cfg = Config(
+            dict(
+                multi_label=True,
+                nms_pre=30000,
+                min_bbox_size=0,
+                score_thr=0.001,
+                nms=dict(type='nms', iou_threshold=0.6),
+                max_per_img=300,
+                mask_thr_binary=0.5))
+
+        head = YOLOv5InsHead(head_module=self.head_module, test_cfg=test_cfg)
+        head.eval()
+
+        feat = []
+        for i in range(len(self.head_module['in_channels'])):
+            in_channel = self.head_module['in_channels'][i]
+            feat_size = self.head_module['featmap_strides'][i]
+            feat.append(
+                torch.rand(1, in_channel, s // feat_size, s // feat_size))
+
+        with torch.no_grad():
+            res = head.forward(feat)
+            cls_scores, bbox_preds, objectnesses,\
+                coeff_preds, proto_preds = res
+            head.predict_by_feat(
+                cls_scores,
+                bbox_preds,
+                objectnesses,
+                coeff_preds,
+                proto_preds,
+                img_metas,
+                cfg=test_cfg,
+                rescale=True,
+                with_nms=True)
+
+            with self.assertRaises(AssertionError):
+                head.predict_by_feat(
+                    cls_scores,
+                    bbox_preds,
+                    coeff_preds,
+                    proto_preds,
+                    img_metas,
+                    cfg=test_cfg,
+                    rescale=True,
+                    with_nms=False)
+
+    def test_loss_by_feat(self):
+        s = 256
+        img_metas = [{
+            'img_shape': (s, s, 3),
+            'batch_input_shape': (s, s),
+            'scale_factor': 1,
+        }]
+
+        head = YOLOv5InsHead(head_module=self.head_module)
+        rng = np.random.RandomState(0)
+
+        feat = []
+        for i in range(len(self.head_module['in_channels'])):
+            in_channel = self.head_module['in_channels'][i]
+            feat_size = self.head_module['featmap_strides'][i]
+            feat.append(
+                torch.rand(1, in_channel, s // feat_size, s // feat_size))
+
+        cls_scores, bbox_preds, objectnesses,\
+            coeff_preds, proto_preds = head.forward(feat)
+
+        # Test that empty ground truth encourages the network to predict
+        # background
+        gt_bboxes_labels = torch.empty((0, 6))
+        gt_masks = rng.rand(0, s // 4, s // 4)
+
+        empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds,
+                                            objectnesses, coeff_preds,
+                                            proto_preds, gt_bboxes_labels,
+                                            gt_masks, img_metas)
+        # When there is no truth, the cls, box and mask losses should be
+        # zero and only the obj loss should be non-zero.
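+        # (For this head, cls and mask losses are only computed over matched
+        # positive samples, so with no gt boxes they reduce to exactly zero,
+        # while the all-zero objectness targets still give a non-zero BCE.)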
+        empty_cls_loss = empty_gt_losses['loss_cls'].sum()
+        empty_box_loss = empty_gt_losses['loss_bbox'].sum()
+        empty_obj_loss = empty_gt_losses['loss_obj'].sum()
+        empty_mask_loss = empty_gt_losses['loss_mask'].sum()
+        self.assertEqual(
+            empty_cls_loss.item(), 0,
+            'there should be no cls loss when there are no true boxes')
+        self.assertEqual(
+            empty_box_loss.item(), 0,
+            'there should be no box loss when there are no true boxes')
+        self.assertGreater(empty_obj_loss.item(), 0,
+                           'objectness loss should be non-zero')
+        self.assertEqual(
+            empty_mask_loss.item(), 0,
+            'there should be no mask loss when there are no true masks')
+
+        # When truth is non-empty then both cls and box loss should be nonzero
+        # for random inputs
+        head = YOLOv5InsHead(head_module=self.head_module)
+
+        bboxes = torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])
+        labels = torch.Tensor([1.])
+        batch_id = torch.LongTensor([0])
+        gt_bboxes_labels = torch.cat([batch_id[None], labels[None], bboxes],
+                                     dim=1)
+        gt_masks = torch.from_numpy(rng.rand(1, s // 4, s // 4)).int()
+
+        one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses,
+                                          coeff_preds, proto_preds,
+                                          gt_bboxes_labels, gt_masks,
+                                          img_metas)
+        onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+        onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+        onegt_obj_loss = one_gt_losses['loss_obj'].sum()
+        onegt_mask_loss = one_gt_losses['loss_mask'].sum()
+        self.assertGreater(onegt_cls_loss.item(), 0,
+                           'cls loss should be non-zero')
+        self.assertGreater(onegt_box_loss.item(), 0,
+                           'box loss should be non-zero')
+        self.assertGreater(onegt_obj_loss.item(), 0,
+                           'obj loss should be non-zero')
+        self.assertGreater(onegt_mask_loss.item(), 0,
+                           'mask loss should be non-zero')
+
+        # test num_classes = 1
+        self.head_module['num_classes'] = 1
+        head = YOLOv5InsHead(head_module=self.head_module)
+        bboxes = torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])
+        labels = torch.Tensor([1.])
+        batch_id = torch.LongTensor([0])
+        gt_bboxes_labels = torch.cat([batch_id[None], labels[None], bboxes],
+                                     dim=1)
+        gt_masks = torch.from_numpy(rng.rand(1, s // 4, s // 4)).int()
+
+        one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses,
+                                          coeff_preds, proto_preds,
+                                          gt_bboxes_labels, gt_masks,
+                                          img_metas)
+        onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+        onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+        onegt_obj_loss = one_gt_losses['loss_obj'].sum()
+        onegt_mask_loss = one_gt_losses['loss_mask'].sum()
+        self.assertEqual(onegt_cls_loss.item(), 0,
+                         'cls loss should be zero when num_classes equals 1')
+        self.assertGreater(onegt_box_loss.item(), 0,
+                           'box loss should be non-zero')
+        self.assertGreater(onegt_obj_loss.item(), 0,
+                           'obj loss should be non-zero')
+        self.assertGreater(onegt_mask_loss.item(), 0,
+                           'mask loss should be non-zero')
diff --git a/tools/model_converters/yolov5_to_mmyolo.py b/tools/model_converters/yolov5_to_mmyolo.py
index c1d4e41d..a4e62a2f 100644
--- a/tools/model_converters/yolov5_to_mmyolo.py
+++ b/tools/model_converters/yolov5_to_mmyolo.py
@@ -25,6 +25,7 @@ convert_dict_p5 = {
     'model.21': 'neck.downsample_layers.1',
     'model.23': 'neck.bottom_up_layers.1',
     'model.24.m': 'bbox_head.head_module.convs_pred',
+    'model.24.proto': 'bbox_head.head_module.proto_preds',
 }
 
 convert_dict_p6 = {
@@ -54,6 +55,7 @@ convert_dict_p6 = {
     'model.30': 'neck.downsample_layers.2',
     'model.32': 'neck.bottom_up_layers.2',
     'model.33.m': 'bbox_head.head_module.convs_pred',
+    'model.33.proto': 'bbox_head.head_module.proto_preds',
 }
 
 
@@ -94,6 +96,10 @@ def convert(src, dst):
         if
'.m.' in new_key: new_key = new_key.replace('.m.', '.blocks.') new_key = new_key.replace('.cv', '.conv') + elif 'bbox_head.head_module.proto_preds.cv' in new_key: + new_key = new_key.replace( + 'bbox_head.head_module.proto_preds.cv', + 'bbox_head.head_module.proto_preds.conv') else: new_key = new_key.replace('.cv1', '.main_conv') new_key = new_key.replace('.cv2', '.short_conv') diff --git a/tools/model_converters/yolov8_to_mmyolo.py b/tools/model_converters/yolov8_to_mmyolo.py index df0c514b..4ed64f24 100644 --- a/tools/model_converters/yolov8_to_mmyolo.py +++ b/tools/model_converters/yolov8_to_mmyolo.py @@ -53,6 +53,19 @@ def convert(src, dst): if '.m.' in new_key: new_key = new_key.replace('.m.', '.blocks.') new_key = new_key.replace('.cv', '.conv') + elif 'bbox_head.head_module.proto.cv' in new_key: + new_key = new_key.replace( + 'bbox_head.head_module.proto.cv', + 'bbox_head.head_module.proto_preds.conv') + elif 'bbox_head.head_module.proto' in new_key: + new_key = new_key.replace('bbox_head.head_module.proto', + 'bbox_head.head_module.proto_preds') + elif 'bbox_head.head_module.cv4.' in new_key: + new_key = new_key.replace( + 'bbox_head.head_module.cv4', + 'bbox_head.head_module.mask_coeff_preds') + new_key = new_key.replace('.2.weight', '.2.conv.weight') + new_key = new_key.replace('.2.bias', '.2.conv.bias') elif 'bbox_head.head_module' in new_key: new_key = new_key.replace('.cv2', '.reg_preds') new_key = new_key.replace('.cv3', '.cls_preds')
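
Note (illustrative, not part of the patch): the mask decoding added in `yolov5_ins_head.py` above follows the YOLACT-style prototype/coefficient scheme, i.e. each detection's mask is a sigmoid of a linear combination of the shared prototype maps, then cropped to that detection's box. A minimal standalone sketch of the same arithmetic performed by `process_mask`/`crop_mask`, with made-up shapes and tensor names:

    import torch

    # 32 prototype maps at 1/4 of the input resolution (e.g. 640x640 input)
    protos = torch.randn(32, 160, 160)
    # one 32-d coefficient vector per kept detection (here 5 detections)
    coeffs = torch.randn(5, 32)
    # boxes in prototype-map coordinates, xyxy
    boxes = torch.tensor([[10., 20., 120., 150.]]).repeat(5, 1)

    c, mh, mw = protos.shape
    # per-instance linear combination of prototypes, then sigmoid
    masks = (coeffs @ protos.view(c, -1)).sigmoid().view(-1, mh, mw)

    # crop: zero out everything outside each instance's box
    x1, y1, x2, y2 = boxes[:, :, None].chunk(4, 1)            # each (5, 1, 1)
    xs = torch.arange(mw, dtype=boxes.dtype)[None, None, :]   # (1, 1, mw)
    ys = torch.arange(mh, dtype=boxes.dtype)[None, :, None]   # (1, mh, 1)
    masks = masks * ((xs >= x1) * (xs < x2) * (ys >= y1) * (ys < y2))
    # masks: (5, 160, 160) soft masks, thresholded later (mask_thr_binary)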