From 5fb0a86f694d965dc21cb591ae83e519408c9f71 Mon Sep 17 00:00:00 2001 From: Chen Jiayu <38110862+tuofeilunhifi@users.noreply.github.com> Date: Tue, 12 Jul 2022 18:07:02 +0800 Subject: [PATCH] support fcos (#100) support fcos(38.57) --- configs/detection/fcos/coco_detection.py | 90 ++ configs/detection/fcos/fcos.py | 51 ++ ...nreg-giou_r50_caffe_fpn_gn-head_1x_coco.py | 57 ++ docs/source/model_zoo_det.md | 5 + easycv/core/evaluation/coco_tools.py | 1 - .../datasets/detection/data_sources/coco.py | 15 - easycv/models/detection/__init__.py | 14 +- easycv/models/detection/fcos/__init__.py | 1 + easycv/models/detection/fcos/fcos_head.py | 865 ++++++++++++++++++ .../detection/{vitdet => necks}/__init__.py | 1 + easycv/models/detection/necks/fpn.py | 204 +++++ .../models/detection/{vitdet => necks}/sfp.py | 0 easycv/models/detection/utils/__init__.py | 6 +- easycv/models/detection/utils/boxes.py | 101 +- tests/models/detection/fcos/test_fcos.py | 206 +++++ 15 files changed, 1591 insertions(+), 26 deletions(-) create mode 100644 configs/detection/fcos/coco_detection.py create mode 100644 configs/detection/fcos/fcos.py create mode 100644 configs/detection/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py create mode 100644 easycv/models/detection/fcos/__init__.py create mode 100644 easycv/models/detection/fcos/fcos_head.py rename easycv/models/detection/{vitdet => necks}/__init__.py (50%) create mode 100644 easycv/models/detection/necks/fpn.py rename easycv/models/detection/{vitdet => necks}/sfp.py (100%) create mode 100644 tests/models/detection/fcos/test_fcos.py diff --git a/configs/detection/fcos/coco_detection.py b/configs/detection/fcos/coco_detection.py new file mode 100644 index 00000000..413291ea --- /dev/null +++ b/configs/detection/fcos/coco_detection.py @@ -0,0 +1,90 @@ +CLASSES = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', + 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', + 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', + 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', + 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', + 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush' +] + +# dataset settings +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) + +train_pipeline = [ + dict(type='MMResize', img_scale=(1333, 800), keep_ratio=True), + dict(type='MMRandomFlip', flip_ratio=0.5), + dict(type='MMNormalize', **img_norm_cfg), + dict(type='MMPad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels'], + meta_keys=('filename', 'ori_filename', 'ori_shape', 'ori_img_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction', 'img_norm_cfg')) +] +test_pipeline = [ + dict( + type='MMMultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='MMResize', keep_ratio=True), + dict(type='MMRandomFlip'), + 
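+        # With flip=False above, MMMultiScaleFlipAug passes flip=False down
+        # to MMRandomFlip, so images are not actually flipped at test time;
+        # the transform only fills in the flip-related meta keys.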
dict(type='MMNormalize', **img_norm_cfg), + dict(type='MMPad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict( + type='Collect', + keys=['img'], + meta_keys=('filename', 'ori_filename', 'ori_shape', + 'ori_img_shape', 'img_shape', 'pad_shape', + 'scale_factor', 'flip', 'flip_direction', + 'img_norm_cfg')) + ]) +] + +train_dataset = dict( + type='DetDataset', + data_source=dict( + type='DetSourceCoco', + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ], + classes=CLASSES, + test_mode=False, + filter_empty_gt=True, + iscrowd=False), + pipeline=train_pipeline) + +val_dataset = dict( + type='DetDataset', + imgs_per_gpu=1, + data_source=dict( + type='DetSourceCoco', + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ], + classes=CLASSES, + test_mode=True, + filter_empty_gt=False, + iscrowd=True), + pipeline=test_pipeline) + +data = dict( + imgs_per_gpu=2, workers_per_gpu=2, train=train_dataset, val=val_dataset) diff --git a/configs/detection/fcos/fcos.py b/configs/detection/fcos/fcos.py new file mode 100644 index 00000000..27abbd13 --- /dev/null +++ b/configs/detection/fcos/fcos.py @@ -0,0 +1,51 @@ +# model settings +model = dict( + type='Detection', + pretrained= + 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/pretrained_models/easycv/resnet/detectron/resnet50_caffe.pth', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3, 4), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', # use P5 + num_outs=5, + relu_before_extra_convs=True), + head=dict( + type='FCOSHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + center_sampling=True, + center_sample_radius=1.5, + norm_on_bbox=True, + centerness_on_reg=True, + conv_cfg=None, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + conv_bias=True, + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100))) diff --git a/configs/detection/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py b/configs/detection/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py new file mode 100644 index 00000000..55939f60 --- /dev/null +++ b/configs/detection/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py @@ -0,0 +1,57 @@ +_base_ = ['./fcos.py', './coco_detection.py', 'configs/base.py'] + +CLASSES = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', + 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', + 'baseball 
bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', + 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', + 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', + 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush' +] + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=10) +# optimizer +optimizer = dict( + type='SGD', + lr=0.01, + momentum=0.9, + weight_decay=0.0001, + paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.)) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) + +total_epochs = 12 + +# evaluation +eval_config = dict(initial=True, interval=1, gpu_collect=False) +# eval_config = dict(interval=1, gpu_collect=False) +eval_pipelines = [ + dict( + mode='test', + evaluators=[ + dict(type='CocoDetectionEvaluator', classes=CLASSES), + ], + ) +] + +find_unused_parameters = False diff --git a/docs/source/model_zoo_det.md b/docs/source/model_zoo_det.md index 67469a69..172a2ad2 100644 --- a/docs/source/model_zoo_det.md +++ b/docs/source/model_zoo_det.md @@ -19,6 +19,11 @@ Pretrained on COCO2017 dataset. | ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | | ViTDet_MaskRCNN | [vitdet_maskrcnn](https://github.com/alibaba/EasyCV/tree/master/configs/detection/vitdet/vitdet_100e.py) | 50.57 | 44.96 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn.log.json) | +## FCOS + +| Algorithm | Config | mAPval
0.5:0.95 | AP<sup>val</sup><br/>
50 | Download | +| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ | +| FCOS-r50 | [fcos-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py) | 38.58 | 57.18 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/20220621_121315.log.json) | ## DETR | Algorithm | Config | bbox_mAPval
0.5:0.95 | AP<sup>val</sup><br/>
50 | Download | diff --git a/easycv/core/evaluation/coco_tools.py b/easycv/core/evaluation/coco_tools.py index 80b1fbbd..424e3a12 100644 --- a/easycv/core/evaluation/coco_tools.py +++ b/easycv/core/evaluation/coco_tools.py @@ -546,7 +546,6 @@ def ExportSingleImageDetectionBoxesToCoco(image_id, category_id_set, do not have the right lengths or (2) if each of the elements inside these lists do not have the correct shapes or (3) if image_ids are not integers. """ - assert len(detection_classes.shape) == 1 and len(detection_scores.shape) == 1, \ 'All entries in detection_classes and detection_scores expected to be of rank 1.' assert len(detection_boxes.shape) == 2,\ diff --git a/easycv/datasets/detection/data_sources/coco.py b/easycv/datasets/detection/data_sources/coco.py index c66ca212..c7527fa7 100644 --- a/easycv/datasets/detection/data_sources/coco.py +++ b/easycv/datasets/detection/data_sources/coco.py @@ -68,10 +68,8 @@ class DetSourceCoco(object): def load_annotations(self, ann_file): """Load annotation from COCO style annotation file. - Args: ann_file (str): Path of annotation file. - Returns: list[dict]: Annotation info from COCO api. """ @@ -97,10 +95,8 @@ class DetSourceCoco(object): def get_ann_info(self, idx): """Get COCO annotation by index. - Args: idx (int): Index of data. - Returns: dict: Annotation info of specified index. """ @@ -112,10 +108,8 @@ class DetSourceCoco(object): def get_cat_ids(self, idx): """Get COCO category ids by index. - Args: idx (int): Index of data. - Returns: list[int]: All categories in the image of specified index. """ @@ -151,7 +145,6 @@ class DetSourceCoco(object): def _set_group_flag(self): """Set flag according to image aspect ratio. - Images with aspect ratio greater than 1 will be set as group 1, otherwise group 0. """ @@ -163,11 +156,9 @@ class DetSourceCoco(object): def _parse_ann_info(self, img_info, ann_info): """Parse bbox and mask annotation. - Args: ann_info (list[dict]): Annotation info of an image. with_mask (bool): Whether to parse mask annotations. - Returns: dict: A dict containing the following keys: bboxes, bboxes_ignore,\ labels, masks, seg_map. "masks" are raw annotations and not \ @@ -241,11 +232,9 @@ class DetSourceCoco(object): def xyxy2xywh(self, bbox): """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO evaluation. - Args: bbox (numpy.ndarray): The bounding boxes, shape (4, ), in ``xyxy`` order. - Returns: list[float]: The converted bounding boxes, in ``xywh`` order. """ @@ -299,10 +288,8 @@ class DetSourceCoco(object): def prepare_train_img(self, idx): """Get training data and annotations after pipeline. - Args: idx (int): Index of data. - Returns: dict: Training data and annotation after pipeline with new keys \ introduced by pipeline. @@ -316,10 +303,8 @@ class DetSourceCoco(object): def __getitem__(self, idx): """Get training/test data after pipeline. - Args: idx (int): Index of data. - Returns: dict: Training/test data (with annotation if `test_mode` is set \ True). diff --git a/easycv/models/detection/__init__.py b/easycv/models/detection/__init__.py index 9d40b12a..cb3e45d4 100644 --- a/easycv/models/detection/__init__.py +++ b/easycv/models/detection/__init__.py @@ -1,20 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright (c) Alibaba, Inc. and its affiliates. 
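+# YOLOX / YOLOX_EDGE are imported in try/except blocks below because they
+# depend on the installed CUDA and PyTorch versions; a failed import is
+# logged instead of raised.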
import logging -from .dab_detr import DABDETRHead, DABDetrTransformer -from .detection import Detection -from .detr import DETRHead, DetrTransformer -from .vitdet import SFP +from easycv.models.detection.dab_detr import DABDETRHead, DABDetrTransformer +from easycv.models.detection.detection import Detection +from easycv.models.detection.detr import DETRHead, DetrTransformer +from easycv.models.detection.fcos import FCOSHead +from easycv.models.detection.necks import FPN, SFP try: - from .yolox.yolox import YOLOX + from easycv.models.detection.yolox.yolox import YOLOX except Exception as e: logging.info(f'Exception: {e}') logging.info( 'Import YOLOX failed! please check your CUDA & Pytorch Version') try: - from .yolox_edge.yolox_edge import YOLOX_EDGE + from easycv.models.detection.yolox_edge.yolox_edge import YOLOX_EDGE except Exception as e: logging.info(f'Exception: {e}') logging.info( diff --git a/easycv/models/detection/fcos/__init__.py b/easycv/models/detection/fcos/__init__.py new file mode 100644 index 00000000..a79403f3 --- /dev/null +++ b/easycv/models/detection/fcos/__init__.py @@ -0,0 +1 @@ +from .fcos_head import FCOSHead diff --git a/easycv/models/detection/fcos/fcos_head.py b/easycv/models/detection/fcos/fcos_head.py new file mode 100644 index 00000000..aaa60864 --- /dev/null +++ b/easycv/models/detection/fcos/fcos_head.py @@ -0,0 +1,865 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, Scale + +from easycv.models.builder import HEADS, build_loss +from easycv.models.detection.utils import (MlvlPointGenerator, batched_nms, + bbox2result, distance2bbox, + filter_scores_and_topk, + select_single_mlvl) +from easycv.models.utils import reduce_mean +from easycv.utils.misc import multi_apply + +INF = 1e8 + + +@HEADS.register_module() +class FCOSHead(nn.Module): + """Anchor-free head used in `FCOS `_. + + The FCOS head does not use anchor boxes. Instead bounding boxes are + predicted at each pixel and a centerness measure is used to suppress + low-quality predictions. + Here norm_on_bbox, centerness_on_reg, dcn_on_last_conv are training + tricks used in official repo, which will bring remarkable mAP gains + of up to 4.9. Please see https://github.com/tianzhi0549/FCOS for + more detail. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + strides (list[int] | list[tuple[int, int]]): Strides of points + in multiple feature levels. Default: (4, 8, 16, 32, 64). + regress_ranges (tuple[tuple[int, int]]): Regress range of multiple + level points. + center_sampling (bool): If true, use center sampling. Default: False. + center_sample_radius (float): Radius of center sampling. Default: 1.5. + norm_on_bbox (bool): If true, normalize the regression targets + with FPN strides. Default: False. + centerness_on_reg (bool): If true, position centerness on the + regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042. + Default: False. + conv_bias (bool | str): If specified as `auto`, it will be decided by the + norm_cfg. Bias of conv will be set as True if `norm_cfg` is None, otherwise + False. Default: "auto". + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + loss_centerness (dict): Config of centerness loss. + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + + Example: + >>> self = FCOSHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_score, bbox_pred, centerness = self.forward(feats) + >>> assert len(cls_score) == len(self.scales) + """ # noqa: E501 + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, INF)), + center_sampling=False, + center_sample_radius=1.5, + norm_on_bbox=False, + centerness_on_reg=False, + conv_cfg=None, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + conv_bias=True, + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100), + **kwargs): + super(FCOSHead, self).__init__() + + self.regress_ranges = regress_ranges + self.center_sampling = center_sampling + self.center_sample_radius = center_sample_radius + self.norm_on_bbox = norm_on_bbox + self.centerness_on_reg = centerness_on_reg + + self.num_classes = num_classes + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + + self.prior_generator = MlvlPointGenerator(strides) + + # In order to keep a more general interface and be consistent with + # anchor_head. 
We can think of point like one anchor + self.num_base_priors = self.prior_generator.num_base_priors[0] + + self.test_cfg = test_cfg + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self._init_layers() + + self.loss_centerness = build_loss(loss_centerness) + + def _init_layers(self): + """Initialize layers of the head.""" + self._init_cls_convs() + self._init_reg_convs() + self._init_predictor() + self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1) + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def _init_cls_convs(self): + """Initialize classification conv layers of the head.""" + self.cls_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + conv_cfg = self.conv_cfg + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_reg_convs(self): + """Initialize bbox regression conv layers of the head.""" + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + conv_cfg = self.conv_cfg + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_predictor(self): + """Initialize predictor layers of the head.""" + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + torch.nn.init.normal_(m.weight, std=0.01) + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias, 0) + + # initialize the bias for focal loss + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + torch.nn.init.constant_(self.conv_cls.bias, bias_value) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + cls_scores (list[Tensor]): Box scores for each scale level, \ + each is a 4D-tensor, the channel number is \ + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each \ + scale level, each is a 4D-tensor, the channel number is \ + num_points * 4. + centernesses (list[Tensor]): centerness for each scale level, \ + each is a 4D-tensor, the channel number is num_points * 1. + """ + return multi_apply(self.forward_single, feats, self.scales, + self.strides) + + def forward_single(self, x, scale, stride): + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple: scores for each class, bbox predictions and centerness \ + predictions of input feature maps. 
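+
+                For an input feature map of shape (N, C, H, W), the outputs
+                have shapes (N, self.cls_out_channels, H, W) for cls_score,
+                (N, 4, H, W) for bbox_pred and (N, 1, H, W) for centerness.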
+ """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.conv_cls(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = self.conv_reg(reg_feat) + + if self.centerness_on_reg: + centerness = self.conv_centerness(reg_feat) + else: + centerness = self.conv_centerness(cls_feat) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + if self.norm_on_bbox: + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + if not self.training: + bbox_pred *= stride + else: + bbox_pred = bbox_pred.exp() + return cls_score, bbox_pred, centerness + + def forward_train(self, + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_bboxes_ignore=None, + proposal_cfg=None, + **kwargs): + outs = self.forward(x) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, img_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_test(self, feats, img_metas, rescale=False): + """Test function without test-time augmentation. + + Args: + feats (tuple[torch.Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n, ). + """ + outs = self.forward(feats) + results_list = self.get_bboxes( + *outs, img_metas=img_metas, rescale=True) + results = [ + bbox2result(det_bboxes, det_labels, self.num_classes) + for det_bboxes, det_labels in results_list + ] + + detection_boxes = [] + detection_scores = [] + detection_classes = [] + for res_i in results: + bbox_result = res_i + bboxes = np.vstack(bbox_result) + labels = [ + np.full(bbox.shape[0], i, dtype=np.int32) + for i, bbox in enumerate(bbox_result) + ] + labels = np.concatenate(labels) + + scores = bboxes[:, 4] if bboxes.shape[1] == 5 else None + bboxes = bboxes[:, 0:4] if bboxes.shape[1] == 5 else bboxes + assert bboxes.shape[1] == 4 + + detection_boxes.append(bboxes) + detection_scores.append(scores) + detection_classes.append(labels) + + assert len(img_metas) == 1 + outputs = { + 'detection_boxes': detection_boxes, + 'detection_scores': detection_scores, + 'detection_classes': detection_classes, + 'img_metas': img_metas + } + + return outputs + + def loss(self, + cls_scores, + bbox_preds, + centernesses, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + centernesses (list[Tensor]): centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. 
+ gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) == len(centernesses) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + labels, bbox_targets = self.get_targets(all_level_points, gt_bboxes, + gt_labels) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds and centerness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_centerness = torch.cat(flatten_centerness) + flatten_labels = torch.cat(labels) + flatten_bbox_targets = torch.cat(bbox_targets) + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((flatten_labels >= 0) + & (flatten_labels < bg_class_ind)).nonzero().reshape(-1) + num_pos = torch.tensor( + len(pos_inds), dtype=torch.float, device=bbox_preds[0].device) + num_pos = max(reduce_mean(num_pos), 1.0) + loss_cls = self.loss_cls( + flatten_cls_scores, flatten_labels, avg_factor=num_pos) + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_centerness_targets = self.centerness_target(pos_bbox_targets) + # centerness weighted iou loss + centerness_denorm = max( + reduce_mean(pos_centerness_targets.sum().detach()), 1e-6) + + if len(pos_inds) > 0: + pos_points = flatten_points[pos_inds] + pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds) + pos_decoded_target_preds = distance2bbox(pos_points, + pos_bbox_targets) + loss_bbox = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + weight=pos_centerness_targets, + avg_factor=centerness_denorm) + loss_centerness = self.loss_centerness( + pos_centerness, pos_centerness_targets, avg_factor=num_pos) + else: + loss_bbox = pos_bbox_preds.sum() + loss_centerness = pos_centerness.sum() + + return dict( + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_centerness=loss_centerness) + + def get_targets(self, points, gt_bboxes_list, gt_labels_list): + """Compute regression, classification and centerness targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + + Returns: + tuple: + concat_lvl_labels (list[Tensor]): Labels of each level. 
\ + concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ + level. + """ + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + # get labels and bbox_targets of each image + labels_list, bbox_targets_list = multi_apply( + self._get_target_single, + gt_bboxes_list, + gt_labels_list, + points=concat_points, + regress_ranges=concat_regress_ranges, + num_points_per_lvl=num_points) + + # split to per img, per level + labels_list = [labels.split(num_points, 0) for labels in labels_list] + bbox_targets_list = [ + bbox_targets.split(num_points, 0) + for bbox_targets in bbox_targets_list + ] + + # concat per level image + concat_lvl_labels = [] + concat_lvl_bbox_targets = [] + for i in range(num_levels): + concat_lvl_labels.append( + torch.cat([labels[i] for labels in labels_list])) + bbox_targets = torch.cat( + [bbox_targets[i] for bbox_targets in bbox_targets_list]) + if self.norm_on_bbox: + bbox_targets = bbox_targets / self.strides[i] + concat_lvl_bbox_targets.append(bbox_targets) + return concat_lvl_labels, concat_lvl_bbox_targets + + def _get_target_single(self, gt_bboxes, gt_labels, points, regress_ranges, + num_points_per_lvl): + """Compute regression and classification targets for a single image.""" + num_points = points.size(0) + num_gts = gt_labels.size(0) + if num_gts == 0: + return gt_labels.new_full((num_points,), self.num_classes), \ + gt_bboxes.new_zeros((num_points, 4)) + + areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + # TODO: figure out why these two are different + # areas = areas[None].expand(num_points, num_gts) + areas = areas[None].repeat(num_points, 1) + regress_ranges = regress_ranges[:, None, :].expand( + num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + xs, ys = points[:, 0], points[:, 1] + xs = xs[:, None].expand(num_points, num_gts) + ys = ys[:, None].expand(num_points, num_gts) + + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + bbox_targets = torch.stack((left, top, right, bottom), -1) + + if self.center_sampling: + # condition1: inside a `center bbox` + radius = self.center_sample_radius + center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2 + center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2 + center_gts = torch.zeros_like(gt_bboxes) + stride = center_xs.new_zeros(center_xs.shape) + + # project the points on current lvl back to the `original` sizes + lvl_begin = 0 + for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl): + lvl_end = lvl_begin + num_points_lvl + stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius + lvl_begin = lvl_end + + x_mins = center_xs - stride + y_mins = center_ys - stride + x_maxs = center_xs + stride + y_maxs = center_ys + stride + center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0], + x_mins, gt_bboxes[..., 0]) + center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1], + y_mins, gt_bboxes[..., 1]) + center_gts[..., 2] = torch.where(x_maxs > gt_bboxes[..., 2], + gt_bboxes[..., 2], 
x_maxs) + center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3], + gt_bboxes[..., 3], y_maxs) + + cb_dist_left = xs - center_gts[..., 0] + cb_dist_right = center_gts[..., 2] - xs + cb_dist_top = ys - center_gts[..., 1] + cb_dist_bottom = center_gts[..., 3] - ys + center_bbox = torch.stack( + (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1) + inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 + else: + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0 + + # condition2: limit the regression range for each location + max_regress_distance = bbox_targets.max(-1)[0] + inside_regress_range = ( + (max_regress_distance >= regress_ranges[..., 0]) + & (max_regress_distance <= regress_ranges[..., 1])) + + # if there are still more than one objects for a location, + # we choose the one with minimal area + areas[inside_gt_bbox_mask == 0] = INF + areas[inside_regress_range == 0] = INF + min_area, min_area_inds = areas.min(dim=1) + + labels = gt_labels[min_area_inds] + labels[min_area == INF] = self.num_classes # set as BG + bbox_targets = bbox_targets[range(num_points), min_area_inds] + + return labels, bbox_targets + + def centerness_target(self, pos_bbox_targets): + """Compute centerness targets. + + Args: + pos_bbox_targets (Tensor): BBox targets of positive bboxes in shape + (num_pos, 4) + + Returns: + Tensor: Centerness target. + """ + # only calculate pos centerness targets, otherwise there may be nan + left_right = pos_bbox_targets[:, [0, 2]] + top_bottom = pos_bbox_targets[:, [1, 3]] + if len(left_right) == 0: + centerness_targets = left_right[..., 0] + else: + centerness_targets = ( + left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * ( + top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) + return torch.sqrt(centerness_targets) + + def get_bboxes(self, + cls_scores, + bbox_preds, + score_factors=None, + img_metas=None, + cfg=None, + rescale=False, + with_nms=True, + **kwargs): + """Transform network outputs of a batch into bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + score_factors (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Default None. + img_metas (list[dict], Optional): Image meta info. Default None. + cfg (mmcv.Config, Optional): Test / postprocessing configuration, + if None, test_cfg would be used. Default None. + rescale (bool): If True, return boxes in original image space. + Default False. + with_nms (bool): If True, do nms before return boxes. + Default True. + + Returns: + list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. The second item is a + (n,) tensor where each item is the predicted class label of + the corresponding box. + """ + assert len(cls_scores) == len(bbox_preds) + + if score_factors is None: + # e.g. Retina, FreeAnchor, Foveabox, etc. + with_score_factors = False + else: + # e.g. 
FCOS, PAA, ATSS, AutoAssign, etc. + with_score_factors = True + assert len(cls_scores) == len(score_factors) + + num_levels = len(cls_scores) + + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + + result_list = [] + + for img_id in range(len(img_metas)): + img_meta = img_metas[img_id] + cls_score_list = select_single_mlvl(cls_scores, img_id) + bbox_pred_list = select_single_mlvl(bbox_preds, img_id) + if with_score_factors: + score_factor_list = select_single_mlvl(score_factors, img_id) + else: + score_factor_list = [None for _ in range(num_levels)] + + results = self._get_bboxes_single(cls_score_list, bbox_pred_list, + score_factor_list, mlvl_priors, + img_meta, cfg, rescale, with_nms, + **kwargs) + result_list.append(results) + return result_list + + def _get_bboxes_single(self, + cls_score_list, + bbox_pred_list, + score_factor_list, + mlvl_priors, + img_meta, + cfg, + rescale=False, + with_nms=True, + **kwargs): + """Transform outputs of a single image into bbox predictions. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. If with_nms + is False and mlvl_score_factor is None, return mlvl_bboxes and + mlvl_scores, else return mlvl_bboxes, mlvl_scores and + mlvl_score_factor. Usually with_nms is False is used for aug + test. If with_nms is True, then return the following format + + - det_bboxes (Tensor): Predicted bboxes with shape \ + [num_bboxes, 5], where the first 4 columns are bounding \ + box positions (tl_x, tl_y, br_x, br_y) and the 5-th \ + column are scores between 0 and 1. + - det_labels (Tensor): Predicted labels of the corresponding \ + box with shape [num_bboxes]. + """ + if score_factor_list[0] is None: + # e.g. Retina, FreeAnchor, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, etc. 
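+            # For FCOS the score_factor is the centerness branch output; it
+            # is passed through a sigmoid in the level loop below and
+            # multiplied into the classification scores in
+            # _bbox_post_process before NMS.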
+ with_score_factors = True + + cfg = self.test_cfg if cfg is None else cfg + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + if with_score_factors: + mlvl_score_factors = [] + else: + mlvl_score_factors = None + for level_idx, (cls_score, bbox_pred, score_factor, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, + score_factor_list, mlvl_priors)): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + if with_score_factors: + score_factor = score_factor.permute(1, 2, + 0).reshape(-1).sigmoid() + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = cls_score.softmax(-1)[:, :-1] + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + results = filter_scores_and_topk( + scores, cfg.score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, keep_idxs, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + + if with_score_factors: + score_factor = score_factor[keep_idxs] + + bboxes = distance2bbox(priors, bbox_pred, max_shape=img_shape) + + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + if with_score_factors: + mlvl_score_factors.append(score_factor) + + return self._bbox_post_process(mlvl_scores, mlvl_labels, mlvl_bboxes, + img_meta['scale_factor'], cfg, rescale, + with_nms, mlvl_score_factors, **kwargs) + + def _bbox_post_process(self, + mlvl_scores, + mlvl_labels, + mlvl_bboxes, + scale_factor, + cfg, + rescale=False, + with_nms=True, + mlvl_score_factors=None, + **kwargs): + """bbox post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + mlvl_scores (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_bboxes, ). + mlvl_labels (list[Tensor]): Box class labels from all scale + levels of a single image, each item has shape + (num_bboxes, ). + mlvl_bboxes (list[Tensor]): Decoded bboxes from all scale + levels of a single image, each item has shape (num_bboxes, 4). + scale_factor (ndarray, optional): Scale factor of the image arange + as (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + mlvl_score_factors (list[Tensor], optional): Score factor from + all scale levels of a single image, each item has shape + (num_bboxes, ). Default: None. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. If with_nms + is False and mlvl_score_factor is None, return mlvl_bboxes and + mlvl_scores, else return mlvl_bboxes, mlvl_scores and + mlvl_score_factor. Usually with_nms is False is used for aug + test. 
If with_nms is True, then return the following format + + - det_bboxes (Tensor): Predicted bboxes with shape \ + [num_bboxes, 5], where the first 4 columns are bounding \ + box positions (tl_x, tl_y, br_x, br_y) and the 5-th \ + column are scores between 0 and 1. + - det_labels (Tensor): Predicted labels of the corresponding \ + box with shape [num_bboxes]. + """ + assert len(mlvl_scores) == len(mlvl_bboxes) == len(mlvl_labels) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_labels = torch.cat(mlvl_labels) + + if mlvl_score_factors is not None: + # TODO: Add sqrt operation in order to be consistent with + # the paper. + mlvl_score_factors = torch.cat(mlvl_score_factors) + mlvl_scores = mlvl_scores * mlvl_score_factors + + if with_nms: + if mlvl_bboxes.numel() == 0: + det_bboxes = torch.cat([mlvl_bboxes, mlvl_scores[:, None]], -1) + return det_bboxes, mlvl_labels + + det_bboxes, keep_idxs = batched_nms(mlvl_bboxes, mlvl_scores, + mlvl_labels, cfg.nms) + det_bboxes = det_bboxes[:cfg.max_per_img] + det_labels = mlvl_labels[keep_idxs][:cfg.max_per_img] + return det_bboxes, det_labels + else: + return mlvl_bboxes, mlvl_scores, mlvl_labels diff --git a/easycv/models/detection/vitdet/__init__.py b/easycv/models/detection/necks/__init__.py similarity index 50% rename from easycv/models/detection/vitdet/__init__.py rename to easycv/models/detection/necks/__init__.py index c3c90ae6..e250f7ce 100644 --- a/easycv/models/detection/vitdet/__init__.py +++ b/easycv/models/detection/necks/__init__.py @@ -1 +1,2 @@ +from .fpn import FPN from .sfp import SFP diff --git a/easycv/models/detection/necks/fpn.py b/easycv/models/detection/necks/fpn.py new file mode 100644 index 00000000..6d14bbef --- /dev/null +++ b/easycv/models/detection/necks/fpn.py @@ -0,0 +1,204 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule + +from easycv.models.registry import NECKS + + +@NECKS.register_module() +class FPN(nn.Module): + r"""Feature Pyramid Network. + This is an implementation of paper `Feature Pyramid Networks for Object + Detection `_. + Args: + in_channels (list[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, it is equivalent to `add_extra_convs='on_input'`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer in ConvModule. 
+ Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: dict(mode='nearest'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode='nearest')): + # init_cfg=dict( + # type='Xavier', layer='Conv2d', distribution='uniform')): + super(FPN, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + self.add_extra_convs = 'on_input' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.xavier_uniform_(m.weight, gain=1) + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, inputs): + """Forward 
function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + # fix runtime error of "+=" inplace operation in PyTorch 1.10 + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/easycv/models/detection/vitdet/sfp.py b/easycv/models/detection/necks/sfp.py similarity index 100% rename from easycv/models/detection/vitdet/sfp.py rename to easycv/models/detection/necks/sfp.py diff --git a/easycv/models/detection/utils/__init__.py b/easycv/models/detection/utils/__init__.py index f979502a..cedd62df 100644 --- a/easycv/models/detection/utils/__init__.py +++ b/easycv/models/detection/utils/__init__.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
-from .boxes import (bbox2result, bbox_overlaps, bboxes_iou, box_cxcywh_to_xyxy, - box_xyxy_to_cxcywh, distance2bbox, generalized_box_iou, - postprocess) +from .boxes import (batched_nms, bbox2result, bbox_overlaps, bboxes_iou, + box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, distance2bbox, + generalized_box_iou, postprocess) from .generator import MlvlPointGenerator from .matcher import HungarianMatcher from .misc import (accuracy, filter_scores_and_topk, fp16_clamp, interpolate, diff --git a/easycv/models/detection/utils/boxes.py b/easycv/models/detection/utils/boxes.py index 5ebae948..65508f11 100644 --- a/easycv/models/detection/utils/boxes.py +++ b/easycv/models/detection/utils/boxes.py @@ -5,7 +5,7 @@ from distutils.version import LooseVersion import numpy as np import torch import torchvision -from torchvision.ops.boxes import box_area +from torchvision.ops.boxes import box_area, nms from easycv.models.detection.utils.misc import fp16_clamp @@ -408,3 +408,102 @@ def distance2bbox(points, distance, max_shape=None): bboxes = torch.where(bboxes > max_xy, max_xy, bboxes) return bboxes + + +def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False): + r"""Performs non-maximum suppression in a batched fashion. + + Modified from `torchvision/ops/boxes.py#L39 + `_. + In order to perform NMS independently per class, we add an offset to all + the boxes. The offset is dependent only on the class idx, and is large + enough so that boxes from different classes do not overlap. + + Note: + In v1.4.1 and later, ``batched_nms`` supports skipping the NMS and + returns sorted raw results when `nms_cfg` is None. + + Args: + boxes (torch.Tensor): boxes in shape (N, 4). + scores (torch.Tensor): scores in shape (N, ). + idxs (torch.Tensor): each index value correspond to a bbox cluster, + and NMS will not be applied between elements of different idxs, + shape (N, ). + nms_cfg (dict | None): Supports skipping the nms when `nms_cfg` + is None, otherwise it should specify nms type and other + parameters like `iou_thr`. Possible keys includes the following. + + - iou_thr (float): IoU threshold used for NMS. + - split_thr (float): threshold number of boxes. In some cases the + number of boxes is large (e.g., 200k). To avoid OOM during + training, the users could set `split_thr` to a small value. + If the number of boxes is greater than the threshold, it will + perform NMS on each group of boxes separately and sequentially. + Defaults to 10000. + class_agnostic (bool): if true, nms is class agnostic, + i.e. IoU thresholding happens over all boxes, + regardless of the predicted class. + + Returns: + tuple: kept dets and indice. + + - boxes (Tensor): Bboxes with score after nms, has shape + (num_bboxes, 5). last dimension 5 arrange as + (x1, y1, x2, y2, score) + - keep (Tensor): The indices of remaining boxes in input + boxes. 
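+
+    Example:
+        >>> import torch
+        >>> boxes = torch.tensor([[0., 0., 10., 10.],
+        ...                       [0.5, 0.5, 10., 10.],
+        ...                       [20., 20., 30., 30.]])
+        >>> scores = torch.tensor([0.9, 0.8, 0.7])
+        >>> idxs = torch.tensor([0, 0, 1])
+        >>> dets, keep = batched_nms(boxes, scores, idxs,
+        ...                          dict(type='nms', iou_threshold=0.5))
+        >>> dets.shape  # the two overlapping class-0 boxes collapse to one
+        torch.Size([2, 5])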
+ """ + # skip nms when nms_cfg is None + if nms_cfg is None: + scores, inds = scores.sort(descending=True) + boxes = boxes[inds] + return torch.cat([boxes, scores[:, None]], -1), inds + + nms_cfg_ = nms_cfg.copy() + class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic) + if class_agnostic: + boxes_for_nms = boxes + else: + max_coordinate = boxes.max() + offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes)) + boxes_for_nms = boxes + offsets[:, None] + + nms_type = nms_cfg_.pop('type', 'nms') + nms_op = eval(nms_type) + + split_thr = nms_cfg_.pop('split_thr', 10000) + # Won't split to multiple nms nodes when exporting to onnx + if boxes_for_nms.shape[0] < split_thr or torch.onnx.is_in_onnx_export(): + keep = nms(boxes_for_nms, scores, **nms_cfg_) + boxes = boxes[keep] + + # This assumes `dets` has arbitrary dimensions where + # the last dimension is score. + # Currently it supports bounding boxes [x1, y1, x2, y2, score] or + # rotated boxes [cx, cy, w, h, angle_radian, score]. + + scores = scores[keep] + else: + max_num = nms_cfg_.pop('max_num', -1) + total_mask = scores.new_zeros(scores.size(), dtype=torch.bool) + # Some type of nms would reweight the score, such as SoftNMS + scores_after_nms = scores.new_zeros(scores.size()) + for id in torch.unique(idxs): + mask = (idxs == id).nonzero(as_tuple=False).view(-1) + keep = nms(boxes_for_nms[mask], scores[mask], **nms_cfg_) + total_mask[mask[keep]] = True + scores_after_nms[mask[keep]] = scores[keep] + keep = total_mask.nonzero(as_tuple=False).view(-1) + + scores, inds = scores_after_nms[keep].sort(descending=True) + keep = keep[inds] + boxes = boxes[keep] + + if max_num > 0: + keep = keep[:max_num] + boxes = boxes[:max_num] + scores = scores[:max_num] + + boxes = torch.cat([boxes, scores[:, None]], -1) + return boxes, keep diff --git a/tests/models/detection/fcos/test_fcos.py b/tests/models/detection/fcos/test_fcos.py new file mode 100644 index 00000000..73fab1ec --- /dev/null +++ b/tests/models/detection/fcos/test_fcos.py @@ -0,0 +1,206 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import numpy as np +import torch +from mmcv.parallel import collate, scatter +from numpy.testing import assert_array_almost_equal +from torchvision.transforms import Compose + +from easycv.datasets.registry import PIPELINES +from easycv.datasets.utils import replace_ImageToTensor +from easycv.models import build_model +from easycv.utils.checkpoint import load_checkpoint +from easycv.utils.config_tools import mmcv_config_fromfile +from easycv.utils.registry import build_from_cfg + + +class FCOSTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + + def init_fcos(self, model_path, config_path): + self.model_path = model_path + + self.cfg = mmcv_config_fromfile(config_path) + + # modify model_config + if self.cfg.model.head.test_cfg.get('max_per_img', None): + self.cfg.model.head.test_cfg.max_per_img = 10 + + # build model + self.model = build_model(self.cfg.model) + + self.device = 'cuda' if torch.cuda.is_available() else 'cpu' + map_location = 'cpu' if self.device == 'cpu' else 'cuda' + self.ckpt = load_checkpoint( + self.model, self.model_path, map_location=map_location) + + self.model.to(self.device) + self.model.eval() + + self.CLASSES = self.cfg.CLASSES + + def predict(self, imgs): + """Inference image(s) with the detector. + Args: + model (nn.Module): The loaded detector. 
+ imgs (str/ndarray or list[str/ndarray] or tuple[str/ndarray]): + Either image files or loaded images. + Returns: + If imgs is a list or tuple, the same length list type results + will be returned, otherwise return the detection results directly. + """ + + if isinstance(imgs, (list, tuple)): + is_batch = True + else: + imgs = [imgs] + is_batch = False + + cfg = self.cfg + device = next(self.model.parameters()).device # model device + + if isinstance(imgs[0], np.ndarray): + cfg = cfg.copy() + # set loading pipeline type + cfg.data.val.pipeline.insert( + 0, + dict( + type='LoadImageFromWebcam', + file_client_args=dict(backend='http'))) + else: + cfg = cfg.copy() + # set loading pipeline type + cfg.data.val.pipeline.insert( + 0, + dict( + type='LoadImageFromFile', + file_client_args=dict(backend='http'))) + + cfg.data.val.pipeline = replace_ImageToTensor(cfg.data.val.pipeline) + + transforms = [] + for transform in cfg.data.val.pipeline: + if 'img_scale' in transform: + transform['img_scale'] = tuple(transform['img_scale']) + if isinstance(transform, dict): + transform = build_from_cfg(transform, PIPELINES) + transforms.append(transform) + elif callable(transform): + transforms.append(transform) + else: + raise TypeError('transform must be callable or a dict') + test_pipeline = Compose(transforms) + + datas = [] + for img in imgs: + # prepare data + if isinstance(img, np.ndarray): + # directly add img + data = dict(img=img) + else: + # add information into dict + data = dict(img_info=dict(filename=img), img_prefix=None) + # build the data pipeline + data = test_pipeline(data) + datas.append(data) + + data = collate(datas, samples_per_gpu=len(imgs)) + # just get the actual data from DataContainer + data['img_metas'] = [ + img_metas.data[0] for img_metas in data['img_metas'] + ] + data['img'] = [img.data[0] for img in data['img']] + if next(self.model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [device])[0] + + # forward the model + with torch.no_grad(): + results = self.model(mode='test', **data) + + return results + + def test_fcos(self): + model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/epoch_12.pth' + config_path = 'configs/detection/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py' + img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/demo.jpg' + self.init_fcos(model_path, config_path) + output = self.predict(img) + + self.assertIn('detection_boxes', output) + self.assertIn('detection_scores', output) + self.assertIn('detection_classes', output) + self.assertIn('img_metas', output) + self.assertEqual(len(output['detection_boxes'][0]), 10) + self.assertEqual(len(output['detection_scores'][0]), 10) + self.assertEqual(len(output['detection_classes'][0]), 10) + + print(output['detection_boxes'][0].tolist()) + print(output['detection_scores'][0].tolist()) + print(output['detection_classes'][0].tolist()) + + self.assertListEqual( + output['detection_classes'][0].tolist(), + np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 13], dtype=np.int32).tolist()) + + assert_array_almost_equal( + output['detection_scores'][0], + np.array([ + 0.6641181707382202, 0.6135501265525818, 0.5985610485076904, + 0.5694775581359863, 0.5586040616035461, 0.5209507942199707, + 0.5056729912757874, 0.4943872094154358, 0.4850597083568573, + 0.45443734526634216 + ], + dtype=np.float32), + decimal=2) + + assert_array_almost_equal( + output['detection_boxes'][0], + np.array([[ + 295.5196228027344, 
116.56035614013672, 380.0883483886719, + 150.24908447265625 + ], + [ + 190.57131958007812, 108.96343231201172, + 297.7738037109375, 154.69515991210938 + ], + [ + 480.5726013183594, 110.4341812133789, + 522.8551635742188, 129.9452667236328 + ], + [ + 431.1232604980469, 105.17676544189453, + 483.89617919921875, 131.85870361328125 + ], + [ + 398.6544494628906, 110.90837860107422, + 432.6370849609375, 132.89173889160156 + ], + [ + 609.3126831054688, 111.62432861328125, + 635.4577026367188, 137.03529357910156 + ], + [ + 98.66332244873047, 89.88417053222656, + 118.9398422241211, 101.25397491455078 + ], + [ + 167.9045867919922, 109.57560729980469, + 209.74375915527344, 139.98898315429688 + ], + [ + 591.0496826171875, 110.55867767333984, + 619.4395751953125, 126.65755462646484 + ], + [ + 218.92051696777344, 177.0509033203125, + 455.8321838378906, 385.0356140136719 + ]]), + decimal=1) + + +if __name__ == '__main__': + unittest.main()
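
For reference, the released FCOS checkpoint can be exercised end to end the same way the unit test above does it. A minimal inference sketch (config path and checkpoint URL are the ones added in this PR; preparing `data` with the val pipeline is left out, as in `predict()` above):

    from easycv.models import build_model
    from easycv.utils.checkpoint import load_checkpoint
    from easycv.utils.config_tools import mmcv_config_fromfile

    cfg = mmcv_config_fromfile(
        'configs/detection/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py')
    model = build_model(cfg.model)
    load_checkpoint(
        model,
        'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/epoch_12.pth',
        map_location='cpu')
    model.eval()
    # Prepare `data` with cfg.data.val.pipeline (see predict() above), then:
    # outputs = model(mode='test', **data)
    # outputs contains detection_boxes, detection_scores, detection_classes
    # and img_metas, one entry per input image.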