mirror of https://github.com/alibaba/EasyCV.git
parent 6b8b04db72
commit 5fb0a86f69
@ -0,0 +1,90 @@
CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush'
]

# dataset settings
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)

train_pipeline = [
    dict(type='MMResize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='MMRandomFlip', flip_ratio=0.5),
    dict(type='MMNormalize', **img_norm_cfg),
    dict(type='MMPad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels'],
        meta_keys=('filename', 'ori_filename', 'ori_shape', 'ori_img_shape',
                   'img_shape', 'pad_shape', 'scale_factor', 'flip',
                   'flip_direction', 'img_norm_cfg'))
]
test_pipeline = [
    dict(
        type='MMMultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='MMResize', keep_ratio=True),
            dict(type='MMRandomFlip'),
            dict(type='MMNormalize', **img_norm_cfg),
            dict(type='MMPad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=('filename', 'ori_filename', 'ori_shape',
                           'ori_img_shape', 'img_shape', 'pad_shape',
                           'scale_factor', 'flip', 'flip_direction',
                           'img_norm_cfg'))
        ])
]

train_dataset = dict(
    type='DetDataset',
    data_source=dict(
        type='DetSourceCoco',
        ann_file=data_root + 'annotations/instances_train2017.json',
        img_prefix=data_root + 'train2017/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True)
        ],
        classes=CLASSES,
        test_mode=False,
        filter_empty_gt=True,
        iscrowd=False),
    pipeline=train_pipeline)

val_dataset = dict(
    type='DetDataset',
    imgs_per_gpu=1,
    data_source=dict(
        type='DetSourceCoco',
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True)
        ],
        classes=CLASSES,
        test_mode=True,
        filter_empty_gt=False,
        iscrowd=True),
    pipeline=test_pipeline)

data = dict(
    imgs_per_gpu=2, workers_per_gpu=2, train=train_dataset, val=val_dataset)
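A quick illustrative sketch (not part of the diff) of what the `MMNormalize` step configured above does: Caffe-style preprocessing keeps BGR channel order (`to_rgb=False`), subtracts the per-channel means and divides by a std of 1.0. The array below is a stand-in image, not real data.

import numpy as np

mean = np.array([103.530, 116.280, 123.675], dtype=np.float32)  # BGR means from img_norm_cfg
std = np.array([1.0, 1.0, 1.0], dtype=np.float32)
img = np.random.randint(0, 256, size=(800, 1333, 3)).astype(np.float32)  # H x W x C, BGR
normalized = (img - mean) / std  # broadcast over the channel axis
print(normalized.shape)  # (800, 1333, 3)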
@ -0,0 +1,51 @@
# model settings
model = dict(
    type='Detection',
    pretrained=
    'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/pretrained_models/easycv/resnet/detectron/resnet50_caffe.pth',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3, 4),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='caffe'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=1,
        add_extra_convs='on_output',  # use P5
        num_outs=5,
        relu_before_extra_convs=True),
    head=dict(
        type='FCOSHead',
        num_classes=80,
        in_channels=256,
        stacked_convs=4,
        feat_channels=256,
        strides=[8, 16, 32, 64, 128],
        center_sampling=True,
        center_sample_radius=1.5,
        norm_on_bbox=True,
        centerness_on_reg=True,
        conv_cfg=None,
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='GIoULoss', loss_weight=1.0),
        loss_centerness=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
        conv_bias=True,
        test_cfg=dict(
            nms_pre=1000,
            min_bbox_size=0,
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.6),
            max_per_img=100)))
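For orientation, a small sketch (assumption: the 1333x800 test-scale image is padded to 1344x800 by `MMPad(size_divisor=32)`) of the feature-map sizes implied by the strides configured above:

import math

strides = [8, 16, 32, 64, 128]  # P3..P7, as in the head config above
h, w = 800, 1344  # assumed padded input size
for level, s in enumerate(strides, start=3):
    print(f'P{level}: {math.ceil(h / s)} x {math.ceil(w / s)}')
# P3: 100 x 168, P4: 50 x 84, P5: 25 x 42, P6: 13 x 21, P7: 7 x 11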
@ -0,0 +1,57 @@
_base_ = ['./fcos.py', './coco_detection.py', 'configs/base.py']

CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush'
]

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])

checkpoint_config = dict(interval=10)
# optimizer
optimizer = dict(
    type='SGD',
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0001,
    paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.))
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    step=[8, 11])

total_epochs = 12

# evaluation
eval_config = dict(initial=True, interval=1, gpu_collect=False)
# eval_config = dict(interval=1, gpu_collect=False)
eval_pipelines = [
    dict(
        mode='test',
        evaluators=[
            dict(type='CocoDetectionEvaluator', classes=CLASSES),
        ],
    )
]

find_unused_parameters = False
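As a rough sketch (not part of the config; the 0.1 decay factor is the usual default for the 'step' policy and is an assumption here), the schedule above means: linear warmup from lr/3 to 0.01 over the first 500 iterations, then a 10x drop after epochs 8 and 11.

def lr_at(epoch, iter_in_epoch=None, base_lr=0.01, warmup_iters=500,
          warmup_ratio=1.0 / 3, steps=(8, 11), gamma=0.1):
    # linear warmup over the first `warmup_iters` iterations of training
    if epoch == 0 and iter_in_epoch is not None and iter_in_epoch < warmup_iters:
        k = iter_in_epoch / warmup_iters
        return base_lr * (warmup_ratio + (1 - warmup_ratio) * k)
    # step decay: multiply by gamma once per milestone epoch already passed
    return base_lr * gamma ** sum(epoch >= s for s in steps)

print(lr_at(0, 0), lr_at(7), lr_at(8), lr_at(11))  # ~0.0033 0.01 0.001 0.0001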
@ -19,6 +19,11 @@ Pretrained on COCO2017 dataset.
| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ |
| ViTDet_MaskRCNN | [vitdet_maskrcnn](https://github.com/alibaba/EasyCV/tree/master/configs/detection/vitdet/vitdet_100e.py) | 50.57 | 44.96 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn.log.json) |

## FCOS

| Algorithm | Config | mAP<sup>val<br/><sub>0.5:0.95</sub> | AP<sup>val<br/><sub>50</sub> | Download |
| ---------- | ------------------------------------------------------------ | ------------------------ | --------------- | ------------------------------------------------------------ |
| FCOS-r50 | [fcos-r50](https://github.com/alibaba/EasyCV/tree/master/configs/detection/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py) | 38.58 | 57.18 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/epoch_12.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/20220621_121315.log.json) |

## DETR

| Algorithm | Config | bbox_mAP<sup>val<br/><sub>0.5:0.95</sub> | AP<sup>val<br/><sub>50</sub> | Download |
@ -546,7 +546,6 @@ def ExportSingleImageDetectionBoxesToCoco(image_id, category_id_set,
      do not have the right lengths or (2) if each of the elements inside these
      lists do not have the correct shapes or (3) if image_ids are not integers.
    """

    assert len(detection_classes.shape) == 1 and len(detection_scores.shape) == 1, \
        'All entries in detection_classes and detection_scores expected to be of rank 1.'
    assert len(detection_boxes.shape) == 2,\
@ -68,10 +68,8 @@ class DetSourceCoco(object):
|
|||
|
||||
def load_annotations(self, ann_file):
|
||||
"""Load annotation from COCO style annotation file.
|
||||
|
||||
Args:
|
||||
ann_file (str): Path of annotation file.
|
||||
|
||||
Returns:
|
||||
list[dict]: Annotation info from COCO api.
|
||||
"""
|
||||
|
@ -97,10 +95,8 @@ class DetSourceCoco(object):
|
|||
|
||||
def get_ann_info(self, idx):
|
||||
"""Get COCO annotation by index.
|
||||
|
||||
Args:
|
||||
idx (int): Index of data.
|
||||
|
||||
Returns:
|
||||
dict: Annotation info of specified index.
|
||||
"""
|
||||
|
@ -112,10 +108,8 @@ class DetSourceCoco(object):
|
|||
|
||||
def get_cat_ids(self, idx):
|
||||
"""Get COCO category ids by index.
|
||||
|
||||
Args:
|
||||
idx (int): Index of data.
|
||||
|
||||
Returns:
|
||||
list[int]: All categories in the image of specified index.
|
||||
"""
|
||||
|
@ -151,7 +145,6 @@ class DetSourceCoco(object):
|
|||
|
||||
def _set_group_flag(self):
|
||||
"""Set flag according to image aspect ratio.
|
||||
|
||||
Images with aspect ratio greater than 1 will be set as group 1,
|
||||
otherwise group 0.
|
||||
"""
|
||||
|
@ -163,11 +156,9 @@ class DetSourceCoco(object):
|
|||
|
||||
def _parse_ann_info(self, img_info, ann_info):
|
||||
"""Parse bbox and mask annotation.
|
||||
|
||||
Args:
|
||||
ann_info (list[dict]): Annotation info of an image.
|
||||
with_mask (bool): Whether to parse mask annotations.
|
||||
|
||||
Returns:
|
||||
dict: A dict containing the following keys: bboxes, bboxes_ignore,\
|
||||
labels, masks, seg_map. "masks" are raw annotations and not \
|
||||
|
@ -241,11 +232,9 @@ class DetSourceCoco(object):
|
|||
def xyxy2xywh(self, bbox):
|
||||
"""Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO
|
||||
evaluation.
|
||||
|
||||
Args:
|
||||
bbox (numpy.ndarray): The bounding boxes, shape (4, ), in
|
||||
``xyxy`` order.
|
||||
|
||||
Returns:
|
||||
list[float]: The converted bounding boxes, in ``xywh`` order.
|
||||
"""
|
||||
|
@ -299,10 +288,8 @@ class DetSourceCoco(object):
|
|||
|
||||
def prepare_train_img(self, idx):
|
||||
"""Get training data and annotations after pipeline.
|
||||
|
||||
Args:
|
||||
idx (int): Index of data.
|
||||
|
||||
Returns:
|
||||
dict: Training data and annotation after pipeline with new keys \
|
||||
introduced by pipeline.
|
||||
|
@ -316,10 +303,8 @@ class DetSourceCoco(object):
|
|||
|
||||
def __getitem__(self, idx):
|
||||
"""Get training/test data after pipeline.
|
||||
|
||||
Args:
|
||||
idx (int): Index of data.
|
||||
|
||||
Returns:
|
||||
dict: Training/test data (with annotation if `test_mode` is set \
|
||||
True).
|
||||
|
|
|
@ -1,20 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright (c) Alibaba, Inc. and its affiliates.
import logging

from .dab_detr import DABDETRHead, DABDetrTransformer
from .detection import Detection
from .detr import DETRHead, DetrTransformer
from .vitdet import SFP
from easycv.models.detection.dab_detr import DABDETRHead, DABDetrTransformer
from easycv.models.detection.detection import Detection
from easycv.models.detection.detr import DETRHead, DetrTransformer
from easycv.models.detection.fcos import FCOSHead
from easycv.models.detection.necks import FPN, SFP

try:
    from .yolox.yolox import YOLOX
    from easycv.models.detection.yolox.yolox import YOLOX
except Exception as e:
    logging.info(f'Exception: {e}')
    logging.info(
        'Import YOLOX failed! please check your CUDA & Pytorch Version')

try:
    from .yolox_edge.yolox_edge import YOLOX_EDGE
    from easycv.models.detection.yolox_edge.yolox_edge import YOLOX_EDGE
except Exception as e:
    logging.info(f'Exception: {e}')
    logging.info(
@ -0,0 +1 @@
from .fcos_head import FCOSHead
@ -0,0 +1,865 @@
|
|||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from mmcv.cnn import ConvModule, Scale
|
||||
|
||||
from easycv.models.builder import HEADS, build_loss
|
||||
from easycv.models.detection.utils import (MlvlPointGenerator, batched_nms,
|
||||
bbox2result, distance2bbox,
|
||||
filter_scores_and_topk,
|
||||
select_single_mlvl)
|
||||
from easycv.models.utils import reduce_mean
|
||||
from easycv.utils.misc import multi_apply
|
||||
|
||||
INF = 1e8
|
||||
|
||||
|
||||
@HEADS.register_module()
|
||||
class FCOSHead(nn.Module):
|
||||
"""Anchor-free head used in `FCOS <https://arxiv.org/abs/1904.01355>`_.
|
||||
|
||||
    The FCOS head does not use anchor boxes. Instead, bounding boxes are
    predicted at each pixel and a centerness measure is used to suppress
    low-quality predictions.
    Here norm_on_bbox, centerness_on_reg, dcn_on_last_conv are training
    tricks used in the official repo, which bring remarkable mAP gains
    of up to 4.9 points. Please see https://github.com/tianzhi0549/FCOS for
    more detail.
|
||||
|
||||
Args:
|
||||
num_classes (int): Number of categories excluding the background
|
||||
category.
|
||||
in_channels (int): Number of channels in the input feature map.
|
||||
strides (list[int] | list[tuple[int, int]]): Strides of points
|
||||
in multiple feature levels. Default: [8, 16, 32, 64, 128].
|
||||
regress_ranges (tuple[tuple[int, int]]): Regress range of multiple
|
||||
level points.
|
||||
center_sampling (bool): If true, use center sampling. Default: False.
|
||||
center_sample_radius (float): Radius of center sampling. Default: 1.5.
|
||||
norm_on_bbox (bool): If true, normalize the regression targets
|
||||
with FPN strides. Default: False.
|
||||
centerness_on_reg (bool): If true, predict centerness on the
regression branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042.
Default: False.
|
||||
conv_bias (bool | str): If specified as `auto`, it will be decided by the
|
||||
norm_cfg. Bias of conv will be set as True if `norm_cfg` is None, otherwise
|
||||
False. Default: True.
|
||||
loss_cls (dict): Config of classification loss.
|
||||
loss_bbox (dict): Config of localization loss.
|
||||
loss_centerness (dict): Config of centerness loss.
|
||||
norm_cfg (dict): Dictionary to construct and configure the norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
|
||||
init_cfg (dict or list[dict], optional): Initialization config dict.
|
||||
|
||||
Example:
|
||||
>>> self = FCOSHead(11, 7)
|
||||
>>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
|
||||
>>> cls_score, bbox_pred, centerness = self.forward(feats)
|
||||
>>> assert len(cls_score) == len(self.scales)
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(self,
|
||||
num_classes,
|
||||
in_channels,
|
||||
stacked_convs=4,
|
||||
feat_channels=256,
|
||||
strides=[8, 16, 32, 64, 128],
|
||||
regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512),
|
||||
(512, INF)),
|
||||
center_sampling=False,
|
||||
center_sample_radius=1.5,
|
||||
norm_on_bbox=False,
|
||||
centerness_on_reg=False,
|
||||
conv_cfg=None,
|
||||
loss_cls=dict(
|
||||
type='FocalLoss',
|
||||
use_sigmoid=True,
|
||||
gamma=2.0,
|
||||
alpha=0.25,
|
||||
loss_weight=1.0),
|
||||
loss_bbox=dict(type='IoULoss', loss_weight=1.0),
|
||||
loss_centerness=dict(
|
||||
type='CrossEntropyLoss',
|
||||
use_sigmoid=True,
|
||||
loss_weight=1.0),
|
||||
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
|
||||
conv_bias=True,
|
||||
test_cfg=dict(
|
||||
nms_pre=1000,
|
||||
min_bbox_size=0,
|
||||
score_thr=0.05,
|
||||
nms=dict(type='nms', iou_threshold=0.5),
|
||||
max_per_img=100),
|
||||
**kwargs):
|
||||
super(FCOSHead, self).__init__()
|
||||
|
||||
self.regress_ranges = regress_ranges
|
||||
self.center_sampling = center_sampling
|
||||
self.center_sample_radius = center_sample_radius
|
||||
self.norm_on_bbox = norm_on_bbox
|
||||
self.centerness_on_reg = centerness_on_reg
|
||||
|
||||
self.num_classes = num_classes
|
||||
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
|
||||
if self.use_sigmoid_cls:
|
||||
self.cls_out_channels = num_classes
|
||||
else:
|
||||
self.cls_out_channels = num_classes + 1
|
||||
self.in_channels = in_channels
|
||||
self.feat_channels = feat_channels
|
||||
self.stacked_convs = stacked_convs
|
||||
self.strides = strides
|
||||
assert conv_bias == 'auto' or isinstance(conv_bias, bool)
|
||||
self.conv_bias = conv_bias
|
||||
self.loss_cls = build_loss(loss_cls)
|
||||
self.loss_bbox = build_loss(loss_bbox)
|
||||
|
||||
self.prior_generator = MlvlPointGenerator(strides)
|
||||
|
||||
# In order to keep a more general interface and be consistent with
# anchor_head, we can think of a point as a single anchor.
|
||||
self.num_base_priors = self.prior_generator.num_base_priors[0]
|
||||
|
||||
self.test_cfg = test_cfg
|
||||
self.conv_cfg = conv_cfg
|
||||
self.norm_cfg = norm_cfg
|
||||
|
||||
self._init_layers()
|
||||
|
||||
self.loss_centerness = build_loss(loss_centerness)
|
||||
|
||||
def _init_layers(self):
|
||||
"""Initialize layers of the head."""
|
||||
self._init_cls_convs()
|
||||
self._init_reg_convs()
|
||||
self._init_predictor()
|
||||
self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1)
|
||||
self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
|
||||
|
||||
def _init_cls_convs(self):
|
||||
"""Initialize classification conv layers of the head."""
|
||||
self.cls_convs = nn.ModuleList()
|
||||
for i in range(self.stacked_convs):
|
||||
chn = self.in_channels if i == 0 else self.feat_channels
|
||||
conv_cfg = self.conv_cfg
|
||||
self.cls_convs.append(
|
||||
ConvModule(
|
||||
chn,
|
||||
self.feat_channels,
|
||||
3,
|
||||
stride=1,
|
||||
padding=1,
|
||||
conv_cfg=conv_cfg,
|
||||
norm_cfg=self.norm_cfg,
|
||||
bias=self.conv_bias))
|
||||
|
||||
def _init_reg_convs(self):
|
||||
"""Initialize bbox regression conv layers of the head."""
|
||||
self.reg_convs = nn.ModuleList()
|
||||
for i in range(self.stacked_convs):
|
||||
chn = self.in_channels if i == 0 else self.feat_channels
|
||||
conv_cfg = self.conv_cfg
|
||||
self.reg_convs.append(
|
||||
ConvModule(
|
||||
chn,
|
||||
self.feat_channels,
|
||||
3,
|
||||
stride=1,
|
||||
padding=1,
|
||||
conv_cfg=conv_cfg,
|
||||
norm_cfg=self.norm_cfg,
|
||||
bias=self.conv_bias))
|
||||
|
||||
def _init_predictor(self):
|
||||
"""Initialize predictor layers of the head."""
|
||||
self.conv_cls = nn.Conv2d(
|
||||
self.feat_channels, self.cls_out_channels, 3, padding=1)
|
||||
self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
|
||||
|
||||
def init_weights(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
torch.nn.init.normal_(m.weight, std=0.01)
|
||||
if hasattr(m, 'bias') and m.bias is not None:
|
||||
nn.init.constant_(m.bias, 0)
|
||||
|
||||
# initialize the bias for focal loss
|
||||
prior_prob = 0.01
|
||||
bias_value = -math.log((1 - prior_prob) / prior_prob)
|
||||
torch.nn.init.constant_(self.conv_cls.bias, bias_value)
|
||||
|
||||
def forward(self, feats):
|
||||
"""Forward features from the upstream network.
|
||||
|
||||
Args:
|
||||
feats (tuple[Tensor]): Features from the upstream network, each is
|
||||
a 4D-tensor.
|
||||
|
||||
Returns:
|
||||
tuple:
|
||||
cls_scores (list[Tensor]): Box scores for each scale level, \
|
||||
each is a 4D-tensor, the channel number is \
|
||||
num_points * num_classes.
|
||||
bbox_preds (list[Tensor]): Box energies / deltas for each \
|
||||
scale level, each is a 4D-tensor, the channel number is \
|
||||
num_points * 4.
|
||||
centernesses (list[Tensor]): centerness for each scale level, \
|
||||
each is a 4D-tensor, the channel number is num_points * 1.
|
||||
"""
|
||||
return multi_apply(self.forward_single, feats, self.scales,
|
||||
self.strides)
|
||||
|
||||
def forward_single(self, x, scale, stride):
|
||||
"""Forward features of a single scale level.
|
||||
|
||||
Args:
|
||||
x (Tensor): FPN feature maps of the specified stride.
|
||||
scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
|
||||
the bbox prediction.
|
||||
stride (int): The corresponding stride for feature maps, only
|
||||
used to normalize the bbox prediction when self.norm_on_bbox
|
||||
is True.
|
||||
|
||||
Returns:
|
||||
tuple: scores for each class, bbox predictions and centerness \
|
||||
predictions of input feature maps.
|
||||
"""
|
||||
cls_feat = x
|
||||
reg_feat = x
|
||||
|
||||
for cls_layer in self.cls_convs:
|
||||
cls_feat = cls_layer(cls_feat)
|
||||
cls_score = self.conv_cls(cls_feat)
|
||||
|
||||
for reg_layer in self.reg_convs:
|
||||
reg_feat = reg_layer(reg_feat)
|
||||
bbox_pred = self.conv_reg(reg_feat)
|
||||
|
||||
if self.centerness_on_reg:
|
||||
centerness = self.conv_centerness(reg_feat)
|
||||
else:
|
||||
centerness = self.conv_centerness(cls_feat)
|
||||
# scale the bbox_pred of different level
|
||||
# float to avoid overflow when enabling FP16
|
||||
bbox_pred = scale(bbox_pred).float()
|
||||
if self.norm_on_bbox:
|
||||
# bbox_pred needed for gradient computation has been modified
|
||||
# by F.relu(bbox_pred) when run with PyTorch 1.10. So replace
|
||||
# F.relu(bbox_pred) with bbox_pred.clamp(min=0)
|
||||
bbox_pred = bbox_pred.clamp(min=0)
|
||||
if not self.training:
|
||||
bbox_pred *= stride
|
||||
else:
|
||||
bbox_pred = bbox_pred.exp()
|
||||
return cls_score, bbox_pred, centerness
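        # Shape sketch (illustrative, assuming the 80-class config in this
        # commit): a P3 feature of shape (1, 256, 100, 168) yields cls_score
        # (1, 80, 100, 168), bbox_pred (1, 4, 100, 168) and centerness
        # (1, 1, 100, 168).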
|
||||
|
||||
def forward_train(self,
|
||||
x,
|
||||
img_metas,
|
||||
gt_bboxes,
|
||||
gt_labels=None,
|
||||
gt_bboxes_ignore=None,
|
||||
proposal_cfg=None,
|
||||
**kwargs):
|
||||
outs = self.forward(x)
|
||||
if gt_labels is None:
|
||||
loss_inputs = outs + (gt_bboxes, img_metas)
|
||||
else:
|
||||
loss_inputs = outs + (gt_bboxes, gt_labels, img_metas)
|
||||
losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
|
||||
return losses
|
||||
|
||||
def forward_test(self, feats, img_metas, rescale=False):
|
||||
"""Test function without test-time augmentation.
|
||||
|
||||
Args:
|
||||
feats (tuple[torch.Tensor]): Multi-level features from the
|
||||
upstream network, each is a 4D-tensor.
|
||||
img_metas (list[dict]): List of image information.
|
||||
rescale (bool, optional): Whether to rescale the results.
|
||||
Defaults to False.
|
||||
|
||||
Returns:
|
||||
list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
|
||||
The first item is ``bboxes`` with shape (n, 5),
|
||||
where 5 represent (tl_x, tl_y, br_x, br_y, score).
|
||||
The shape of the second tensor in the tuple is ``labels``
|
||||
with shape (n, ).
|
||||
"""
|
||||
outs = self.forward(feats)
|
||||
results_list = self.get_bboxes(
|
||||
*outs, img_metas=img_metas, rescale=True)
|
||||
results = [
|
||||
bbox2result(det_bboxes, det_labels, self.num_classes)
|
||||
for det_bboxes, det_labels in results_list
|
||||
]
|
||||
|
||||
detection_boxes = []
|
||||
detection_scores = []
|
||||
detection_classes = []
|
||||
for res_i in results:
|
||||
bbox_result = res_i
|
||||
bboxes = np.vstack(bbox_result)
|
||||
labels = [
|
||||
np.full(bbox.shape[0], i, dtype=np.int32)
|
||||
for i, bbox in enumerate(bbox_result)
|
||||
]
|
||||
labels = np.concatenate(labels)
|
||||
|
||||
scores = bboxes[:, 4] if bboxes.shape[1] == 5 else None
|
||||
bboxes = bboxes[:, 0:4] if bboxes.shape[1] == 5 else bboxes
|
||||
assert bboxes.shape[1] == 4
|
||||
|
||||
detection_boxes.append(bboxes)
|
||||
detection_scores.append(scores)
|
||||
detection_classes.append(labels)
|
||||
|
||||
assert len(img_metas) == 1
|
||||
outputs = {
|
||||
'detection_boxes': detection_boxes,
|
||||
'detection_scores': detection_scores,
|
||||
'detection_classes': detection_classes,
|
||||
'img_metas': img_metas
|
||||
}
|
||||
|
||||
return outputs
|
||||
|
||||
def loss(self,
|
||||
cls_scores,
|
||||
bbox_preds,
|
||||
centernesses,
|
||||
gt_bboxes,
|
||||
gt_labels,
|
||||
img_metas,
|
||||
gt_bboxes_ignore=None):
|
||||
"""Compute loss of the head.
|
||||
|
||||
Args:
|
||||
cls_scores (list[Tensor]): Box scores for each scale level,
|
||||
each is a 4D-tensor, the channel number is
|
||||
num_points * num_classes.
|
||||
bbox_preds (list[Tensor]): Box energies / deltas for each scale
|
||||
level, each is a 4D-tensor, the channel number is
|
||||
num_points * 4.
|
||||
centernesses (list[Tensor]): centerness for each scale level, each
|
||||
is a 4D-tensor, the channel number is num_points * 1.
|
||||
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
|
||||
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
|
||||
gt_labels (list[Tensor]): class indices corresponding to each box
|
||||
img_metas (list[dict]): Meta information of each image, e.g.,
|
||||
image size, scaling factor, etc.
|
||||
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
|
||||
boxes can be ignored when computing the loss.
|
||||
|
||||
Returns:
|
||||
dict[str, Tensor]: A dictionary of loss components.
|
||||
"""
|
||||
assert len(cls_scores) == len(bbox_preds) == len(centernesses)
|
||||
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
|
||||
all_level_points = self.prior_generator.grid_priors(
|
||||
featmap_sizes,
|
||||
dtype=bbox_preds[0].dtype,
|
||||
device=bbox_preds[0].device)
|
||||
labels, bbox_targets = self.get_targets(all_level_points, gt_bboxes,
|
||||
gt_labels)
|
||||
|
||||
num_imgs = cls_scores[0].size(0)
|
||||
# flatten cls_scores, bbox_preds and centerness
|
||||
flatten_cls_scores = [
|
||||
cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
|
||||
for cls_score in cls_scores
|
||||
]
|
||||
flatten_bbox_preds = [
|
||||
bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
|
||||
for bbox_pred in bbox_preds
|
||||
]
|
||||
flatten_centerness = [
|
||||
centerness.permute(0, 2, 3, 1).reshape(-1)
|
||||
for centerness in centernesses
|
||||
]
|
||||
flatten_cls_scores = torch.cat(flatten_cls_scores)
|
||||
flatten_bbox_preds = torch.cat(flatten_bbox_preds)
|
||||
flatten_centerness = torch.cat(flatten_centerness)
|
||||
flatten_labels = torch.cat(labels)
|
||||
flatten_bbox_targets = torch.cat(bbox_targets)
|
||||
# repeat points to align with bbox_preds
|
||||
flatten_points = torch.cat(
|
||||
[points.repeat(num_imgs, 1) for points in all_level_points])
|
||||
|
||||
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
|
||||
bg_class_ind = self.num_classes
|
||||
pos_inds = ((flatten_labels >= 0)
|
||||
& (flatten_labels < bg_class_ind)).nonzero().reshape(-1)
|
||||
num_pos = torch.tensor(
|
||||
len(pos_inds), dtype=torch.float, device=bbox_preds[0].device)
|
||||
num_pos = max(reduce_mean(num_pos), 1.0)
|
||||
loss_cls = self.loss_cls(
|
||||
flatten_cls_scores, flatten_labels, avg_factor=num_pos)
|
||||
|
||||
pos_bbox_preds = flatten_bbox_preds[pos_inds]
|
||||
pos_centerness = flatten_centerness[pos_inds]
|
||||
pos_bbox_targets = flatten_bbox_targets[pos_inds]
|
||||
pos_centerness_targets = self.centerness_target(pos_bbox_targets)
|
||||
# centerness weighted iou loss
|
||||
centerness_denorm = max(
|
||||
reduce_mean(pos_centerness_targets.sum().detach()), 1e-6)
|
||||
|
||||
if len(pos_inds) > 0:
|
||||
pos_points = flatten_points[pos_inds]
|
||||
pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds)
|
||||
pos_decoded_target_preds = distance2bbox(pos_points,
|
||||
pos_bbox_targets)
|
||||
loss_bbox = self.loss_bbox(
|
||||
pos_decoded_bbox_preds,
|
||||
pos_decoded_target_preds,
|
||||
weight=pos_centerness_targets,
|
||||
avg_factor=centerness_denorm)
|
||||
loss_centerness = self.loss_centerness(
|
||||
pos_centerness, pos_centerness_targets, avg_factor=num_pos)
|
||||
else:
|
||||
loss_bbox = pos_bbox_preds.sum()
|
||||
loss_centerness = pos_centerness.sum()
|
||||
|
||||
return dict(
|
||||
loss_cls=loss_cls,
|
||||
loss_bbox=loss_bbox,
|
||||
loss_centerness=loss_centerness)
|
||||
|
||||
def get_targets(self, points, gt_bboxes_list, gt_labels_list):
|
||||
"""Compute regression, classification and centerness targets for points
|
||||
in multiple images.
|
||||
|
||||
Args:
|
||||
points (list[Tensor]): Points of each fpn level, each has shape
|
||||
(num_points, 2).
|
||||
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
|
||||
each has shape (num_gt, 4).
|
||||
gt_labels_list (list[Tensor]): Ground truth labels of each box,
|
||||
each has shape (num_gt,).
|
||||
|
||||
Returns:
|
||||
tuple:
|
||||
concat_lvl_labels (list[Tensor]): Labels of each level. \
|
||||
concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \
|
||||
level.
|
||||
"""
|
||||
assert len(points) == len(self.regress_ranges)
|
||||
num_levels = len(points)
|
||||
# expand regress ranges to align with points
|
||||
expanded_regress_ranges = [
|
||||
points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
|
||||
points[i]) for i in range(num_levels)
|
||||
]
|
||||
# concat all levels points and regress ranges
|
||||
concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
|
||||
concat_points = torch.cat(points, dim=0)
|
||||
|
||||
# the number of points per img, per lvl
|
||||
num_points = [center.size(0) for center in points]
|
||||
|
||||
# get labels and bbox_targets of each image
|
||||
labels_list, bbox_targets_list = multi_apply(
|
||||
self._get_target_single,
|
||||
gt_bboxes_list,
|
||||
gt_labels_list,
|
||||
points=concat_points,
|
||||
regress_ranges=concat_regress_ranges,
|
||||
num_points_per_lvl=num_points)
|
||||
|
||||
# split to per img, per level
|
||||
labels_list = [labels.split(num_points, 0) for labels in labels_list]
|
||||
bbox_targets_list = [
|
||||
bbox_targets.split(num_points, 0)
|
||||
for bbox_targets in bbox_targets_list
|
||||
]
|
||||
|
||||
# concat per level image
|
||||
concat_lvl_labels = []
|
||||
concat_lvl_bbox_targets = []
|
||||
for i in range(num_levels):
|
||||
concat_lvl_labels.append(
|
||||
torch.cat([labels[i] for labels in labels_list]))
|
||||
bbox_targets = torch.cat(
|
||||
[bbox_targets[i] for bbox_targets in bbox_targets_list])
|
||||
if self.norm_on_bbox:
|
||||
bbox_targets = bbox_targets / self.strides[i]
|
||||
concat_lvl_bbox_targets.append(bbox_targets)
|
||||
return concat_lvl_labels, concat_lvl_bbox_targets
|
||||
|
||||
def _get_target_single(self, gt_bboxes, gt_labels, points, regress_ranges,
|
||||
num_points_per_lvl):
|
||||
"""Compute regression and classification targets for a single image."""
|
||||
num_points = points.size(0)
|
||||
num_gts = gt_labels.size(0)
|
||||
if num_gts == 0:
|
||||
return gt_labels.new_full((num_points,), self.num_classes), \
|
||||
gt_bboxes.new_zeros((num_points, 4))
|
||||
|
||||
areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
|
||||
gt_bboxes[:, 3] - gt_bboxes[:, 1])
|
||||
# TODO: figure out why these two are different
|
||||
# areas = areas[None].expand(num_points, num_gts)
|
||||
areas = areas[None].repeat(num_points, 1)
|
||||
regress_ranges = regress_ranges[:, None, :].expand(
|
||||
num_points, num_gts, 2)
|
||||
gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
|
||||
xs, ys = points[:, 0], points[:, 1]
|
||||
xs = xs[:, None].expand(num_points, num_gts)
|
||||
ys = ys[:, None].expand(num_points, num_gts)
|
||||
|
||||
left = xs - gt_bboxes[..., 0]
|
||||
right = gt_bboxes[..., 2] - xs
|
||||
top = ys - gt_bboxes[..., 1]
|
||||
bottom = gt_bboxes[..., 3] - ys
|
||||
bbox_targets = torch.stack((left, top, right, bottom), -1)
|
||||
|
||||
if self.center_sampling:
|
||||
# condition1: inside a `center bbox`
|
||||
radius = self.center_sample_radius
|
||||
center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2
|
||||
center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2
|
||||
center_gts = torch.zeros_like(gt_bboxes)
|
||||
stride = center_xs.new_zeros(center_xs.shape)
|
||||
|
||||
# project the points on current lvl back to the `original` sizes
|
||||
lvl_begin = 0
|
||||
for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl):
|
||||
lvl_end = lvl_begin + num_points_lvl
|
||||
stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius
|
||||
lvl_begin = lvl_end
|
||||
|
||||
x_mins = center_xs - stride
|
||||
y_mins = center_ys - stride
|
||||
x_maxs = center_xs + stride
|
||||
y_maxs = center_ys + stride
|
||||
center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0],
|
||||
x_mins, gt_bboxes[..., 0])
|
||||
center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1],
|
||||
y_mins, gt_bboxes[..., 1])
|
||||
center_gts[..., 2] = torch.where(x_maxs > gt_bboxes[..., 2],
|
||||
gt_bboxes[..., 2], x_maxs)
|
||||
center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3],
|
||||
gt_bboxes[..., 3], y_maxs)
|
||||
|
||||
cb_dist_left = xs - center_gts[..., 0]
|
||||
cb_dist_right = center_gts[..., 2] - xs
|
||||
cb_dist_top = ys - center_gts[..., 1]
|
||||
cb_dist_bottom = center_gts[..., 3] - ys
|
||||
center_bbox = torch.stack(
|
||||
(cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1)
|
||||
inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
|
||||
else:
|
||||
# condition1: inside a gt bbox
|
||||
inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0
|
||||
|
||||
# condition2: limit the regression range for each location
|
||||
max_regress_distance = bbox_targets.max(-1)[0]
|
||||
inside_regress_range = (
|
||||
(max_regress_distance >= regress_ranges[..., 0])
|
||||
& (max_regress_distance <= regress_ranges[..., 1]))
|
||||
|
||||
# if there is still more than one object for a location,
# we choose the one with minimal area
|
||||
areas[inside_gt_bbox_mask == 0] = INF
|
||||
areas[inside_regress_range == 0] = INF
|
||||
min_area, min_area_inds = areas.min(dim=1)
|
||||
|
||||
labels = gt_labels[min_area_inds]
|
||||
labels[min_area == INF] = self.num_classes # set as BG
|
||||
bbox_targets = bbox_targets[range(num_points), min_area_inds]
|
||||
|
||||
return labels, bbox_targets
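        # Level-assignment sketch (illustrative numbers): with the default
        # regress_ranges above, a location whose largest distance to its gt box
        # sides is 100 px satisfies 64 <= 100 <= 128 and is therefore regressed
        # by the stride-16 level, while a 300 px distance falls to the
        # stride-64 level.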
|
||||
|
||||
def centerness_target(self, pos_bbox_targets):
|
||||
"""Compute centerness targets.
|
||||
|
||||
Args:
|
||||
pos_bbox_targets (Tensor): BBox targets of positive bboxes in shape
|
||||
(num_pos, 4)
|
||||
|
||||
Returns:
|
||||
Tensor: Centerness target.
|
||||
"""
|
||||
# only calculate pos centerness targets, otherwise there may be nan
|
||||
left_right = pos_bbox_targets[:, [0, 2]]
|
||||
top_bottom = pos_bbox_targets[:, [1, 3]]
|
||||
if len(left_right) == 0:
|
||||
centerness_targets = left_right[..., 0]
|
||||
else:
|
||||
centerness_targets = (
|
||||
left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
|
||||
top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
|
||||
return torch.sqrt(centerness_targets)
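        # Worked example (illustrative): for side distances l=4, r=12, t=6,
        # b=6 the target is sqrt((min(4, 12) / max(4, 12)) * (min(6, 6) /
        # max(6, 6))) = sqrt(1 / 3) ~= 0.577, so off-center locations receive
        # smaller centerness targets.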
|
||||
|
||||
def get_bboxes(self,
|
||||
cls_scores,
|
||||
bbox_preds,
|
||||
score_factors=None,
|
||||
img_metas=None,
|
||||
cfg=None,
|
||||
rescale=False,
|
||||
with_nms=True,
|
||||
**kwargs):
|
||||
"""Transform network outputs of a batch into bbox results.
|
||||
|
||||
Note: When score_factors is not None, the cls_scores are
usually multiplied by it to obtain the real scores used in NMS,
such as the centerness in FCOS or the IoU branch in ATSS.
|
||||
|
||||
Args:
|
||||
cls_scores (list[Tensor]): Classification scores for all
|
||||
scale levels, each is a 4D-tensor, has shape
|
||||
(batch_size, num_priors * num_classes, H, W).
|
||||
bbox_preds (list[Tensor]): Box energies / deltas for all
|
||||
scale levels, each is a 4D-tensor, has shape
|
||||
(batch_size, num_priors * 4, H, W).
|
||||
score_factors (list[Tensor], Optional): Score factor for
|
||||
all scale level, each is a 4D-tensor, has shape
|
||||
(batch_size, num_priors * 1, H, W). Default None.
|
||||
img_metas (list[dict], Optional): Image meta info. Default None.
|
||||
cfg (mmcv.Config, Optional): Test / postprocessing configuration,
|
||||
if None, test_cfg would be used. Default None.
|
||||
rescale (bool): If True, return boxes in original image space.
|
||||
Default False.
|
||||
with_nms (bool): If True, do nms before return boxes.
|
||||
Default True.
|
||||
|
||||
Returns:
|
||||
list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple.
|
||||
The first item is an (n, 5) tensor, where the first 4 columns
|
||||
are bounding box positions (tl_x, tl_y, br_x, br_y) and the
|
||||
5-th column is a score between 0 and 1. The second item is a
|
||||
(n,) tensor where each item is the predicted class label of
|
||||
the corresponding box.
|
||||
"""
|
||||
assert len(cls_scores) == len(bbox_preds)
|
||||
|
||||
if score_factors is None:
|
||||
# e.g. Retina, FreeAnchor, Foveabox, etc.
|
||||
with_score_factors = False
|
||||
else:
|
||||
# e.g. FCOS, PAA, ATSS, AutoAssign, etc.
|
||||
with_score_factors = True
|
||||
assert len(cls_scores) == len(score_factors)
|
||||
|
||||
num_levels = len(cls_scores)
|
||||
|
||||
featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
|
||||
mlvl_priors = self.prior_generator.grid_priors(
|
||||
featmap_sizes,
|
||||
dtype=cls_scores[0].dtype,
|
||||
device=cls_scores[0].device)
|
||||
|
||||
result_list = []
|
||||
|
||||
for img_id in range(len(img_metas)):
|
||||
img_meta = img_metas[img_id]
|
||||
cls_score_list = select_single_mlvl(cls_scores, img_id)
|
||||
bbox_pred_list = select_single_mlvl(bbox_preds, img_id)
|
||||
if with_score_factors:
|
||||
score_factor_list = select_single_mlvl(score_factors, img_id)
|
||||
else:
|
||||
score_factor_list = [None for _ in range(num_levels)]
|
||||
|
||||
results = self._get_bboxes_single(cls_score_list, bbox_pred_list,
|
||||
score_factor_list, mlvl_priors,
|
||||
img_meta, cfg, rescale, with_nms,
|
||||
**kwargs)
|
||||
result_list.append(results)
|
||||
return result_list
|
||||
|
||||
def _get_bboxes_single(self,
|
||||
cls_score_list,
|
||||
bbox_pred_list,
|
||||
score_factor_list,
|
||||
mlvl_priors,
|
||||
img_meta,
|
||||
cfg,
|
||||
rescale=False,
|
||||
with_nms=True,
|
||||
**kwargs):
|
||||
"""Transform outputs of a single image into bbox predictions.
|
||||
|
||||
Args:
|
||||
cls_score_list (list[Tensor]): Box scores from all scale
|
||||
levels of a single image, each item has shape
|
||||
(num_priors * num_classes, H, W).
|
||||
bbox_pred_list (list[Tensor]): Box energies / deltas from
|
||||
all scale levels of a single image, each item has shape
|
||||
(num_priors * 4, H, W).
|
||||
score_factor_list (list[Tensor]): Score factor from all scale
|
||||
levels of a single image, each item has shape
|
||||
(num_priors * 1, H, W).
|
||||
mlvl_priors (list[Tensor]): Each element in the list is
|
||||
the priors of a single level in feature pyramid. In all
|
||||
anchor-based methods, it has shape (num_priors, 4). In
|
||||
all anchor-free methods, it has shape (num_priors, 2)
|
||||
when `with_stride=True`, otherwise it still has shape
|
||||
(num_priors, 4).
|
||||
img_meta (dict): Image meta info.
|
||||
cfg (mmcv.Config): Test / postprocessing configuration,
|
||||
if None, test_cfg would be used.
|
||||
rescale (bool): If True, return boxes in original image space.
|
||||
Default: False.
|
||||
with_nms (bool): If True, do nms before return boxes.
|
||||
Default: True.
|
||||
|
||||
Returns:
|
||||
tuple[Tensor]: Results of detected bboxes and labels. If with_nms
|
||||
is False and mlvl_score_factor is None, return mlvl_bboxes and
|
||||
mlvl_scores, else return mlvl_bboxes, mlvl_scores and
|
||||
mlvl_score_factor. Usually with_nms=False is used for aug
|
||||
test. If with_nms is True, then return the following format
|
||||
|
||||
- det_bboxes (Tensor): Predicted bboxes with shape \
|
||||
[num_bboxes, 5], where the first 4 columns are bounding \
|
||||
box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
|
||||
column are scores between 0 and 1.
|
||||
- det_labels (Tensor): Predicted labels of the corresponding \
|
||||
box with shape [num_bboxes].
|
||||
"""
|
||||
if score_factor_list[0] is None:
|
||||
# e.g. Retina, FreeAnchor, etc.
|
||||
with_score_factors = False
|
||||
else:
|
||||
# e.g. FCOS, PAA, ATSS, etc.
|
||||
with_score_factors = True
|
||||
|
||||
cfg = self.test_cfg if cfg is None else cfg
|
||||
img_shape = img_meta['img_shape']
|
||||
nms_pre = cfg.get('nms_pre', -1)
|
||||
|
||||
mlvl_bboxes = []
|
||||
mlvl_scores = []
|
||||
mlvl_labels = []
|
||||
if with_score_factors:
|
||||
mlvl_score_factors = []
|
||||
else:
|
||||
mlvl_score_factors = None
|
||||
for level_idx, (cls_score, bbox_pred, score_factor, priors) in \
|
||||
enumerate(zip(cls_score_list, bbox_pred_list,
|
||||
score_factor_list, mlvl_priors)):
|
||||
|
||||
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
|
||||
|
||||
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
|
||||
if with_score_factors:
|
||||
score_factor = score_factor.permute(1, 2,
|
||||
0).reshape(-1).sigmoid()
|
||||
cls_score = cls_score.permute(1, 2,
|
||||
0).reshape(-1, self.cls_out_channels)
|
||||
if self.use_sigmoid_cls:
|
||||
scores = cls_score.sigmoid()
|
||||
else:
|
||||
# remind that we set FG labels to [0, num_class-1]
|
||||
# since mmdet v2.0
|
||||
# BG cat_id: num_class
|
||||
scores = cls_score.softmax(-1)[:, :-1]
|
||||
|
||||
# After https://github.com/open-mmlab/mmdetection/pull/6268/,
|
||||
# this operation keeps fewer bboxes under the same `nms_pre`.
|
||||
# There is no difference in performance for most models. If you
|
||||
# find a slight drop in performance, you can set a larger
|
||||
# `nms_pre` than before.
|
||||
results = filter_scores_and_topk(
|
||||
scores, cfg.score_thr, nms_pre,
|
||||
dict(bbox_pred=bbox_pred, priors=priors))
|
||||
scores, labels, keep_idxs, filtered_results = results
|
||||
|
||||
bbox_pred = filtered_results['bbox_pred']
|
||||
priors = filtered_results['priors']
|
||||
|
||||
if with_score_factors:
|
||||
score_factor = score_factor[keep_idxs]
|
||||
|
||||
bboxes = distance2bbox(priors, bbox_pred, max_shape=img_shape)
|
||||
|
||||
mlvl_bboxes.append(bboxes)
|
||||
mlvl_scores.append(scores)
|
||||
mlvl_labels.append(labels)
|
||||
if with_score_factors:
|
||||
mlvl_score_factors.append(score_factor)
|
||||
|
||||
return self._bbox_post_process(mlvl_scores, mlvl_labels, mlvl_bboxes,
|
||||
img_meta['scale_factor'], cfg, rescale,
|
||||
with_nms, mlvl_score_factors, **kwargs)
|
||||
|
||||
def _bbox_post_process(self,
|
||||
mlvl_scores,
|
||||
mlvl_labels,
|
||||
mlvl_bboxes,
|
||||
scale_factor,
|
||||
cfg,
|
||||
rescale=False,
|
||||
with_nms=True,
|
||||
mlvl_score_factors=None,
|
||||
**kwargs):
|
||||
"""bbox post-processing method.
|
||||
|
||||
The boxes are rescaled to the original image scale and NMS is
applied. Usually with_nms=False is used for aug test.
|
||||
|
||||
Args:
|
||||
mlvl_scores (list[Tensor]): Box scores from all scale
|
||||
levels of a single image, each item has shape
|
||||
(num_bboxes, ).
|
||||
mlvl_labels (list[Tensor]): Box class labels from all scale
|
||||
levels of a single image, each item has shape
|
||||
(num_bboxes, ).
|
||||
mlvl_bboxes (list[Tensor]): Decoded bboxes from all scale
|
||||
levels of a single image, each item has shape (num_bboxes, 4).
|
||||
scale_factor (ndarray, optional): Scale factor of the image arranged
|
||||
as (w_scale, h_scale, w_scale, h_scale).
|
||||
cfg (mmcv.Config): Test / postprocessing configuration,
|
||||
if None, test_cfg would be used.
|
||||
rescale (bool): If True, return boxes in original image space.
|
||||
Default: False.
|
||||
with_nms (bool): If True, do nms before return boxes.
|
||||
Default: True.
|
||||
mlvl_score_factors (list[Tensor], optional): Score factor from
|
||||
all scale levels of a single image, each item has shape
|
||||
(num_bboxes, ). Default: None.
|
||||
|
||||
Returns:
|
||||
tuple[Tensor]: Results of detected bboxes and labels. If with_nms
|
||||
is False and mlvl_score_factor is None, return mlvl_bboxes and
|
||||
mlvl_scores, else return mlvl_bboxes, mlvl_scores and
|
||||
mlvl_score_factor. Usually with_nms=False is used for aug
|
||||
test. If with_nms is True, then return the following format
|
||||
|
||||
- det_bboxes (Tensor): Predicted bboxes with shape \
|
||||
[num_bboxes, 5], where the first 4 columns are bounding \
|
||||
box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
|
||||
column are scores between 0 and 1.
|
||||
- det_labels (Tensor): Predicted labels of the corresponding \
|
||||
box with shape [num_bboxes].
|
||||
"""
|
||||
assert len(mlvl_scores) == len(mlvl_bboxes) == len(mlvl_labels)
|
||||
|
||||
mlvl_bboxes = torch.cat(mlvl_bboxes)
|
||||
if rescale:
|
||||
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
|
||||
mlvl_scores = torch.cat(mlvl_scores)
|
||||
mlvl_labels = torch.cat(mlvl_labels)
|
||||
|
||||
if mlvl_score_factors is not None:
|
||||
# TODO: Add sqrt operation in order to be consistent with
|
||||
# the paper.
|
||||
mlvl_score_factors = torch.cat(mlvl_score_factors)
|
||||
mlvl_scores = mlvl_scores * mlvl_score_factors
|
||||
|
||||
if with_nms:
|
||||
if mlvl_bboxes.numel() == 0:
|
||||
det_bboxes = torch.cat([mlvl_bboxes, mlvl_scores[:, None]], -1)
|
||||
return det_bboxes, mlvl_labels
|
||||
|
||||
det_bboxes, keep_idxs = batched_nms(mlvl_bboxes, mlvl_scores,
|
||||
mlvl_labels, cfg.nms)
|
||||
det_bboxes = det_bboxes[:cfg.max_per_img]
|
||||
det_labels = mlvl_labels[keep_idxs][:cfg.max_per_img]
|
||||
return det_bboxes, det_labels
|
||||
else:
|
||||
return mlvl_bboxes, mlvl_scores, mlvl_labels
|
|
@ -1 +1,2 @@
from .fpn import FPN
from .sfp import SFP
@ -0,0 +1,204 @@
|
|||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from mmcv.cnn import ConvModule
|
||||
|
||||
from easycv.models.registry import NECKS
|
||||
|
||||
|
||||
@NECKS.register_module()
|
||||
class FPN(nn.Module):
|
||||
r"""Feature Pyramid Network.
|
||||
This is an implementation of paper `Feature Pyramid Networks for Object
|
||||
Detection <https://arxiv.org/abs/1612.03144>`_.
|
||||
Args:
|
||||
in_channels (list[int]): Number of input channels per scale.
|
||||
out_channels (int): Number of output channels (used at each scale).
|
||||
num_outs (int): Number of output scales.
|
||||
start_level (int): Index of the start input backbone level used to
|
||||
build the feature pyramid. Default: 0.
|
||||
end_level (int): Index of the end input backbone level (exclusive) to
|
||||
build the feature pyramid. Default: -1, which means the last level.
|
||||
add_extra_convs (bool | str): If bool, it decides whether to add conv
|
||||
layers on top of the original feature maps. Defaults to False.
|
||||
If True, it is equivalent to `add_extra_convs='on_input'`.
|
||||
If str, it specifies the source feature map of the extra convs.
|
||||
Only the following options are allowed
|
||||
- 'on_input': Last feat map of neck inputs (i.e. backbone feature).
|
||||
- 'on_lateral': Last feature map after lateral convs.
|
||||
- 'on_output': The last output feature map after fpn convs.
|
||||
relu_before_extra_convs (bool): Whether to apply relu before the extra
|
||||
conv. Default: False.
|
||||
no_norm_on_lateral (bool): Whether to apply norm on lateral.
|
||||
Default: False.
|
||||
conv_cfg (dict): Config dict for convolution layer. Default: None.
|
||||
norm_cfg (dict): Config dict for normalization layer. Default: None.
|
||||
act_cfg (dict): Config dict for activation layer in ConvModule.
|
||||
Default: None.
|
||||
upsample_cfg (dict): Config dict for interpolate layer.
|
||||
Default: dict(mode='nearest').
|
||||
init_cfg (dict or list[dict], optional): Initialization config dict.
|
||||
Example:
|
||||
>>> import torch
|
||||
>>> in_channels = [2, 3, 5, 7]
|
||||
>>> scales = [340, 170, 84, 43]
|
||||
>>> inputs = [torch.rand(1, c, s, s)
|
||||
... for c, s in zip(in_channels, scales)]
|
||||
>>> self = FPN(in_channels, 11, len(in_channels)).eval()
|
||||
>>> outputs = self.forward(inputs)
|
||||
>>> for i in range(len(outputs)):
|
||||
... print(f'outputs[{i}].shape = {outputs[i].shape}')
|
||||
outputs[0].shape = torch.Size([1, 11, 340, 340])
|
||||
outputs[1].shape = torch.Size([1, 11, 170, 170])
|
||||
outputs[2].shape = torch.Size([1, 11, 84, 84])
|
||||
outputs[3].shape = torch.Size([1, 11, 43, 43])
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
num_outs,
|
||||
start_level=0,
|
||||
end_level=-1,
|
||||
add_extra_convs=False,
|
||||
relu_before_extra_convs=False,
|
||||
no_norm_on_lateral=False,
|
||||
conv_cfg=None,
|
||||
norm_cfg=None,
|
||||
act_cfg=None,
|
||||
upsample_cfg=dict(mode='nearest')):
|
||||
# init_cfg=dict(
|
||||
# type='Xavier', layer='Conv2d', distribution='uniform')):
|
||||
super(FPN, self).__init__()
|
||||
assert isinstance(in_channels, list)
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.num_ins = len(in_channels)
|
||||
self.num_outs = num_outs
|
||||
self.relu_before_extra_convs = relu_before_extra_convs
|
||||
self.no_norm_on_lateral = no_norm_on_lateral
|
||||
self.upsample_cfg = upsample_cfg.copy()
|
||||
|
||||
if end_level == -1 or end_level == self.num_ins - 1:
|
||||
self.backbone_end_level = self.num_ins
|
||||
assert num_outs >= self.num_ins - start_level
|
||||
else:
|
||||
# if end_level is not the last level, no extra level is allowed
|
||||
self.backbone_end_level = end_level + 1
|
||||
assert end_level < self.num_ins
|
||||
assert num_outs == end_level - start_level + 1
|
||||
self.start_level = start_level
|
||||
self.end_level = end_level
|
||||
self.add_extra_convs = add_extra_convs
|
||||
assert isinstance(add_extra_convs, (str, bool))
|
||||
if isinstance(add_extra_convs, str):
|
||||
# Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
|
||||
assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
|
||||
elif add_extra_convs: # True
|
||||
self.add_extra_convs = 'on_input'
|
||||
|
||||
self.lateral_convs = nn.ModuleList()
|
||||
self.fpn_convs = nn.ModuleList()
|
||||
|
||||
for i in range(self.start_level, self.backbone_end_level):
|
||||
l_conv = ConvModule(
|
||||
in_channels[i],
|
||||
out_channels,
|
||||
1,
|
||||
conv_cfg=conv_cfg,
|
||||
norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
|
||||
act_cfg=act_cfg,
|
||||
inplace=False)
|
||||
fpn_conv = ConvModule(
|
||||
out_channels,
|
||||
out_channels,
|
||||
3,
|
||||
padding=1,
|
||||
conv_cfg=conv_cfg,
|
||||
norm_cfg=norm_cfg,
|
||||
act_cfg=act_cfg,
|
||||
inplace=False)
|
||||
|
||||
self.lateral_convs.append(l_conv)
|
||||
self.fpn_convs.append(fpn_conv)
|
||||
|
||||
# add extra conv layers (e.g., RetinaNet)
|
||||
extra_levels = num_outs - self.backbone_end_level + self.start_level
|
||||
if self.add_extra_convs and extra_levels >= 1:
|
||||
for i in range(extra_levels):
|
||||
if i == 0 and self.add_extra_convs == 'on_input':
|
||||
in_channels = self.in_channels[self.backbone_end_level - 1]
|
||||
else:
|
||||
in_channels = out_channels
|
||||
extra_fpn_conv = ConvModule(
|
||||
in_channels,
|
||||
out_channels,
|
||||
3,
|
||||
stride=2,
|
||||
padding=1,
|
||||
conv_cfg=conv_cfg,
|
||||
norm_cfg=norm_cfg,
|
||||
act_cfg=act_cfg,
|
||||
inplace=False)
|
||||
self.fpn_convs.append(extra_fpn_conv)
|
||||
|
||||
def init_weights(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
nn.init.xavier_uniform_(m.weight, gain=1)
|
||||
if hasattr(m, 'bias') and m.bias is not None:
|
||||
nn.init.constant_(m.bias, 0)
|
||||
|
||||
def forward(self, inputs):
|
||||
"""Forward function."""
|
||||
assert len(inputs) == len(self.in_channels)
|
||||
|
||||
# build laterals
|
||||
laterals = [
|
||||
lateral_conv(inputs[i + self.start_level])
|
||||
for i, lateral_conv in enumerate(self.lateral_convs)
|
||||
]
|
||||
|
||||
# build top-down path
|
||||
used_backbone_levels = len(laterals)
|
||||
for i in range(used_backbone_levels - 1, 0, -1):
|
||||
# In some cases, fixing `scale factor` (e.g. 2) is preferred, but
|
||||
# it cannot co-exist with `size` in `F.interpolate`.
|
||||
if 'scale_factor' in self.upsample_cfg:
|
||||
# fix runtime error of "+=" inplace operation in PyTorch 1.10
|
||||
laterals[i - 1] = laterals[i - 1] + F.interpolate(
|
||||
laterals[i], **self.upsample_cfg)
|
||||
else:
|
||||
prev_shape = laterals[i - 1].shape[2:]
|
||||
laterals[i - 1] = laterals[i - 1] + F.interpolate(
|
||||
laterals[i], size=prev_shape, **self.upsample_cfg)
|
||||
|
||||
# build outputs
|
||||
# part 1: from original levels
|
||||
outs = [
|
||||
self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
|
||||
]
|
||||
# part 2: add extra levels
|
||||
if self.num_outs > len(outs):
|
||||
# use max pool to get more levels on top of outputs
|
||||
# (e.g., Faster R-CNN, Mask R-CNN)
|
||||
if not self.add_extra_convs:
|
||||
for i in range(self.num_outs - used_backbone_levels):
|
||||
outs.append(F.max_pool2d(outs[-1], 1, stride=2))
|
||||
# add conv layers on top of original feature maps (RetinaNet)
|
||||
else:
|
||||
if self.add_extra_convs == 'on_input':
|
||||
extra_source = inputs[self.backbone_end_level - 1]
|
||||
elif self.add_extra_convs == 'on_lateral':
|
||||
extra_source = laterals[-1]
|
||||
elif self.add_extra_convs == 'on_output':
|
||||
extra_source = outs[-1]
|
||||
else:
|
||||
raise NotImplementedError
|
||||
outs.append(self.fpn_convs[used_backbone_levels](extra_source))
|
||||
for i in range(used_backbone_levels + 1, self.num_outs):
|
||||
if self.relu_before_extra_convs:
|
||||
outs.append(self.fpn_convs[i](F.relu(outs[-1])))
|
||||
else:
|
||||
outs.append(self.fpn_convs[i](outs[-1]))
|
||||
return tuple(outs)
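        # Shape sketch (assuming the FCOS config in this commit: start_level=1,
        # num_outs=5, add_extra_convs='on_output'): C2..C5 inputs produce
        # P3..P7 outputs, all with out_channels=256 and strides 8, 16, 32, 64
        # and 128 relative to the padded input.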
|
|
@ -1,8 +1,8 @@
#!/usr/bin/env python3
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
from .boxes import (bbox2result, bbox_overlaps, bboxes_iou, box_cxcywh_to_xyxy,
                    box_xyxy_to_cxcywh, distance2bbox, generalized_box_iou,
                    postprocess)
from .boxes import (batched_nms, bbox2result, bbox_overlaps, bboxes_iou,
                    box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, distance2bbox,
                    generalized_box_iou, postprocess)
from .generator import MlvlPointGenerator
from .matcher import HungarianMatcher
from .misc import (accuracy, filter_scores_and_topk, fp16_clamp, interpolate,
@ -5,7 +5,7 @@ from distutils.version import LooseVersion
import numpy as np
import torch
import torchvision
from torchvision.ops.boxes import box_area
from torchvision.ops.boxes import box_area, nms

from easycv.models.detection.utils.misc import fp16_clamp

@ -408,3 +408,102 @@ def distance2bbox(points, distance, max_shape=None):
        bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)

    return bboxes


def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False):
    r"""Performs non-maximum suppression in a batched fashion.

    Modified from `torchvision/ops/boxes.py#L39
    <https://github.com/pytorch/vision/blob/
    505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39>`_.
    In order to perform NMS independently per class, we add an offset to all
    the boxes. The offset is dependent only on the class idx, and is large
    enough so that boxes from different classes do not overlap.

    Note:
        In v1.4.1 and later, ``batched_nms`` supports skipping the NMS and
        returns sorted raw results when `nms_cfg` is None.

    Args:
        boxes (torch.Tensor): boxes in shape (N, 4).
        scores (torch.Tensor): scores in shape (N, ).
        idxs (torch.Tensor): each index value corresponds to a bbox cluster,
            and NMS will not be applied between elements of different idxs,
            shape (N, ).
        nms_cfg (dict | None): Supports skipping the nms when `nms_cfg`
            is None, otherwise it should specify nms type and other
            parameters like `iou_thr`. Possible keys include the following.

            - iou_thr (float): IoU threshold used for NMS.
            - split_thr (float): threshold number of boxes. In some cases the
              number of boxes is large (e.g., 200k). To avoid OOM during
              training, the users could set `split_thr` to a small value.
              If the number of boxes is greater than the threshold, it will
              perform NMS on each group of boxes separately and sequentially.
              Defaults to 10000.
        class_agnostic (bool): if true, nms is class agnostic,
            i.e. IoU thresholding happens over all boxes,
            regardless of the predicted class.

    Returns:
        tuple: kept dets and indices.

        - boxes (Tensor): Bboxes with score after nms, has shape
          (num_bboxes, 5). The last dimension is arranged as
          (x1, y1, x2, y2, score).
        - keep (Tensor): The indices of remaining boxes in input
          boxes.
    """
    # skip nms when nms_cfg is None
    if nms_cfg is None:
        scores, inds = scores.sort(descending=True)
        boxes = boxes[inds]
        return torch.cat([boxes, scores[:, None]], -1), inds

    nms_cfg_ = nms_cfg.copy()
    class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic)
    if class_agnostic:
        boxes_for_nms = boxes
    else:
        # offset boxes by class id so that boxes of different classes never
        # overlap and a single NMS call can be shared across classes
        max_coordinate = boxes.max()
        offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
        boxes_for_nms = boxes + offsets[:, None]

    nms_type = nms_cfg_.pop('type', 'nms')
    nms_op = eval(nms_type)

    split_thr = nms_cfg_.pop('split_thr', 10000)
    # Won't split to multiple nms nodes when exporting to onnx
    if boxes_for_nms.shape[0] < split_thr or torch.onnx.is_in_onnx_export():
        keep = nms(boxes_for_nms, scores, **nms_cfg_)
        boxes = boxes[keep]

        # This assumes `dets` has arbitrary dimensions where
        # the last dimension is score.
        # Currently it supports bounding boxes [x1, y1, x2, y2, score] or
        # rotated boxes [cx, cy, w, h, angle_radian, score].
        scores = scores[keep]
    else:
        max_num = nms_cfg_.pop('max_num', -1)
        total_mask = scores.new_zeros(scores.size(), dtype=torch.bool)
        # Some type of nms would reweight the score, such as SoftNMS
        scores_after_nms = scores.new_zeros(scores.size())
        for id in torch.unique(idxs):
            mask = (idxs == id).nonzero(as_tuple=False).view(-1)
            keep = nms(boxes_for_nms[mask], scores[mask], **nms_cfg_)
            total_mask[mask[keep]] = True
            scores_after_nms[mask[keep]] = scores[keep]
        keep = total_mask.nonzero(as_tuple=False).view(-1)

        scores, inds = scores_after_nms[keep].sort(descending=True)
        keep = keep[inds]
        boxes = boxes[keep]

        if max_num > 0:
            keep = keep[:max_num]
            boxes = boxes[:max_num]
            scores = scores[:max_num]

    boxes = torch.cat([boxes, scores[:, None]], -1)
    return boxes, keep
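A minimal usage sketch of the helper added above. The import path is an assumption inferred from the __init__.py hunk earlier in this commit; since the leftover config keys are forwarded to torchvision's nms, the IoU key is iou_threshold here:

import torch

# Assumed import path, based on the export added in this commit.
from easycv.models.detection.utils import batched_nms

boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [50., 50., 60., 60.]])
scores = torch.tensor([0.9, 0.8, 0.7])
labels = torch.tensor([0, 0, 1])  # per-box class ids

# 'type' is popped inside batched_nms; iou_threshold is forwarded to
# torchvision's nms, so per-class suppression uses IoU > 0.5.
dets, keep = batched_nms(boxes, scores, labels,
                         dict(type='nms', iou_threshold=0.5))
print(dets.shape)  # torch.Size([2, 5]): the two overlapping class-0 boxes collapse to one
print(keep)        # indices of the kept boxes in the input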
@ -0,0 +1,206 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest

import numpy as np
import torch
from mmcv.parallel import collate, scatter
from numpy.testing import assert_array_almost_equal
from torchvision.transforms import Compose

from easycv.datasets.registry import PIPELINES
from easycv.datasets.utils import replace_ImageToTensor
from easycv.models import build_model
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.config_tools import mmcv_config_fromfile
from easycv.utils.registry import build_from_cfg


class FCOSTest(unittest.TestCase):

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))

    def init_fcos(self, model_path, config_path):
        self.model_path = model_path

        self.cfg = mmcv_config_fromfile(config_path)

        # modify model_config
        if self.cfg.model.head.test_cfg.get('max_per_img', None):
            self.cfg.model.head.test_cfg.max_per_img = 10

        # build model
        self.model = build_model(self.cfg.model)

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        map_location = 'cpu' if self.device == 'cpu' else 'cuda'
        self.ckpt = load_checkpoint(
            self.model, self.model_path, map_location=map_location)

        self.model.to(self.device)
        self.model.eval()

        self.CLASSES = self.cfg.CLASSES

    def predict(self, imgs):
        """Inference image(s) with the detector.

        Args:
            imgs (str/ndarray or list[str/ndarray] or tuple[str/ndarray]):
                Either image files or loaded images.

        Returns:
            If imgs is a list or tuple, a list of results with the same length
            is returned; otherwise the detection results are returned directly.
        """

        if isinstance(imgs, (list, tuple)):
            is_batch = True
        else:
            imgs = [imgs]
            is_batch = False

        cfg = self.cfg
        device = next(self.model.parameters()).device  # model device

        if isinstance(imgs[0], np.ndarray):
            cfg = cfg.copy()
            # set loading pipeline type
            cfg.data.val.pipeline.insert(
                0,
                dict(
                    type='LoadImageFromWebcam',
                    file_client_args=dict(backend='http')))
        else:
            cfg = cfg.copy()
            # set loading pipeline type
            cfg.data.val.pipeline.insert(
                0,
                dict(
                    type='LoadImageFromFile',
                    file_client_args=dict(backend='http')))

        cfg.data.val.pipeline = replace_ImageToTensor(cfg.data.val.pipeline)

        transforms = []
        for transform in cfg.data.val.pipeline:
            if 'img_scale' in transform:
                transform['img_scale'] = tuple(transform['img_scale'])
            if isinstance(transform, dict):
                transform = build_from_cfg(transform, PIPELINES)
                transforms.append(transform)
            elif callable(transform):
                transforms.append(transform)
            else:
                raise TypeError('transform must be callable or a dict')
        test_pipeline = Compose(transforms)

        datas = []
        for img in imgs:
            # prepare data
            if isinstance(img, np.ndarray):
                # directly add img
                data = dict(img=img)
            else:
                # add information into dict
                data = dict(img_info=dict(filename=img), img_prefix=None)
            # build the data pipeline
            data = test_pipeline(data)
            datas.append(data)

        data = collate(datas, samples_per_gpu=len(imgs))
        # just get the actual data from DataContainer
        data['img_metas'] = [
            img_metas.data[0] for img_metas in data['img_metas']
        ]
        data['img'] = [img.data[0] for img in data['img']]
        if next(self.model.parameters()).is_cuda:
            # scatter to specified GPU
            data = scatter(data, [device])[0]

        # forward the model
        with torch.no_grad():
            results = self.model(mode='test', **data)

        return results

    def test_fcos(self):
        model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/fcos/epoch_12.pth'
        config_path = 'configs/detection/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py'
        img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/demo.jpg'
        self.init_fcos(model_path, config_path)
        output = self.predict(img)

        self.assertIn('detection_boxes', output)
        self.assertIn('detection_scores', output)
        self.assertIn('detection_classes', output)
        self.assertIn('img_metas', output)
        self.assertEqual(len(output['detection_boxes'][0]), 10)
        self.assertEqual(len(output['detection_scores'][0]), 10)
        self.assertEqual(len(output['detection_classes'][0]), 10)

        print(output['detection_boxes'][0].tolist())
        print(output['detection_scores'][0].tolist())
        print(output['detection_classes'][0].tolist())

        self.assertListEqual(
            output['detection_classes'][0].tolist(),
            np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 13], dtype=np.int32).tolist())

        assert_array_almost_equal(
            output['detection_scores'][0],
            np.array([
                0.6641181707382202, 0.6135501265525818, 0.5985610485076904,
                0.5694775581359863, 0.5586040616035461, 0.5209507942199707,
                0.5056729912757874, 0.4943872094154358, 0.4850597083568573,
                0.45443734526634216
            ],
                     dtype=np.float32),
            decimal=2)

        assert_array_almost_equal(
            output['detection_boxes'][0],
            np.array([[
                295.5196228027344, 116.56035614013672, 380.0883483886719,
                150.24908447265625
            ],
                      [
                          190.57131958007812, 108.96343231201172,
                          297.7738037109375, 154.69515991210938
                      ],
                      [
                          480.5726013183594, 110.4341812133789,
                          522.8551635742188, 129.9452667236328
                      ],
                      [
                          431.1232604980469, 105.17676544189453,
                          483.89617919921875, 131.85870361328125
                      ],
                      [
                          398.6544494628906, 110.90837860107422,
                          432.6370849609375, 132.89173889160156
                      ],
                      [
                          609.3126831054688, 111.62432861328125,
                          635.4577026367188, 137.03529357910156
                      ],
                      [
                          98.66332244873047, 89.88417053222656,
                          118.9398422241211, 101.25397491455078
                      ],
                      [
                          167.9045867919922, 109.57560729980469,
                          209.74375915527344, 139.98898315429688
                      ],
                      [
                          591.0496826171875, 110.55867767333984,
                          619.4395751953125, 126.65755462646484
                      ],
                      [
                          218.92051696777344, 177.0509033203125,
                          455.8321838378906, 385.0356140136719
                      ]]),
            decimal=1)


if __name__ == '__main__':
    unittest.main()
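A small sketch of how a caller might consume the prediction dict checked above. The key names follow the assertions in test_fcos; the score threshold is an arbitrary value chosen for illustration:

import numpy as np

def keep_confident(output, score_thr=0.5, idx=0):
    # Filter one image's detections by score; keys follow the test assertions above.
    boxes = np.asarray(output['detection_boxes'][idx])
    scores = np.asarray(output['detection_scores'][idx])
    classes = np.asarray(output['detection_classes'][idx])
    mask = scores >= score_thr
    return boxes[mask], scores[mask], classes[mask]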