Refactor ViTDet backbone and simple feature pyramid (#177)

1. The ViTDet backbone implemented by detectron2 (d2) is about 20% faster than the ViTDet backbone originally reproduced by EasyCV, so this change ports the d2 implementation.
2. bbox mAP improves from 50.57 to 50.65 (mask mAP from 44.96 to 45.41).
tuofeilun 2022-09-16 11:03:53 +08:00 committed by GitHub
parent 1d5edf6d78
commit 9f01a37ad4
20 changed files with 925 additions and 995 deletions


@@ -101,13 +101,15 @@ val_dataset = dict(
    pipeline=test_pipeline)
data = dict(
-    imgs_per_gpu=1, workers_per_gpu=2, train=train_dataset, val=val_dataset)
+    imgs_per_gpu=4, workers_per_gpu=2, train=train_dataset, val=val_dataset
+)  # 64 (total batch size) = 4 (batch size per gpu) x 8 (gpu num) x 2 (node num)
# evaluation
-eval_config = dict(interval=1, gpu_collect=False)
+eval_config = dict(initial=False, interval=1, gpu_collect=False)
eval_pipelines = [
    dict(
        mode='test',
        # dist_eval=True,
        evaluators=[
            dict(type='CocoDetectionEvaluator', classes=CLASSES),
        ],


@@ -101,13 +101,15 @@ val_dataset = dict(
    pipeline=test_pipeline)
data = dict(
-    imgs_per_gpu=1, workers_per_gpu=2, train=train_dataset, val=val_dataset)
+    imgs_per_gpu=4, workers_per_gpu=2, train=train_dataset, val=val_dataset
+)  # 64 (total batch size) = 4 (batch size per gpu) x 8 (gpu num) x 2 (node num)
# evaluation
-eval_config = dict(interval=1, gpu_collect=False)
+eval_config = dict(initial=False, interval=1, gpu_collect=False)
eval_pipelines = [
    dict(
        mode='test',
        # dist_eval=True,
        evaluators=[
            dict(type='CocoDetectionEvaluator', classes=CLASSES),
            dict(type='CocoMaskEvaluator', classes=CLASSES)


@@ -1,3 +0,0 @@
-_base_ = './vitdet_100e.py'
-
-model = dict(backbone=dict(aggregation='basicblock'))


@@ -1,3 +0,0 @@
-_base_ = './vitdet_100e.py'
-
-model = dict(backbone=dict(aggregation='bottleneck'))


@@ -0,0 +1,231 @@
# model settings
norm_cfg = dict(type='GN', num_groups=1, eps=1e-6, requires_grad=True)
pretrained = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/warpper_mae_vit-base-p16-1600e.pth'
model = dict(
type='CascadeRCNN',
pretrained=pretrained,
backbone=dict(
type='ViTDet',
img_size=1024,
patch_size=16,
embed_dim=768,
depth=12,
num_heads=12,
drop_path_rate=0.1,
window_size=14,
mlp_ratio=4,
qkv_bias=True,
window_block_indexes=[
            # 2, 5, 8, 11 for global attention
0,
1,
3,
4,
6,
7,
9,
10,
],
residual_block_indexes=[],
use_rel_pos=True),
neck=dict(
type='SFP',
in_channels=768,
out_channels=256,
scale_factors=(4.0, 2.0, 1.0, 0.5),
norm_cfg=norm_cfg,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
num_convs=2,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
roi_head=dict(
type='CascadeRoIHead',
num_stages=3,
stage_loss_weights=[1, 0.5, 0.25],
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=[
dict(
type='Shared4Conv1FCBBoxHead',
conv_out_channels=256,
norm_cfg=norm_cfg,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='Shared4Conv1FCBBoxHead',
conv_out_channels=256,
norm_cfg=norm_cfg,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.05, 0.05, 0.1, 0.1]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='Shared4Conv1FCBBoxHead',
conv_out_channels=256,
norm_cfg=norm_cfg,
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.033, 0.033, 0.067, 0.067]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
],
mask_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
mask_head=dict(
type='FCNMaskHead',
norm_cfg=norm_cfg,
num_convs=4,
in_channels=256,
conv_out_channels=256,
num_classes=80,
loss_mask=dict(
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_pre=2000,
max_per_img=2000,
nms=dict(type='nms', iou_threshold=0.7),
min_bbox_size=0),
rcnn=[
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.6,
neg_iou_thr=0.6,
min_pos_iou=0.6,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.7,
min_pos_iou=0.7,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False)
]),
test_cfg=dict(
rpn=dict(
nms_pre=1000,
max_per_img=1000,
nms=dict(type='nms', iou_threshold=0.7),
min_bbox_size=0),
rcnn=dict(
score_thr=0.05,
nms=dict(type='nms', iou_threshold=0.5),
max_per_img=100,
mask_thr_binary=0.5)))
mmlab_modules = [
dict(type='mmdet', name='CascadeRCNN', module='model'),
dict(type='mmdet', name='RPNHead', module='head'),
dict(type='mmdet', name='CascadeRoIHead', module='head'),
]
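Note: the `window_block_indexes` above follow the ViTDet recipe of interleaving one global-attention block after every three windowed blocks, so only blocks 2, 5, 8 and 11 attend globally. A minimal sketch of the derivation (plain Python, not part of the config):

# Blocks 2, 5, 8, 11 (one per 3-block group) keep global attention;
# every other block of the 12-block ViT-Base uses 14x14 window attention.
depth = 12
global_attn_indexes = [2, 5, 8, 11]
window_block_indexes = [i for i in range(depth) if i not in global_attn_indexes]
assert window_block_indexes == [0, 1, 3, 4, 6, 7, 9, 10]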


@@ -0,0 +1,4 @@
_base_ = [
'./vitdet_cascade_mask_rcnn.py', './lsj_coco_instance.py',
'./vitdet_schedule_100e.py'
]


@@ -1,6 +1,6 @@
# model settings
-norm_cfg = dict(type='GN', num_groups=1, requires_grad=True)
+norm_cfg = dict(type='GN', num_groups=1, eps=1e-6, requires_grad=True)
pretrained = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/warpper_mae_vit-base-p16-1600e.pth'
model = dict(
@@ -9,22 +9,32 @@ model = dict(
    backbone=dict(
        type='ViTDet',
        img_size=1024,
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
+        drop_path_rate=0.1,
+        window_size=14,
        mlp_ratio=4,
        qkv_bias=True,
-        qk_scale=None,
-        drop_rate=0.,
-        attn_drop_rate=0.,
-        drop_path_rate=0.1,
-        use_abs_pos_emb=True,
-        aggregation='attn',
-    ),
+        window_block_indexes=[
+            # 2, 5, 8, 11 for global attention
+            0,
+            1,
+            3,
+            4,
+            6,
+            7,
+            9,
+            10,
+        ],
+        residual_block_indexes=[],
+        use_rel_pos=True),
    neck=dict(
        type='SFP',
-        in_channels=[768, 768, 768, 768],
+        in_channels=768,
        out_channels=256,
+        scale_factors=(4.0, 2.0, 1.0, 0.5),
+        norm_cfg=norm_cfg,
        num_outs=5),
    rpn_head=dict(
@@ -32,7 +42,6 @@ model = dict(
        in_channels=256,
        feat_channels=256,
        num_convs=2,
-        norm_cfg=norm_cfg,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
@@ -98,7 +107,7 @@ model = dict(
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
-                match_low_quality=True,
+                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',


@@ -1,4 +1,4 @@
_base_ = [
-    './vitdet_faster_rcnn.py', './lsj_coco_detection.py',
+    './vitdet_faster_rcnn.py', './lsj_coco_instance.py',
    './vitdet_schedule_100e.py'
]


@@ -1,6 +1,6 @@
# model settings
-norm_cfg = dict(type='GN', num_groups=1, requires_grad=True)
+norm_cfg = dict(type='GN', num_groups=1, eps=1e-6, requires_grad=True)
pretrained = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/selfsup/mae/vit-b-1600/warpper_mae_vit-base-p16-1600e.pth'
model = dict(
@@ -9,22 +9,32 @@ model = dict(
    backbone=dict(
        type='ViTDet',
        img_size=1024,
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
+        drop_path_rate=0.1,
+        window_size=14,
        mlp_ratio=4,
        qkv_bias=True,
-        qk_scale=None,
-        drop_rate=0.,
-        attn_drop_rate=0.,
-        drop_path_rate=0.1,
-        use_abs_pos_emb=True,
-        aggregation='attn',
-    ),
+        window_block_indexes=[
+            # 2, 5, 8, 11 for global attention
+            0,
+            1,
+            3,
+            4,
+            6,
+            7,
+            9,
+            10,
+        ],
+        residual_block_indexes=[],
+        use_rel_pos=True),
    neck=dict(
        type='SFP',
-        in_channels=[768, 768, 768, 768],
+        in_channels=768,
        out_channels=256,
+        scale_factors=(4.0, 2.0, 1.0, 0.5),
+        norm_cfg=norm_cfg,
        num_outs=5),
    rpn_head=dict(
@@ -32,7 +42,6 @@ model = dict(
        in_channels=256,
        feat_channels=256,
        num_convs=2,
-        norm_cfg=norm_cfg,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
@@ -112,7 +121,7 @@ model = dict(
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
-                match_low_quality=True,
+                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',


@@ -1,26 +1,29 @@
_base_ = 'configs/base.py'
log_config = dict(
    interval=200,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])
checkpoint_config = dict(interval=10)
# optimizer
-paramwise_options = {
-    'norm': dict(weight_decay=0.),
-    'bias': dict(weight_decay=0.),
-    'pos_embed': dict(weight_decay=0.),
-    'cls_token': dict(weight_decay=0.)
-}
optimizer = dict(
    type='AdamW',
    lr=1e-4,
    betas=(0.9, 0.999),
    weight_decay=0.1,
-    paramwise_options=paramwise_options)
-optimizer_config = dict(grad_clip=None, loss_scale=512.)
+    constructor='LayerDecayOptimizerConstructor',
+    paramwise_options=dict(num_layers=12, layer_decay_rate=0.7))
+optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=250,
-    warmup_ratio=0.067,
+    warmup_ratio=0.001,
    step=[88, 96])
total_epochs = 100
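Note: with `constructor='LayerDecayOptimizerConstructor'`, each parameter group's learning rate is the base lr scaled by layer_decay_rate ** (num_layers + 1 - layer_id), so shallower ViT blocks train more slowly than the heads. A minimal sketch of the resulting schedule, using the values from the config above:

base_lr = 1e-4
num_layers = 12
layer_decay_rate = 0.7

# layer_id 0 is patch_embed/pos_embed, 1..12 are the ViT blocks, and
# num_layers + 1 (= 13) is everything outside the backbone.
for layer_id in range(num_layers + 2):
    scale = layer_decay_rate**(num_layers + 1 - layer_id)
    print('layer %2d: lr = %.2e' % (layer_id, base_lr * scale))
# layer 0 trains at ~9.7e-07 while layer 13 (the heads) keeps the full 1e-04.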


@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:ee64c0caef841c61c7e6344b7fe2c07a38fba07a8de81ff38c0686c641e0a283
-size 190356
+oid sha256:c696a58a2963b5ac47317751f04ff45bfed4723f2f70bacf91eac711f9710e54
+size 189432


@@ -22,7 +22,7 @@ Pretrained on COCO2017 dataset. (The result has been optimized with PAI-Blade, a
| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) | bbox_mAP<sup>val<br/><sub>0.5:0.95</sub> | mask_mAP<sup>val<br/><sub>0.5:0.95</sub> | Download |
| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
-| ViTDet_MaskRCNN | [vitdet_maskrcnn](https://github.com/alibaba/EasyCV/tree/master/configs/detection/vitdet/vitdet_100e.py) | 88M/118M | 163ms | 50.57 | 44.96 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn.log.json) |
+| ViTDet_MaskRCNN | [vitdet_maskrcnn](https://github.com/alibaba/EasyCV/tree/master/configs/detection/vitdet/vitdet_mask_rcnn_100e.py) | 86M/111M | 138ms | 50.65 | 45.41 | [model](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100.pth) - [log](https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/20220901_135827.log.json) |
## FCOS


@@ -1,5 +1,3 @@
-# Reference from https://github.com/ViTAE-Transformer/ViTDet/blob/main/mmcv_custom/layer_decay_optimizer_constructor.py
import json

from mmcv.runner import DefaultOptimizerConstructor, get_dist_info
@@ -7,23 +5,32 @@ from mmcv.runner import DefaultOptimizerConstructor, get_dist_info
from .builder import OPTIMIZER_BUILDERS


-def get_num_layer_for_vit(var_name, num_max_layer, layer_sep=None):
-    if var_name in ('backbone.cls_token', 'backbone.mask_token',
-                    'backbone.pos_embed'):
-        return 0
-    elif var_name.startswith('backbone.patch_embed'):
-        return 0
-    elif var_name.startswith('backbone.blocks'):
-        layer_id = int(var_name.split('.')[2])
-        return layer_id + 1
-    else:
-        return num_max_layer - 1
+def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
+    """
+    Calculate lr decay rate for different ViT blocks.
+    Reference from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py
+    Args:
+        name (string): parameter name.
+        lr_decay_rate (float): base lr decay rate.
+        num_layers (int): number of ViT blocks.
+    Returns:
+        lr decay rate for the given parameter.
+    """
+    layer_id = num_layers + 1
+    if '.pos_embed' in name or '.patch_embed' in name:
+        layer_id = 0
+    elif '.blocks.' in name and '.residual.' not in name:
+        layer_id = int(name[name.find('.blocks.'):].split('.')[2]) + 1
+    scale = lr_decay_rate**(num_layers + 1 - layer_id)
+    return layer_id, scale


@OPTIMIZER_BUILDERS.register_module()
class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor):

-    def add_params(self, params, module, prefix='', is_dcn_module=None):
+    def add_params(self, params, module):
        """Add all parameters of module to the params list.
        The parameters of the given module will be added to the list of param
        groups, with specific rules defined by paramwise_cfg.
@@ -31,54 +38,41 @@ class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor):
            params (list[dict]): A list of param groups, it will be modified
                in place.
            module (nn.Module): The module to be added.
-            prefix (str): The prefix of the module
-            is_dcn_module (int|float|None): If the current module is a
-                submodule of DCN, `is_dcn_module` will be passed to
-                control conv_offset layer's learning rate. Defaults to None.
+        Reference from https://github.com/ViTAE-Transformer/ViTDet/blob/main/mmcv_custom/layer_decay_optimizer_constructor.py
+        Note: Currently, this optimizer constructor is built for ViTDet.
        """
        # get param-wise options
        parameter_groups = {}
        print(self.paramwise_cfg)
-        num_layers = self.paramwise_cfg.get('num_layers') + 2
-        layer_sep = self.paramwise_cfg.get('layer_sep', None)
-        layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
+        lr_decay_rate = self.paramwise_cfg.get('layer_decay_rate')
+        num_layers = self.paramwise_cfg.get('num_layers')
        print('Build LayerDecayOptimizerConstructor %f - %d' %
-              (layer_decay_rate, num_layers))
+              (lr_decay_rate, num_layers))
        lr = self.base_lr
        weight_decay = self.base_wd
-        custom_keys = self.paramwise_cfg.get('custom_keys', {})
-        # first sort with alphabet order and then sort with reversed len of str
-        sorted_keys = sorted(custom_keys.keys())

        for name, param in module.named_parameters():
            if not param.requires_grad:
                continue  # frozen weights
-            if len(param.shape) == 1 or name.endswith('.bias') or (
-                    'pos_embed' in name) or ('cls_token'
-                                             in name) or ('rel_pos_' in name):
+            if 'backbone' in name and ('.norm' in name or '.pos_embed' in name
+                                       or '.gn.' in name or '.ln.' in name):
                group_name = 'no_decay'
                this_weight_decay = 0.
            else:
                group_name = 'decay'
                this_weight_decay = weight_decay

-            layer_id = get_num_layer_for_vit(name, num_layers, layer_sep)
+            if name.startswith('backbone'):
+                layer_id, scale = get_vit_lr_decay_rate(
+                    name, lr_decay_rate=lr_decay_rate, num_layers=num_layers)
+            else:
+                layer_id, scale = -1, 1
            group_name = 'layer_%d_%s' % (layer_id, group_name)

-            # if the parameter match one of the custom keys, ignore other rules
-            this_lr_multi = 1.
-            for key in sorted_keys:
-                if key in f'{name}':
-                    lr_mult = custom_keys[key].get('lr_mult', 1.)
-                    this_lr_multi = lr_mult
-                    group_name = '%s_%s' % (group_name, key)
-                    break

            if group_name not in parameter_groups:
-                scale = layer_decay_rate**(num_layers - layer_id - 1)
                parameter_groups[group_name] = {
                    'weight_decay': this_weight_decay,
@@ -86,7 +80,7 @@ class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor):
                    'param_names': [],
                    'lr_scale': scale,
                    'group_name': group_name,
-                    'lr': scale * self.base_lr * this_lr_multi,
+                    'lr': scale * lr,
                }

            parameter_groups[group_name]['params'].append(param)
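For intuition, a small sketch of what `get_vit_lr_decay_rate` returns for a few parameter names (hypothetical names following the `backbone.*` scheme assumed by the constructor above):

# Assumes get_vit_lr_decay_rate as defined above is in scope.
examples = [
    'backbone.pos_embed',                 # layer_id 0  -> scale 0.7**13
    'backbone.blocks.0.attn.qkv.weight',  # layer_id 1  -> scale 0.7**12
    'backbone.blocks.11.mlp.fc1.weight',  # layer_id 12 -> scale 0.7**1
    'backbone.norm.weight',               # layer_id 13 -> scale 1.0
]
for name in examples:
    layer_id, scale = get_vit_lr_decay_rate(name, lr_decay_rate=0.7, num_layers=12)
    print(name, layer_id, round(scale, 6))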

File diff suppressed because it is too large.


@@ -37,7 +37,6 @@ class FPN(nn.Module):
            Default: None.
        upsample_cfg (dict): Config dict for interpolate layer.
            Default: dict(mode='nearest').
-        init_cfg (dict or list[dict], optional): Initialization config dict.
    Example:
        >>> import torch
        >>> in_channels = [2, 3, 5, 7]
@@ -67,8 +66,6 @@ class FPN(nn.Module):
                 norm_cfg=None,
                 act_cfg=None,
                 upsample_cfg=dict(mode='nearest')):
-        # init_cfg=dict(
-        #     type='Xavier', layer='Conv2d', distribution='uniform')):
        super(FPN, self).__init__()
        assert isinstance(in_channels, list)
        self.in_channels = in_channels


@@ -2,26 +2,12 @@
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
-from mmcv.runner import BaseModule

from easycv.models.builder import NECKS


-class Norm2d(nn.Module):
-
-    def __init__(self, embed_dim):
-        super().__init__()
-        self.ln = nn.LayerNorm(embed_dim, eps=1e-6)
-
-    def forward(self, x):
-        x = x.permute(0, 2, 3, 1)
-        x = self.ln(x)
-        x = x.permute(0, 3, 1, 2).contiguous()
-        return x
-
-
@NECKS.register_module()
-class SFP(BaseModule):
+class SFP(nn.Module):
    r"""Simple Feature Pyramid.
    This is an implementation of paper `Exploring Plain Vision Transformer Backbones for Object Detection <https://arxiv.org/abs/2203.16527>`_.
    Args:
@@ -32,25 +18,12 @@ class SFP(BaseModule):
            build the feature pyramid. Default: 0.
        end_level (int): Index of the end input backbone level (exclusive) to
            build the feature pyramid. Default: -1, which means the last level.
-        add_extra_convs (bool | str): If bool, it decides whether to add conv
-            layers on top of the original feature maps. Default to False.
-            If True, it is equivalent to `add_extra_convs='on_input'`.
-            If str, it specifies the source feature map of the extra convs.
-            Only the following options are allowed
-            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
-            - 'on_lateral': Last feature map after lateral convs.
-            - 'on_output': The last output feature map after fpn convs.
-        relu_before_extra_convs (bool): Whether to apply relu before the extra
-            conv. Default: False.
-        no_norm_on_lateral (bool): Whether to apply norm on lateral.
-            Default: False.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (str): Config dict for activation layer in ConvModule.
            Default: None.
-        upsample_cfg (dict): Config dict for interpolate layer.
-            Default: `dict(mode='nearest')`
-        init_cfg (dict or list[dict], optional): Initialization config dict.
    Example:
        >>> import torch
        >>> in_channels = [2, 3, 5, 7]
@@ -70,158 +43,83 @@ class SFP(BaseModule):
    def __init__(self,
                 in_channels,
                 out_channels,
+                 scale_factors,
                 num_outs,
-                 start_level=0,
-                 end_level=-1,
-                 add_extra_convs=False,
-                 relu_before_extra_convs=False,
-                 no_norm_on_lateral=False,
                 conv_cfg=None,
                 norm_cfg=None,
-                 act_cfg=None,
-                 upsample_cfg=dict(mode='nearest'),
-                 init_cfg=[
-                     dict(
-                         type='Xavier',
-                         layer=['Conv2d'],
-                         distribution='uniform'),
-                     dict(type='Constant', layer=['LayerNorm'], val=1, bias=0)
-                 ]):
-        super(SFP, self).__init__(init_cfg)
-        assert isinstance(in_channels, list)
-        self.in_channels = in_channels
+                 act_cfg=None):
+        super(SFP, self).__init__()
+        dim = in_channels
        self.out_channels = out_channels
-        self.num_ins = len(in_channels)
+        self.scale_factors = scale_factors
+        self.num_ins = len(scale_factors)
        self.num_outs = num_outs
-        self.relu_before_extra_convs = relu_before_extra_convs
-        self.no_norm_on_lateral = no_norm_on_lateral
-        self.upsample_cfg = upsample_cfg.copy()
-        if end_level == -1:
-            self.backbone_end_level = self.num_ins
-            assert num_outs >= self.num_ins - start_level
-        else:
-            # if end_level < inputs, no extra level is allowed
-            self.backbone_end_level = end_level
-            assert end_level <= len(in_channels)
-            assert num_outs == end_level - start_level
-        self.start_level = start_level
-        self.end_level = end_level
-        self.add_extra_convs = add_extra_convs
-        assert isinstance(add_extra_convs, (str, bool))
-        if isinstance(add_extra_convs, str):
-            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
-            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
-        elif add_extra_convs:  # True
-            self.add_extra_convs = 'on_input'
-
-        self.top_downs = nn.ModuleList()
-        self.lateral_convs = nn.ModuleList()
-        self.fpn_convs = nn.ModuleList()
-
-        for i in range(self.start_level, self.backbone_end_level):
-            if i == 0:
-                top_down = nn.Sequential(
-                    nn.ConvTranspose2d(
-                        in_channels[i], in_channels[i], 2, stride=2,
-                        padding=0), Norm2d(in_channels[i]), nn.GELU(),
-                    nn.ConvTranspose2d(
-                        in_channels[i], in_channels[i], 2, stride=2,
-                        padding=0))
-            elif i == 1:
-                top_down = nn.ConvTranspose2d(
-                    in_channels[i], in_channels[i], 2, stride=2, padding=0)
-            elif i == 2:
-                top_down = nn.Identity()
-            elif i == 3:
-                top_down = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
-
-            l_conv = ConvModule(
-                in_channels[i],
-                out_channels,
-                1,
-                conv_cfg=conv_cfg,
-                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
-                act_cfg=act_cfg,
-                inplace=False)
-            fpn_conv = ConvModule(
-                out_channels,
-                out_channels,
-                3,
-                padding=1,
-                conv_cfg=conv_cfg,
-                norm_cfg=norm_cfg,
-                act_cfg=act_cfg,
-                inplace=False)
-            self.top_downs.append(top_down)
-            self.lateral_convs.append(l_conv)
-            self.fpn_convs.append(fpn_conv)
-
-        # add extra conv layers (e.g., RetinaNet)
-        extra_levels = num_outs - self.backbone_end_level + self.start_level
-        if self.add_extra_convs and extra_levels >= 1:
-            for i in range(extra_levels):
-                if i == 0 and self.add_extra_convs == 'on_input':
-                    in_channels = self.in_channels[self.backbone_end_level - 1]
-                else:
-                    in_channels = out_channels
-                extra_fpn_conv = ConvModule(
-                    in_channels,
-                    out_channels,
-                    3,
-                    stride=2,
-                    padding=1,
-                    conv_cfg=conv_cfg,
-                    norm_cfg=norm_cfg,
-                    act_cfg=act_cfg,
-                    inplace=False)
-                self.fpn_convs.append(extra_fpn_conv)
+        self.stages = []
+        for idx, scale in enumerate(scale_factors):
+            out_dim = dim
+            if scale == 4.0:
+                layers = [
+                    nn.ConvTranspose2d(dim, dim // 2, 2, stride=2, padding=0),
+                    nn.GroupNorm(1, dim // 2, eps=1e-6),
+                    nn.GELU(),
+                    nn.ConvTranspose2d(
+                        dim // 2, dim // 4, 2, stride=2, padding=0)
+                ]
+                out_dim = dim // 4
+            elif scale == 2.0:
+                layers = [
+                    nn.ConvTranspose2d(dim, dim // 2, 2, stride=2, padding=0)
+                ]
+                out_dim = dim // 2
+            elif scale == 1.0:
+                layers = []
+            elif scale == 0.5:
+                layers = [nn.MaxPool2d(kernel_size=2, stride=2, padding=0)]
+            else:
+                raise NotImplementedError(
+                    f'scale_factor={scale} is not supported yet.')
+            layers.extend([
+                ConvModule(
+                    out_dim,
+                    out_channels,
+                    1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False),
+                ConvModule(
+                    out_channels,
+                    out_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    inplace=False)
+            ])
+            layers = nn.Sequential(*layers)
+            self.add_module(f'sfp_{idx}', layers)
+            self.stages.append(layers)
+
+    def init_weights(self):
+        pass

    def forward(self, inputs):
        """Forward function."""
-        # build top-down path
-        features = [
-            top_down(inputs[0]) for _, top_down in enumerate(self.top_downs)
-        ]
-        assert len(features) == len(self.in_channels)
-
-        # build laterals
-        laterals = [
-            lateral_conv(features[i + self.start_level])
-            for i, lateral_conv in enumerate(self.lateral_convs)
-        ]
-        used_backbone_levels = len(laterals)
-
-        # build outputs
-        # part 1: from original levels
-        outs = [
-            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
-        ]
+        assert len(inputs) == 1
+        features = inputs[0]
+        outs = []
+
+        # part 1: build simple feature pyramid
+        for stage in self.stages:
+            outs.append(stage(features))
+
        # part 2: add extra levels
-        if self.num_outs > len(outs):
+        if self.num_outs > self.num_ins:
            # use max pool to get more levels on top of outputs
            # (e.g., Faster R-CNN, Mask R-CNN)
-            if not self.add_extra_convs:
-                for i in range(self.num_outs - used_backbone_levels):
-                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
-            # add conv layers on top of original feature maps (RetinaNet)
-            else:
-                if self.add_extra_convs == 'on_input':
-                    extra_source = inputs[self.backbone_end_level - 1]
-                elif self.add_extra_convs == 'on_lateral':
-                    extra_source = laterals[-1]
-                elif self.add_extra_convs == 'on_output':
-                    extra_source = outs[-1]
-                else:
-                    raise NotImplementedError
-                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
-                for i in range(used_backbone_levels + 1, self.num_outs):
-                    if self.relu_before_extra_convs:
-                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
-                    else:
-                        outs.append(self.fpn_convs[i](outs[-1]))
+            for i in range(self.num_outs - self.num_ins):
+                outs.append(F.max_pool2d(outs[-1], 1, stride=2))
        return tuple(outs)
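As a sanity check on the refactored neck, a minimal usage sketch (assuming the SFP class above): a single 768-channel, stride-16 ViT feature map goes in, and num_outs=5 pyramid levels with strides 4, 8, 16, 32 and 64 come out, all with 256 channels.

import torch

sfp = SFP(
    in_channels=768,
    out_channels=256,
    scale_factors=(4.0, 2.0, 1.0, 0.5),
    num_outs=5)
x = torch.randn(1, 768, 64, 64)  # 1024 x 1024 input / patch_size 16 = 64
outs = sfp([x])
print([tuple(o.shape) for o in outs])
# [(1, 256, 256, 256), (1, 256, 128, 128), (1, 256, 64, 64),
#  (1, 256, 32, 32), (1, 256, 16, 16)]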


@@ -253,11 +253,11 @@ class DetrPredictor(PredictorInterface):
            img,
            bboxes,
            labels=labels,
-            colors='green',
-            text_color='white',
-            font_size=20,
-            thickness=1,
-            font_scale=0.5,
+            colors='cyan',
+            text_color='cyan',
+            font_size=18,
+            thickness=2,
+            font_scale=0.0,
            show=show,
            out_file=out_file)


@@ -14,18 +14,27 @@ class ViTDetTest(unittest.TestCase):
    def test_vitdet(self):
        model = ViTDet(
            img_size=1024,
            patch_size=16,
            embed_dim=768,
            depth=12,
            num_heads=12,
+            drop_path_rate=0.1,
+            window_size=14,
            mlp_ratio=4,
            qkv_bias=True,
-            qk_scale=None,
-            drop_rate=0.,
-            attn_drop_rate=0.,
-            drop_path_rate=0.1,
-            use_abs_pos_emb=True,
-            aggregation='attn',
-        )
+            window_block_indexes=[
+                # 2, 5, 8, 11 for global attention
+                0,
+                1,
+                3,
+                4,
+                6,
+                7,
+                9,
+                10,
+            ],
+            residual_block_indexes=[],
+            use_rel_pos=True)
        model.init_weights()
        model.train()


@@ -155,7 +155,7 @@ class DetectorTest(unittest.TestCase):
decimal=1)
def test_vitdet_detector(self):
-        model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/vitdet_maskrcnn_export.pth'
+        model_path = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/detection/vitdet/vit_base/epoch_100_export.pth'
img = 'https://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/data/demo/demo.jpg'
out_file = './result.jpg'
vitdet = DetrPredictor(model_path)
@@ -167,63 +167,170 @@
self.assertIn('detection_classes', output)
self.assertIn('detection_masks', output)
self.assertIn('img_metas', output)
-        self.assertEqual(len(output['detection_boxes'][0]), 30)
-        self.assertEqual(len(output['detection_scores'][0]), 30)
-        self.assertEqual(len(output['detection_classes'][0]), 30)
+        self.assertEqual(len(output['detection_boxes'][0]), 33)
+        self.assertEqual(len(output['detection_scores'][0]), 33)
+        self.assertEqual(len(output['detection_classes'][0]), 33)
self.assertListEqual(
output['detection_classes'][0].tolist(),
np.array([
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-                2, 2, 2, 7, 7, 13, 13, 13, 56
+                2, 2, 2, 2, 2, 2, 7, 7, 13, 13, 13, 56
],
dtype=np.int32).tolist())
assert_array_almost_equal(
output['detection_scores'][0],
np.array([
-                0.99791867, 0.99665856, 0.99480623, 0.99060905, 0.9882515,
-                0.98319584, 0.9738879, 0.97290784, 0.9514897, 0.95104814,
-                0.9321701, 0.86165, 0.8228847, 0.7623552, 0.76129806,
-                0.6050861, 0.44348577, 0.3452973, 0.2895671, 0.22109479,
-                0.21265312, 0.17855245, 0.1205352, 0.08981906, 0.10596471,
-                0.05854294, 0.99749386, 0.9472857, 0.5945908, 0.09855112
+                0.9975854158401489, 0.9965696334838867, 0.9922919869422913,
+                0.9833580851554871, 0.983080267906189, 0.970454752445221,
+                0.9701289534568787, 0.9649872183799744, 0.9642795324325562,
+                0.9642238020896912, 0.9529680609703064, 0.9403366446495056,
+                0.9391788244247437, 0.8941807150840759, 0.8178097009658813,
+                0.8013413548469543, 0.6677654385566711, 0.3952914774417877,
+                0.33463895320892334, 0.32501447200775146, 0.27323535084724426,
+                0.20197080075740814, 0.15607696771621704, 0.1068163588643074,
+                0.10183875262737274, 0.09735643863677979, 0.06559795141220093,
+                0.08890066295862198, 0.076363705098629, 0.9954648613929749,
+                0.9212945699691772, 0.5224372148513794, 0.20555885136127472
],
dtype=np.float32),
decimal=2)
assert_array_almost_equal(
output['detection_boxes'][0],
-            np.array([[294.7058, 117.29371, 378.83713, 149.99928],
-                      [609.05444, 112.526474, 633.2971, 136.35175],
-                      [481.4165, 110.987335, 522.5531, 130.01529],
-                      [167.68184, 109.89049, 215.49057, 139.86987],
-                      [374.75082, 110.68697, 433.10028, 136.23654],
-                      [189.54971, 110.09322, 297.6167, 155.77412],
-                      [266.5185, 105.37718, 326.54385, 127.916374],
-                      [556.30225, 110.43166, 592.8248, 128.03764],
-                      [432.49252, 105.086464, 484.0512, 132.272],
-                      [0., 110.566444, 62.01249, 146.44017],
-                      [591.74664, 110.43527, 619.73816, 126.68549],
-                      [99.126854, 90.947975, 118.46699, 101.11096],
-                      [59.895264, 94.110054, 85.60521, 106.67633],
-                      [142.95819, 96.61966, 165.96964, 104.95929],
-                      [83.062515, 89.802605, 99.1546, 98.69074],
-                      [226.28802, 98.32568, 249.06772, 108.86408],
-                      [136.67789, 94.75706, 154.62924, 104.289536],
-                      [170.42459, 98.458694, 183.16309, 106.203156],
-                      [67.56731, 89.68286, 82.62955, 98.35645],
-                      [222.80092, 97.828445, 239.02655, 108.29377],
-                      [134.34427, 92.31653, 149.19615, 102.97457],
-                      [613.5186, 102.27066, 636.0434, 112.813644],
-                      [607.4787, 110.87984, 630.1123, 127.65646],
-                      [135.13664, 90.989876, 155.67192, 100.18036],
-                      [431.61505, 105.43844, 484.36508, 132.50078],
-                      [189.92722, 110.38832, 297.74353, 155.95557],
-                      [220.67035, 177.13489, 455.32092, 380.45712],
-                      [372.76584, 134.33807, 432.44357, 188.51534],
-                      [50.403812, 110.543495, 70.4368, 119.65186],
-                      [373.50272, 134.27258, 432.18475, 187.81824]]),
+            np.array([
+                [294.22674560546875, 116.6078109741211, 379.4328918457031, 150.14097595214844],
+                [482.6017761230469, 110.75955963134766, 522.8798828125, 129.71286010742188],
+                [167.06460571289062, 109.95974731445312, 212.83975219726562, 140.16102600097656],
+                [609.2930908203125, 113.13909149169922, 637.3115844726562, 136.4690704345703],
+                [191.185791015625, 111.1408920288086, 301.31689453125, 155.7731170654297],
+                [431.2244873046875, 106.19962310791016, 483.860595703125, 132.21627807617188],
+                [267.48358154296875, 105.5920639038086, 325.2832336425781, 127.11176300048828],
+                [591.2138671875, 110.29329681396484, 619.8524169921875, 126.1990966796875],
+                [0.0, 110.7026596069336, 61.487945556640625, 146.33018493652344],
+                [555.9155883789062, 110.03486633300781, 591.7050170898438, 127.06097412109375],
+                [60.24559783935547, 94.12760162353516, 85.63741302490234, 106.66705322265625],
+                [99.02665710449219, 90.53657531738281, 118.83953094482422, 101.18717956542969],
+                [396.30438232421875, 111.59194946289062, 431.559814453125, 133.96914672851562],
+                [83.81543731689453, 89.65665435791016, 99.9166259765625, 98.25627899169922],
+                [139.29647827148438, 96.68000793457031, 165.22410583496094, 105.60000610351562],
+                [67.27152252197266, 89.42798614501953, 83.25617980957031, 98.0460205078125],
+                [223.74176025390625, 98.68321990966797, 250.42506408691406, 109.32588958740234],
+                [136.7582244873047, 96.51412963867188, 152.51190185546875, 104.73160552978516],
+                [221.71812438964844, 97.86445617675781, 238.9705810546875, 106.96803283691406],
+                [135.06964111328125, 91.80916595458984, 155.24609375, 102.20686340332031],
+                [169.11180114746094, 97.53628540039062, 182.88504028320312, 105.95404815673828],
+                [133.8811798095703, 91.00375366210938, 145.35507202148438, 102.3780288696289],
+                [614.2507934570312, 102.19828796386719, 636.5692749023438, 112.59198760986328],
+                [35.94759750366211, 91.7213363647461, 70.38274383544922, 117.19855499267578],
+                [554.6401977539062, 115.18976593017578, 562.0255737304688, 127.4429931640625],
+                [39.07550811767578, 92.73261260986328, 85.36636352539062, 106.73953247070312],
+                [200.85513305664062, 93.00469970703125, 219.73086547851562, 107.99642181396484],
+                [0.0, 111.18904876708984, 61.7393684387207, 146.72547912597656],
+                [191.88568115234375, 111.09577178955078, 299.4097900390625, 155.14639282226562],
+                [221.06834411621094, 176.6427001953125, 458.3475341796875, 378.89300537109375],
+                [372.7131652832031, 135.51429748535156, 433.2494201660156, 188.0106658935547],
+                [52.19819641113281, 110.3646011352539, 70.95110321044922, 120.10567474365234],
+                [376.1671447753906, 133.6930694580078, 432.2721862792969, 187.99481201171875]]),
decimal=1)