diff --git a/configs/_base_/models/upernet_vit-b16_ln_mln.py b/configs/_base_/models/upernet_vit-b16_ln_mln.py new file mode 100644 index 000000000..573612e13 --- /dev/null +++ b/configs/_base_/models/upernet_vit-b16_ln_mln.py @@ -0,0 +1,58 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth', # noqa + backbone=dict( + type='VisionTransformer', + img_size=(512, 512), + patch_size=16, + in_channels=3, + embed_dims=768, + num_layers=12, + num_heads=12, + mlp_ratio=4, + out_indices=(2, 5, 8, 11), + qkv_bias=True, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + with_cls_token=True, + norm_cfg=dict(type='LN', eps=1e-6), + act_cfg=dict(type='GELU'), + norm_eval=False, + out_shape='NCHW', + interpolate_mode='bicubic'), + neck=dict( + type='MultiLevelNeck', + in_channels=[768, 768, 768, 768], + out_channels=768, + scales=[4, 2, 1, 0.5]), + decode_head=dict( + type='UPerHead', + in_channels=[768, 768, 768, 768], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=512, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=768, + in_index=3, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) # yapf: disable diff --git a/configs/vit/README.md b/configs/vit/README.md new file mode 100644 index 000000000..f0b0e1688 --- /dev/null +++ b/configs/vit/README.md @@ -0,0 +1,32 @@ +# Vision Transformer + +## Introduction + +<!-- [ALGORITHM] --> + +```latex +@article{dosoViTskiy2020, + title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, + author={DosoViTskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, + journal={arXiv preprint arXiv:2010.11929}, + year={2020} +} +``` + +## Results and models + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | +| ------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UPerNet | ViT-B + MLN | 512x512 | 80000 | 9.20 | 6.94 | 47.71 | 49.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_vit-b16_mln_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_80k_ade20k/upernet_vit-b16_mln_512x512_80k_ade20k-0403cee1.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_80k_ade20k/20210624_130547.log.json) | +| UPerNet | ViT-B + MLN | 512x512 | 160000 | 9.20 | 7.58 | 46.75 | 48.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_vit-b16_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_160k_ade20k/upernet_vit-b16_mln_512x512_160k_ade20k-852fa768.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_160k_ade20k/20210623_192432.log.json) | +| UPerNet | ViT-B + LN + MLN | 512x512 | 160000 | 9.21 | 6.82 | 47.73 | 49.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k/upernet_vit-b16_ln_mln_512x512_160k_ade20k-f444c077.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k/20210621_172828.log.json) | +| UPerNet | DeiT-S | 512x512 | 80000 | 4.68 | 29.85 | 42.96 | 43.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-s16_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_80k_ade20k/upernet_deit-s16_512x512_80k_ade20k-afc93ec2.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_80k_ade20k/20210624_095228.log.json) | +| UPerNet | DeiT-S | 512x512 | 160000 | 4.68 | 29.19 | 42.87 | 43.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-s16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_160k_ade20k/upernet_deit-s16_512x512_160k_ade20k-5110d916.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_160k_ade20k/20210621_160903.log.json) | +| UPerNet | DeiT-S + MLN | 512x512 | 160000 | 5.69 | 11.18 | 43.82 | 45.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_mln_512x512_160k_ade20k/upernet_deit-s16_mln_512x512_160k_ade20k-fb9a5dfb.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_mln_512x512_160k_ade20k/20210621_161021.log.json) | +| UPerNet | DeiT-S + LN + MLN | 512x512 | 160000 | 5.69 | 12.39 | 43.52 | 45.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k/upernet_deit-s16_ln_mln_512x512_160k_ade20k-c0cd652f.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k/20210621_161021.log.json) | +| UPerNet | DeiT-B | 512x512 | 80000 | 7.75 | 9.69 | 45.24 | 46.73 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-b16_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_80k_ade20k/upernet_deit-b16_512x512_80k_ade20k-1e090789.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_80k_ade20k/20210624_130529.log.json) | +| UPerNet | DeiT-B | 512x512 | 160000 | 7.75 | 10.39 | 45.36 | 47.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-b16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_160k_ade20k/upernet_deit-b16_512x512_160k_ade20k-828705d7.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_160k_ade20k/20210621_180100.log.json) | +| UPerNet | DeiT-B + MLN | 512x512 | 160000 | 9.21 | 7.78 | 45.46 | 47.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_mln_512x512_160k_ade20k/upernet_deit-b16_mln_512x512_160k_ade20k-4e1450f3.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_mln_512x512_160k_ade20k/20210621_191949.log.json) | +| UPerNet | DeiT-B + LN + MLN | 512x512 | 160000 | 9.21 | 7.75 | 45.37 | 47.23 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k/upernet_deit-b16_ln_mln_512x512_160k_ade20k-8a959c14.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k/20210623_153535.log.json) | diff --git a/configs/vit/upernet_deit-b16_512x512_160k_ade20k.py b/configs/vit/upernet_deit-b16_512x512_160k_ade20k.py new file mode 100644 index 000000000..6f17d7a64 --- /dev/null +++ b/configs/vit/upernet_deit-b16_512x512_160k_ade20k.py @@ -0,0 +1,6 @@ +_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' + +model = dict( + pretrained='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth', # noqa + backbone=dict(drop_path_rate=0.1), + neck=None) # yapf: disable diff --git a/configs/vit/upernet_deit-b16_512x512_80k_ade20k.py b/configs/vit/upernet_deit-b16_512x512_80k_ade20k.py new file mode 100644 index 000000000..7bff28a10 --- /dev/null +++ b/configs/vit/upernet_deit-b16_512x512_80k_ade20k.py @@ -0,0 +1,6 @@ +_base_ = './upernet_vit-b16_mln_512x512_80k_ade20k.py' + +model = dict( + pretrained='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth', # noqa + backbone=dict(drop_path_rate=0.1), + neck=None) # yapf: disable diff --git a/configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py new file mode 100644 index 000000000..f5b2411df --- /dev/null +++ b/configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py @@ -0,0 +1,5 @@ +_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' + +model = dict( + pretrained='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth', # noqa + backbone=dict(drop_path_rate=0.1, final_norm=True)) # yapf: disable diff --git a/configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py new file mode 100644 index 000000000..68efd4893 --- /dev/null +++ b/configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py @@ -0,0 +1,5 @@ +_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' + +model = dict( + pretrained='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth', # noqa + backbone=dict(drop_path_rate=0.1),) # yapf: disable diff --git a/configs/vit/upernet_deit-s16_512x512_160k_ade20k.py b/configs/vit/upernet_deit-s16_512x512_160k_ade20k.py new file mode 100644 index 000000000..cae6f466c --- /dev/null +++ b/configs/vit/upernet_deit-s16_512x512_160k_ade20k.py @@ -0,0 +1,8 @@ +_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' + +model = dict( + pretrained='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth', # noqa + backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1), + decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), + neck=None, + auxiliary_head=dict(num_classes=150, in_channels=384)) # yapf: disable diff --git a/configs/vit/upernet_deit-s16_512x512_80k_ade20k.py b/configs/vit/upernet_deit-s16_512x512_80k_ade20k.py new file mode 100644 index 000000000..b176abb79 --- /dev/null +++ b/configs/vit/upernet_deit-s16_512x512_80k_ade20k.py @@ -0,0 +1,8 @@ +_base_ = './upernet_vit-b16_mln_512x512_80k_ade20k.py' + +model = dict( + pretrained='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth', # noqa + backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1), + decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), + neck=None, + auxiliary_head=dict(num_classes=150, in_channels=384)) # yapf: disable diff --git a/configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py new file mode 100644 index 000000000..f328ca860 --- /dev/null +++ b/configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py @@ -0,0 +1,12 @@ +_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' + +model = dict( + pretrained='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth', # noqa + backbone=dict( + num_heads=6, + embed_dims=384, + drop_path_rate=0.1, + final_norm=True), + decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), + neck=dict(in_channels=[384, 384, 384, 384], out_channels=384), + auxiliary_head=dict(num_classes=150, in_channels=384)) # yapf: disable diff --git a/configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py new file mode 100644 index 000000000..a1e1c2a4e --- /dev/null +++ b/configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py @@ -0,0 +1,8 @@ +_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' + +model = dict( + pretrained='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth', # noqa + backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1), + decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), + neck=dict(in_channels=[384, 384, 384, 384], out_channels=384), + auxiliary_head=dict(num_classes=150, in_channels=384)) # yapf: disable diff --git a/configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py b/configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py new file mode 100644 index 000000000..f6f85378b --- /dev/null +++ b/configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/upernet_vit-b16_ln_mln.py', + '../_base_/datasets/ade20k.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] + +model = dict( + backbone=dict(drop_path_rate=0.1, final_norm=True), + decode_head=dict(num_classes=150), + auxiliary_head=dict(num_classes=150)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_embed': dict(decay_mult=0.), + 'cls_token': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +# By default, models are trained on 8 GPUs with 2 images per GPU +data = dict(samples_per_gpu=2) diff --git a/configs/vit/upernet_vit-b16_mln_512x512_160k_ade20k.py b/configs/vit/upernet_vit-b16_mln_512x512_160k_ade20k.py new file mode 100644 index 000000000..cc286f1fb --- /dev/null +++ b/configs/vit/upernet_vit-b16_mln_512x512_160k_ade20k.py @@ -0,0 +1,36 @@ +_base_ = [ + '../_base_/models/upernet_vit-b16_ln_mln.py', + '../_base_/datasets/ade20k.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] + +model = dict( + decode_head=dict(num_classes=150), auxiliary_head=dict(num_classes=150)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_embed': dict(decay_mult=0.), + 'cls_token': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +# By default, models are trained on 8 GPUs with 2 images per GPU +data = dict(samples_per_gpu=2) diff --git a/configs/vit/upernet_vit-b16_mln_512x512_80k_ade20k.py b/configs/vit/upernet_vit-b16_mln_512x512_80k_ade20k.py new file mode 100644 index 000000000..d80b0d9fd --- /dev/null +++ b/configs/vit/upernet_vit-b16_mln_512x512_80k_ade20k.py @@ -0,0 +1,36 @@ +_base_ = [ + '../_base_/models/upernet_vit-b16_ln_mln.py', + '../_base_/datasets/ade20k.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_80k.py' +] + +model = dict( + decode_head=dict(num_classes=150), auxiliary_head=dict(num_classes=150)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_embed': dict(decay_mult=0.), + 'cls_token': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +# By default, models are trained on 8 GPUs with 2 images per GPU +data = dict(samples_per_gpu=2) diff --git a/mmseg/models/necks/multilevel_neck.py b/mmseg/models/necks/multilevel_neck.py index 941b82992..eb32240bc 100644 --- a/mmseg/models/necks/multilevel_neck.py +++ b/mmseg/models/necks/multilevel_neck.py @@ -1,6 +1,6 @@ import torch.nn as nn import torch.nn.functional as F -from mmcv.cnn import ConvModule +from mmcv.cnn import ConvModule, xavier_init from ..builder import NECKS @@ -13,7 +13,8 @@ class MultiLevelNeck(nn.Module): Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale). - scales (List[int]): Scale factors for each input feature map. + scales (List[float]): Scale factors for each input feature map. + Default: [0.5, 1, 2, 4] norm_cfg (dict): Config dict for normalization layer. Default: None. act_cfg (dict): Config dict for activation layer in ConvModule. Default: None. @@ -52,6 +53,12 @@ class MultiLevelNeck(nn.Module): norm_cfg=norm_cfg, act_cfg=act_cfg)) + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + def forward(self, inputs): assert len(inputs) == len(self.in_channels) inputs = [ diff --git a/tests/test_models/test_necks/test_multilevel_neck.py b/tests/test_models/test_necks/test_multilevel_neck.py index 8fb2fc928..c5a567d98 100644 --- a/tests/test_models/test_necks/test_multilevel_neck.py +++ b/tests/test_models/test_necks/test_multilevel_neck.py @@ -5,6 +5,9 @@ from mmseg.models import MultiLevelNeck def test_multilevel_neck(): + # Test init_weights + MultiLevelNeck([266], 256).init_weights() + # Test multi feature maps in_channels = [256, 512, 1024, 2048] inputs = [torch.randn(1, c, 14, 14) for i, c in enumerate(in_channels)]