diff --git a/configs/_base_/datasets/cityscapes_768x768.py b/configs/_base_/datasets/cityscapes_768x768.py
new file mode 100644
index 000000000..fde9d7c7d
--- /dev/null
+++ b/configs/_base_/datasets/cityscapes_768x768.py
@@ -0,0 +1,35 @@
+_base_ = './cityscapes.py'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (768, 768)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations'),
+    dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2049, 1025),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/configs/setr/README.md b/configs/setr/README.md
index 3017a8895..9935d16e4 100644
--- a/configs/setr/README.md
+++ b/configs/setr/README.md
@@ -45,3 +45,11 @@ This head has two version head.
 | SETR-PUP | ViT-L | 512x512 | 16 | 160000 | 19.54 | 4.50 | 48.24 | 49.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_pup_512x512_160k_b16_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343-7e0ce826.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343.log.json) |
 | SETR-MLA | ViT-L | 512x512 | 8 | 160000 | 10.96 | - | 47.34 | 49.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_mla_512x512_160k_b8_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118-c6d21df0.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118.log.json) |
 | SETR-MLA | ViT-L | 512x512 | 16 | 160000 | 17.30 | 5.25 | 47.54 | 49.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_mla_512x512_160k_b16_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057-f9741de7.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057.log.json) |
+
+### Cityscapes
+
+| Method | Backbone | Crop Size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
+| ------ | -------- | --------- | ---------- | ------- | -------- | -------------- | ----- | ------------: | ------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| SETR-Naive | ViT-L | 768x768 | 8 | 80000 | 24.06 | 0.39 | 78.10 | 80.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505-20728e80.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505.log.json) |
+| SETR-PUP | ViT-L | 768x768 | 8 | 80000 | 27.96 | 0.37 | 79.21 | 81.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115.log.json) |
+| SETR-MLA | ViT-L | 768x768 | 8 | 80000 | 24.10 | 0.41 | 77.00 | 79.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003-7f8dccbe.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003.log.json) |
diff --git a/configs/setr/setr.yml b/configs/setr/setr.yml
index ca142362b..c438d7ee3 100644
--- a/configs/setr/setr.yml
+++ b/configs/setr/setr.yml
@@ -3,6 +3,7 @@ Collections:
   Metadata:
     Training Data:
     - ADE20K
+    - Cityscapes
   Paper:
     URL: https://arxiv.org/abs/2012.15840
     Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective
@@ -95,3 +96,69 @@
       mIoU(ms+flip): 49.37
   Config: configs/setr/setr_mla_512x512_160k_b16_ade20k.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057-f9741de7.pth
+- Name: setr_vit-large_naive_8x1_768x768_80k_cityscapes
+  In Collection: setr
+  Metadata:
+    backbone: ViT-L
+    crop size: (768,768)
+    lr schd: 80000
+    inference time (ms/im):
+    - value: 2564.1
+      hardware: V100
+      backend: PyTorch
+      batch size: 1
+      mode: FP32
+      resolution: (768,768)
+    Training Memory (GB): 24.06
+  Results:
+  - Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 78.1
+      mIoU(ms+flip): 80.22
+  Config: configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505-20728e80.pth
+- Name: setr_vit-large_pup_8x1_768x768_80k_cityscapes
+  In Collection: setr
+  Metadata:
+    backbone: ViT-L
+    crop size: (768,768)
+    lr schd: 80000
+    inference time (ms/im):
+    - value: 2702.7
+      hardware: V100
+      backend: PyTorch
+      batch size: 1
+      mode: FP32
+      resolution: (768,768)
+    Training Memory (GB): 27.96
+  Results:
+  - Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 79.21
+      mIoU(ms+flip): 81.02
+  Config: configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth
+- Name: setr_vit-large_mla_8x1_768x768_80k_cityscapes
+  In Collection: setr
+  Metadata:
+    backbone: ViT-L
+    crop size: (768,768)
+    lr schd: 80000
+    inference time (ms/im):
+    - value: 2439.02
+      hardware: V100
+      backend: PyTorch
+      batch size: 1
+      mode: FP32
+      resolution: (768,768)
+    Training Memory (GB): 24.1
+  Results:
+  - Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 77.0
+      mIoU(ms+flip): 79.59
+  Config: configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003-7f8dccbe.pth
diff --git a/configs/setr/setr_mla_512x512_160k_b8_ade20k.py b/configs/setr/setr_mla_512x512_160k_b8_ade20k.py
index 2958a6df6..6977dbacc 100644
--- a/configs/setr/setr_mla_512x512_160k_b8_ade20k.py
+++ b/configs/setr/setr_mla_512x512_160k_b8_ade20k.py
@@ -4,8 +4,11 @@ _base_ = [
 ]
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
-    pretrained='pretrain/vit_large_patch16_384.pth',
-    backbone=dict(img_size=(512, 512), drop_rate=0.),
+    pretrained=None,
+    backbone=dict(
+        img_size=(512, 512),
+        drop_rate=0.,
+        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
     decode_head=dict(num_classes=150),
     auxiliary_head=[
         dict(
diff --git a/configs/setr/setr_naive_512x512_160k_b16_ade20k.py b/configs/setr/setr_naive_512x512_160k_b16_ade20k.py
index 2abf9df77..3b1f9d7d3 100644
--- a/configs/setr/setr_naive_512x512_160k_b16_ade20k.py
+++ b/configs/setr/setr_naive_512x512_160k_b16_ade20k.py
@@ -4,8 +4,11 @@ _base_ = [
 ]
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
-    pretrained='pretrain/vit_large_patch16_384.pth',
-    backbone=dict(img_size=(512, 512), drop_rate=0.),
+    pretrained=None,
+    backbone=dict(
+        img_size=(512, 512),
+        drop_rate=0.,
+        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
     decode_head=dict(num_classes=150),
     auxiliary_head=[
         dict(
diff --git a/configs/setr/setr_pup_512x512_160k_b16_ade20k.py b/configs/setr/setr_pup_512x512_160k_b16_ade20k.py
index da3828364..68c3a2a4e 100644
--- a/configs/setr/setr_pup_512x512_160k_b16_ade20k.py
+++ b/configs/setr/setr_pup_512x512_160k_b16_ade20k.py
@@ -4,8 +4,11 @@ _base_ = [
 ]
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
-    pretrained='pretrain/vit_large_patch16_384.pth',
-    backbone=dict(img_size=(512, 512), drop_rate=0.),
+    pretrained=None,
+    backbone=dict(
+        img_size=(512, 512),
+        drop_rate=0.,
+        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
     decode_head=dict(num_classes=150),
     auxiliary_head=[
         dict(
diff --git a/configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py b/configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py
new file mode 100644
index 000000000..3c2fc3af7
--- /dev/null
+++ b/configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py
@@ -0,0 +1,16 @@
+_base_ = [
+    '../_base_/models/setr_mla.py', '../_base_/datasets/cityscapes_768x768.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
+]
+model = dict(
+    pretrained=None,
+    backbone=dict(
+        drop_rate=0,
+        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
+    test_cfg=dict(mode='slide', crop_size=(768, 768), stride=(512, 512)))
+
+optimizer = dict(
+    lr=0.002,
+    weight_decay=0.0,
+    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.)}))
+data = dict(samples_per_gpu=1)
diff --git a/configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py b/configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py
new file mode 100644
index 000000000..181f444ef
--- /dev/null
+++ b/configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py
@@ -0,0 +1,17 @@
+_base_ = [
+    '../_base_/models/setr_naive.py',
+    '../_base_/datasets/cityscapes_768x768.py', '../_base_/default_runtime.py',
+    '../_base_/schedules/schedule_80k.py'
+]
+model = dict(
+    pretrained=None,
+    backbone=dict(
+        drop_rate=0.,
+        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
+    test_cfg=dict(mode='slide', crop_size=(768, 768), stride=(512, 512)))
+
+optimizer = dict(
+    weight_decay=0.0,
+    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.)}))
+
+data = dict(samples_per_gpu=1)
diff --git a/configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py b/configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py
new file mode 100644
index 000000000..817a0296e
--- /dev/null
+++ b/configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py
@@ -0,0 +1,63 @@
+_base_ = [
+    '../_base_/models/setr_pup.py', '../_base_/datasets/cityscapes_768x768.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
+]
+
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+crop_size = (768, 768)
+model = dict(
+    pretrained=None,
+    backbone=dict(
+        drop_rate=0.,
+        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
+    auxiliary_head=[
+        dict(
+            type='SETRUPHead',
+            in_channels=1024,
+            channels=256,
+            in_index=0,
+            num_classes=19,
+            dropout_ratio=0,
+            norm_cfg=norm_cfg,
+            num_convs=2,
+            up_scale=4,
+            kernel_size=3,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+        dict(
+            type='SETRUPHead',
+            in_channels=1024,
+            channels=256,
+            in_index=1,
+            num_classes=19,
+            dropout_ratio=0,
+            norm_cfg=norm_cfg,
+            num_convs=2,
+            up_scale=4,
+            kernel_size=3,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+        dict(
+            type='SETRUPHead',
+            in_channels=1024,
+            channels=256,
+            in_index=2,
+            num_classes=19,
+            dropout_ratio=0,
+            norm_cfg=norm_cfg,
+            num_convs=2,
+            up_scale=4,
+            kernel_size=3,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4))
+    ],
+    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(512, 512)))
+
+optimizer = dict(
+    weight_decay=0.0,
+    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.)}))
+
+data = dict(samples_per_gpu=1)
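For reviewers who want to exercise the new Cityscapes configs, below is a minimal sketch (not part of the patch) of single-image inference with the mmsegmentation v0.x Python API. It assumes the SETR-PUP checkpoint linked in the README table has been downloaded locally, and `demo/demo.png` stands in for any local test image.

```python
# Minimal sketch, assuming mmsegmentation v0.x with mmcv-full installed and
# the SETR-PUP checkpoint from the README table downloaded locally.
from mmseg.apis import inference_segmentor, init_segmentor

config_file = 'configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py'
checkpoint_file = 'setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth'

# Build SETR-PUP from the new config and load the released weights.
model = init_segmentor(config_file, checkpoint_file, device='cuda:0')

# The config's test_cfg uses sliding-window inference with a 768x768 crop and
# 512x512 stride, so full-resolution Cityscapes frames are handled.
result = inference_segmentor(model, 'demo/demo.png')  # placeholder image path
print(result[0].shape)  # (H, W) array of predicted class indices
```

Training follows the usual `tools/dist_train.sh` entry point with 8 GPUs; `samples_per_gpu=1` in these configs gives the 8x1 batch layout encoded in the config names.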