[Feature] add setr cityscapes benchmark (#1087)

* [Feature] add setr cityscapes benchmark

* change pretrain

* Update configs/_base_/datasets/cityscapes_768x768.py

Co-authored-by: Junjun2016 <hejunjun@sjtu.edu.cn>

* remove redundant keys

* remove redundant keys

* fix lint error

* update readme

* update pretrain

Co-authored-by: Junjun2016 <hejunjun@sjtu.edu.cn>
pull/1801/head
Rockey 2021-12-03 16:18:50 +08:00 committed by GitHub
parent 07cd6c98e0
commit 6a3c31ae3f
9 changed files with 221 additions and 6 deletions

View File

@ -0,0 +1,35 @@
# Cityscapes dataset settings for 768x768 training crops. Everything that is
# not overridden below is inherited from the base Cityscapes dataset config.
_base_ = './cityscapes.py'

# Normalization constants shared by the train and test `Normalize` steps.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (768, 768)

# Training: random rescale -> random crop -> flip / photometric jitter ->
# normalize -> pad to the crop size -> pack tensors.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    # Images are padded with 0 and label maps with 255.
    # NOTE(review): 255 is presumably the ignore label -- confirm.
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]

# Testing: one scale, no flipping. The same pipeline is used for both the
# `val` and `test` splits below.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2049, 1025),
        # NOTE(review): the line below looks like the ready-made multi-scale
        # setting; it is intentionally left disabled -- confirm before use.
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]

# Only the pipelines are overridden; the remaining dataset settings are left
# to the base config.
data = dict(
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))

View File

@ -45,3 +45,11 @@ This head has two version head.
| SETR-PUP | ViT-L | 512x512 | 16 | 160000 | 19.54 | 4.50 | 48.24 | 49.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_pup_512x512_160k_b16_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343-7e0ce826.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343.log.json) |
| SETR-MLA | ViT-L | 512x512 | 8 | 160000 | 10.96 | - | 47.34 | 49.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_mla_512x512_160k_b8_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118-c6d21df0.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118.log.json) |
| SETR-MLA | ViT-L | 512x512 | 16 | 160000 | 17.30 | 5.25 | 47.54 | 49.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_mla_512x512_160k_b16_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057-f9741de7.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057.log.json) |
### Cityscapes
| Method | Backbone | Crop Size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
| ------ | -------- | --------- | ---------- | ------- | -------- | -------------- | ----- | ------------: | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| SETR-Naive | ViT-L | 768x768 | 8 | 80000 | 24.06 | 0.39 | 78.10 | 80.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505-20728e80.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505.log.json) |
| SETR-PUP | ViT-L | 768x768 | 8 | 80000 | 27.96 | 0.37 | 79.21 | 81.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115.log.json) |
| SETR-MLA | ViT-L | 768x768 | 8 | 80000 | 24.10 | 0.41 | 77.00 | 79.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003-7f8dccbe.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003.log.json) |

View File

@ -3,6 +3,7 @@ Collections:
Metadata:
Training Data:
- ADE20K
- Cityscapes
Paper:
URL: https://arxiv.org/abs/2012.15840
Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective
@ -95,3 +96,69 @@ Models:
mIoU(ms+flip): 49.37
Config: configs/setr/setr_mla_512x512_160k_b16_ade20k.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057-f9741de7.pth
- Name: setr_vit-large_naive_8x1_768x768_80k_cityscapes
In Collection: setr
Metadata:
backbone: ViT-L
crop size: (768,768)
lr schd: 80000
inference time (ms/im):
- value: 2564.1
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (768,768)
Training Memory (GB): 24.06
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 78.1
mIoU(ms+flip): 80.22
Config: configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505-20728e80.pth
- Name: setr_vit-large_pup_8x1_768x768_80k_cityscapes
In Collection: setr
Metadata:
backbone: ViT-L
crop size: (768,768)
lr schd: 80000
inference time (ms/im):
- value: 2702.7
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (768,768)
Training Memory (GB): 27.96
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 79.21
mIoU(ms+flip): 81.02
Config: configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth
- Name: setr_vit-large_mla_8x1_768x768_80k_cityscapes
In Collection: setr
Metadata:
backbone: ViT-L
crop size: (768,768)
lr schd: 80000
inference time (ms/im):
- value: 2439.02
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (768,768)
Training Memory (GB): 24.1
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 77.0
mIoU(ms+flip): 79.59
Config: configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003-7f8dccbe.pth

View File

@ -4,8 +4,11 @@ _base_ = [
]
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
pretrained='pretrain/vit_large_patch16_384.pth',
backbone=dict(img_size=(512, 512), drop_rate=0.),
pretrained=None,
backbone=dict(
img_size=(512, 512),
drop_rate=0.,
init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
decode_head=dict(num_classes=150),
auxiliary_head=[
dict(

View File

@ -4,8 +4,11 @@ _base_ = [
]
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
pretrained='pretrain/vit_large_patch16_384.pth',
backbone=dict(img_size=(512, 512), drop_rate=0.),
pretrained=None,
backbone=dict(
img_size=(512, 512),
drop_rate=0.,
init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
decode_head=dict(num_classes=150),
auxiliary_head=[
dict(

View File

@ -4,8 +4,11 @@ _base_ = [
]
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
pretrained='pretrain/vit_large_patch16_384.pth',
backbone=dict(img_size=(512, 512), drop_rate=0.),
pretrained=None,
backbone=dict(
img_size=(512, 512),
drop_rate=0.,
init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
decode_head=dict(num_classes=150),
auxiliary_head=[
dict(

View File

@ -0,0 +1,16 @@
# SETR-MLA on Cityscapes with 768x768 crops and the 80k-iteration schedule.
# Model, dataset, runtime and schedule defaults all come from the base files.
_base_ = [
    '../_base_/models/setr_mla.py', '../_base_/datasets/cityscapes_768x768.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
]
model = dict(
    # No model-level checkpoint: the backbone is initialised through its own
    # `init_cfg` from the `mmcls://vit_large_p16` weights instead.
    pretrained=None,
    backbone=dict(
        # Float literal, matching the sibling naive / PUP Cityscapes configs.
        drop_rate=0.,
        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
    # Sliding-window inference: 768x768 windows moved with a 512-pixel stride.
    test_cfg=dict(mode='slide', crop_size=(768, 768), stride=(512, 512)))
optimizer = dict(
    # The learning rate is set explicitly here rather than left to the base
    # schedule.
    lr=0.002,
    weight_decay=0.0,
    # Entries matched by the 'head' key get a 10x learning-rate multiplier.
    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.)}))
# One sample per GPU.
data = dict(samples_per_gpu=1)

View File

@ -0,0 +1,17 @@
# SETR-Naive on Cityscapes with 768x768 crops and the 80k-iteration schedule.
# Model, dataset, runtime and schedule defaults all come from the base files.
_base_ = [
    '../_base_/models/setr_naive.py',
    '../_base_/datasets/cityscapes_768x768.py', '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_80k.py'
]
model = dict(
    # No model-level checkpoint: the backbone is initialised through its own
    # `init_cfg` from the `mmcls://vit_large_p16` weights instead.
    pretrained=None,
    backbone=dict(
        drop_rate=0.,
        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
    # Sliding-window inference: 768x768 windows moved with a 512-pixel stride.
    test_cfg=dict(mode='slide', crop_size=(768, 768), stride=(512, 512)))
optimizer = dict(
    weight_decay=0.0,
    # Entries matched by the 'head' key get a 10x learning-rate multiplier.
    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.)}))
# One sample per GPU.
data = dict(samples_per_gpu=1)

View File

@ -0,0 +1,63 @@
# SETR-PUP on Cityscapes with 768x768 crops and the 80k-iteration schedule.
# Model, dataset, runtime and schedule defaults all come from the base files.
_base_ = [
    '../_base_/models/setr_pup.py', '../_base_/datasets/cityscapes_768x768.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
]
# Normalization layer config reused by every auxiliary head below.
norm_cfg = dict(type='SyncBN', requires_grad=True)
# Reused as the sliding-window size in `test_cfg`.
crop_size = (768, 768)
model = dict(
    # No model-level checkpoint: the backbone is initialised through its own
    # `init_cfg` from the `mmcls://vit_large_p16` weights instead.
    pretrained=None,
    backbone=dict(
        drop_rate=0.,
        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
    # Three auxiliary heads that are identical except for `in_index`
    # (0, 1, 2); each contributes a cross-entropy loss weighted by 0.4.
    auxiliary_head=[
        dict(
            type='SETRUPHead',
            in_channels=1024,
            channels=256,
            in_index=0,
            num_classes=19,
            dropout_ratio=0,
            norm_cfg=norm_cfg,
            num_convs=2,
            up_scale=4,
            kernel_size=3,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
        dict(
            type='SETRUPHead',
            in_channels=1024,
            channels=256,
            in_index=1,
            num_classes=19,
            dropout_ratio=0,
            norm_cfg=norm_cfg,
            num_convs=2,
            up_scale=4,
            kernel_size=3,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
        dict(
            type='SETRUPHead',
            in_channels=1024,
            channels=256,
            in_index=2,
            num_classes=19,
            dropout_ratio=0,
            norm_cfg=norm_cfg,
            num_convs=2,
            up_scale=4,
            kernel_size=3,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4))
    ],
    # Sliding-window inference: crop_size windows moved with a 512-pixel
    # stride.
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(512, 512)))
optimizer = dict(
    weight_decay=0.0,
    # Entries matched by the 'head' key get a 10x learning-rate multiplier.
    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.)}))
# One sample per GPU.
data = dict(samples_per_gpu=1)