[Feature] Add SETR Cityscapes benchmark (#1087)

* [Feature] add SETR Cityscapes benchmark
* change pretrain
* Update configs/_base_/datasets/cityscapes_768x768.py
  Co-authored-by: Junjun2016 <hejunjun@sjtu.edu.cn>
* remove redundant keys
* remove redundant keys
* fix lint error
* update readme
* update pretrain

Co-authored-by: Junjun2016 <hejunjun@sjtu.edu.cn>
parent 07cd6c98e0
commit 6a3c31ae3f
configs/_base_/datasets/cityscapes_768x768.py (new file)
@@ -0,0 +1,35 @@
_base_ = './cityscapes.py'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (768, 768)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2049, 1025),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
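The new base dataset config only swaps in 768x768 crops and a (2049, 1025) test scale on top of the existing `cityscapes.py`. A minimal way to check what it resolves to, assuming an mmsegmentation 0.x checkout with mmcv (pre-2.0) installed; this snippet is a sketch, not part of the change:

from mmcv import Config

# Load the new base dataset config; _base_ = './cityscapes.py' is merged in.
cfg = Config.fromfile('configs/_base_/datasets/cityscapes_768x768.py')
print(cfg.crop_size)                            # (768, 768)
print([t['type'] for t in cfg.train_pipeline])  # LoadImageFromFile ... Collect
print(cfg.data.train.pipeline[3]['crop_size'])  # RandomCrop uses the 768x768 crop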
configs/setr/README.md
@@ -45,3 +45,11 @@ This head has two version head.
| SETR-PUP | ViT-L | 512x512 | 16 | 160000 | 19.54 | 4.50 | 48.24 | 49.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_pup_512x512_160k_b16_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343-7e0ce826.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343.log.json) |
| SETR-MLA | ViT-L | 512x512 | 8 | 160000 | 10.96 | - | 47.34 | 49.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_mla_512x512_160k_b8_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118-c6d21df0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118.log.json) |
| SETR-MLA | ViT-L | 512x512 | 16 | 160000 | 17.30 | 5.25 | 47.54 | 49.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_mla_512x512_160k_b16_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057-f9741de7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057.log.json) |

### Cityscapes

| Method | Backbone | Crop Size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
| ------ | -------- | --------- | ---------- | ------- | -------- | -------------- | ---- | ------------: | ------ | -------- |
| SETR-Naive | ViT-L | 768x768 | 8 | 80000 | 24.06 | 0.39 | 78.10 | 80.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505-20728e80.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505.log.json) |
| SETR-PUP | ViT-L | 768x768 | 8 | 80000 | 27.96 | 0.37 | 79.21 | 81.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115.log.json) |
| SETR-MLA | ViT-L | 768x768 | 8 | 80000 | 24.10 | 0.41 | 77.00 | 79.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003-7f8dccbe.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003.log.json) |
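Any checkpoint in the table can be used directly for inference through mmsegmentation 0.x's high-level API; `init_segmentor` accepts a URL as the checkpoint argument. A short sketch (the demo image path is only an example, not part of this change):

from mmseg.apis import inference_segmentor, init_segmentor

config_file = 'configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py'
checkpoint = ('https://download.openmmlab.com/mmsegmentation/v0.5/setr/'
              'setr_pup_vit-large_8x1_768x768_80k_cityscapes/'
              'setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth')

# Builds the model, fetches the weights and runs sliding-window inference.
model = init_segmentor(config_file, checkpoint, device='cuda:0')
result = inference_segmentor(model, 'demo/demo.png')  # list with one HxW array of class ids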
configs/setr/setr.yml
@@ -3,6 +3,7 @@ Collections:
  Metadata:
    Training Data:
    - ADE20K
    - Cityscapes
  Paper:
    URL: https://arxiv.org/abs/2012.15840
    Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective
@@ -95,3 +96,69 @@ Models:
      mIoU(ms+flip): 49.37
  Config: configs/setr/setr_mla_512x512_160k_b16_ade20k.py
  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057-f9741de7.pth
- Name: setr_vit-large_naive_8x1_768x768_80k_cityscapes
  In Collection: setr
  Metadata:
    backbone: ViT-L
    crop size: (768,768)
    lr schd: 80000
    inference time (ms/im):
    - value: 2564.1
      hardware: V100
      backend: PyTorch
      batch size: 1
      mode: FP32
      resolution: (768,768)
    Training Memory (GB): 24.06
  Results:
  - Task: Semantic Segmentation
    Dataset: Cityscapes
    Metrics:
      mIoU: 78.1
      mIoU(ms+flip): 80.22
  Config: configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py
  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505-20728e80.pth
- Name: setr_vit-large_pup_8x1_768x768_80k_cityscapes
  In Collection: setr
  Metadata:
    backbone: ViT-L
    crop size: (768,768)
    lr schd: 80000
    inference time (ms/im):
    - value: 2702.7
      hardware: V100
      backend: PyTorch
      batch size: 1
      mode: FP32
      resolution: (768,768)
    Training Memory (GB): 27.96
  Results:
  - Task: Semantic Segmentation
    Dataset: Cityscapes
    Metrics:
      mIoU: 79.21
      mIoU(ms+flip): 81.02
  Config: configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py
  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth
- Name: setr_vit-large_mla_8x1_768x768_80k_cityscapes
  In Collection: setr
  Metadata:
    backbone: ViT-L
    crop size: (768,768)
    lr schd: 80000
    inference time (ms/im):
    - value: 2439.02
      hardware: V100
      backend: PyTorch
      batch size: 1
      mode: FP32
      resolution: (768,768)
    Training Memory (GB): 24.1
  Results:
  - Task: Semantic Segmentation
    Dataset: Cityscapes
    Metrics:
      mIoU: 77.0
      mIoU(ms+flip): 79.59
  Config: configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py
  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003-7f8dccbe.pth
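The `inference time (ms/im)` values recorded above are simply the README's fps numbers inverted. A quick check of the three new Cityscapes entries (not part of the change):

# 1000 / fps reproduces the ms/im values in the metadata above.
for fps in (0.39, 0.37, 0.41):     # SETR-Naive, SETR-PUP, SETR-MLA on Cityscapes
    print(round(1000 / fps, 2))    # 2564.1, 2702.7, 2439.02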
ADE20K SETR config (1 of 3; the same change is applied in each):
@@ -4,8 +4,11 @@ _base_ = [
 ]
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
-    pretrained='pretrain/vit_large_patch16_384.pth',
-    backbone=dict(img_size=(512, 512), drop_rate=0.),
+    pretrained=None,
+    backbone=dict(
+        img_size=(512, 512),
+        drop_rate=0.,
+        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
     decode_head=dict(num_classes=150),
     auxiliary_head=[
         dict(
ADE20K SETR config (2 of 3):
@@ -4,8 +4,11 @@ _base_ = [
 ]
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
-    pretrained='pretrain/vit_large_patch16_384.pth',
-    backbone=dict(img_size=(512, 512), drop_rate=0.),
+    pretrained=None,
+    backbone=dict(
+        img_size=(512, 512),
+        drop_rate=0.,
+        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
     decode_head=dict(num_classes=150),
     auxiliary_head=[
         dict(
ADE20K SETR config (3 of 3):
@@ -4,8 +4,11 @@ _base_ = [
 ]
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
-    pretrained='pretrain/vit_large_patch16_384.pth',
-    backbone=dict(img_size=(512, 512), drop_rate=0.),
+    pretrained=None,
+    backbone=dict(
+        img_size=(512, 512),
+        drop_rate=0.,
+        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
     decode_head=dict(num_classes=150),
     auxiliary_head=[
         dict(
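Each of the three ADE20K configs drops the local `pretrain/vit_large_patch16_384.pth` file in favour of a backbone `init_cfg` that pulls the ViT-L weights through the `mmcls://` model-zoo prefix, so the pretrained weights no longer need to be downloaded by hand. A sketch of how to confirm the merged result, assuming an mmsegmentation 0.x checkout (the filename below is one of the three, taken from the README table):

from mmcv import Config

cfg = Config.fromfile('configs/setr/setr_naive_512x512_160k_b16_ade20k.py')
assert cfg.model.pretrained is None        # the old local-path key is gone
print(cfg.model.backbone.init_cfg)
# {'type': 'Pretrained', 'checkpoint': 'mmcls://vit_large_p16'}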
configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py (new file)
@@ -0,0 +1,16 @@
_base_ = [
    '../_base_/models/setr_mla.py', '../_base_/datasets/cityscapes_768x768.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
]
model = dict(
    pretrained=None,
    backbone=dict(
        drop_rate=0,
        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
    test_cfg=dict(mode='slide', crop_size=(768, 768), stride=(512, 512)))

optimizer = dict(
    lr=0.002,
    weight_decay=0.0,
    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.)}))
data = dict(samples_per_gpu=1)
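All three Cityscapes configs evaluate with `mode='slide'`, a 768x768 window and a 512-pixel stride. The following sketch illustrates the usual cover-then-clamp windowing arithmetic over a native 1024x2048 Cityscapes frame; it is an illustration, not the mmseg implementation:

import math

def slide_starts(size, crop=768, stride=512):
    """Window origins along one axis: step by stride, clamp the last window."""
    n = max(math.ceil((size - crop) / stride) + 1, 1)
    return [min(i * stride, size - crop) for i in range(n)]

print(slide_starts(1024))  # [0, 256]              -> 2 window rows
print(slide_starts(2048))  # [0, 512, 1024, 1280]  -> 4 window columns, 8 crops per image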
configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py (new file)
@@ -0,0 +1,17 @@
_base_ = [
    '../_base_/models/setr_naive.py',
    '../_base_/datasets/cityscapes_768x768.py', '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_80k.py'
]
model = dict(
    pretrained=None,
    backbone=dict(
        drop_rate=0.,
        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
    test_cfg=dict(mode='slide', crop_size=(768, 768), stride=(512, 512)))

optimizer = dict(
    weight_decay=0.0,
    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.)}))

data = dict(samples_per_gpu=1)
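The `paramwise_cfg` above gives every parameter whose name matches the `'head'` key ten times the base learning rate, so the freshly initialised decode/auxiliary heads learn faster than the pretrained ViT-L backbone. A rough sketch of that matching rule (mmcv's optimizer constructor does the real work; this is only an illustration, and 0.01 is just an example base lr):

def param_lr(param_name, base_lr, custom_keys={'head': dict(lr_mult=10.)}):
    # Parameters whose name contains a custom key get base_lr * lr_mult.
    for key, opts in custom_keys.items():
        if key in param_name:
            return base_lr * opts.get('lr_mult', 1.0)
    return base_lr

print(param_lr('decode_head.conv_seg.weight', 0.01))        # 0.1
print(param_lr('backbone.layers.0.attn.qkv.weight', 0.01))  # 0.01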
configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py (new file)
@@ -0,0 +1,63 @@
_base_ = [
    '../_base_/models/setr_pup.py', '../_base_/datasets/cityscapes_768x768.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py'
]

norm_cfg = dict(type='SyncBN', requires_grad=True)
crop_size = (768, 768)
model = dict(
    pretrained=None,
    backbone=dict(
        drop_rate=0.,
        init_cfg=dict(type='Pretrained', checkpoint='mmcls://vit_large_p16')),
    auxiliary_head=[
        dict(
            type='SETRUPHead',
            in_channels=1024,
            channels=256,
            in_index=0,
            num_classes=19,
            dropout_ratio=0,
            norm_cfg=norm_cfg,
            num_convs=2,
            up_scale=4,
            kernel_size=3,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
        dict(
            type='SETRUPHead',
            in_channels=1024,
            channels=256,
            in_index=1,
            num_classes=19,
            dropout_ratio=0,
            norm_cfg=norm_cfg,
            num_convs=2,
            up_scale=4,
            kernel_size=3,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
        dict(
            type='SETRUPHead',
            in_channels=1024,
            channels=256,
            in_index=2,
            num_classes=19,
            dropout_ratio=0,
            norm_cfg=norm_cfg,
            num_convs=2,
            up_scale=4,
            kernel_size=3,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4))
    ],
    test_cfg=dict(mode='slide', crop_size=crop_size, stride=(512, 512)))

optimizer = dict(
    weight_decay=0.0,
    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.)}))

data = dict(samples_per_gpu=1)
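As a final sanity check, the PUP config can be built end to end with mmsegmentation 0.x's model factory. This is a sketch, not part of the change; instantiating ViT-L this way needs a few GB of RAM:

from mmcv import Config
from mmseg.models import build_segmentor

cfg = Config.fromfile(
    'configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py')
model = build_segmentor(cfg.model)

print(type(model).__name__)        # EncoderDecoder
print(len(model.auxiliary_head))   # 3 -- the three SETRUPHeads defined above
print(cfg.model.test_cfg)          # slide mode, 768x768 crop, 512 stride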