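# MMSegmentation config: Mask2Former for semantic segmentation with a Swin-B
# backbone (384x384 pretraining, window size 12) on ADE20K, trained on
# 640x640 crops for 160k iterations. Components prefixed with 'mmdet.' are
# reused from MMDetection.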
_base_ = [
    '../_base_/default_runtime.py', '../_base_/datasets/ade20k_640x640.py'
]

pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_20220317-55b0104a.pth'  # noqa

crop_size = (640, 640)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255,
    size=crop_size)
num_classes = 150
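
# The mean/std above are the standard ImageNet RGB statistics, and
# seg_pad_val=255 pads label maps with the ignore index (ADE20K has 150
# valid classes; 255 is ignored by the loss).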

depths = [2, 2, 18, 2]
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    backbone=dict(
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=depths,
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        with_cp=False,
        frozen_stages=-1,
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
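    # embed_dims=128, depths [2, 2, 18, 2] and heads [4, 8, 16, 32] are the
    # Swin-B configuration; window_size=12 matches the 384x384-pretrained
    # checkpoint loaded above.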
    decode_head=dict(
        type='Mask2FormerHead',
        in_channels=[128, 256, 512, 1024],
        strides=[4, 8, 16, 32],
        feat_channels=256,
        out_channels=256,
        num_classes=num_classes,
        num_queries=100,
        num_transformer_feat_level=3,
        align_corners=False,
        pixel_decoder=dict(
            type='mmdet.MSDeformAttnPixelDecoder',
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
            encoder=dict(  # DeformableDetrTransformerEncoder
                num_layers=6,
                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                        embed_dims=256,
                        num_heads=8,
                        num_levels=3,
                        num_points=4,
                        im2col_step=64,
                        dropout=0.0,
                        batch_first=True,
                        norm_cfg=None,
                        init_cfg=None),
                    ffn_cfg=dict(
                        embed_dims=256,
                        feedforward_channels=1024,
                        num_fcs=2,
                        ffn_drop=0.0,
                        act_cfg=dict(type='ReLU', inplace=True))),
                init_cfg=None),
            positional_encoding=dict(  # SinePositionalEncoding
                num_feats=128, normalize=True),
            init_cfg=None),
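        # The deformable-attention pixel decoder fuses the three
        # lowest-resolution backbone feature maps (num_outs=3) through six
        # encoder layers and yields the per-pixel embeddings against which
        # the query masks are predicted.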
        enforce_decoder_input_project=False,
        positional_encoding=dict(  # SinePositionalEncoding
            num_feats=128, normalize=True),
        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
            return_intermediate=True,
            num_layers=9,
            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.0,
                    proj_drop=0.0,
                    dropout_layer=None,
                    batch_first=True),
                cross_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.0,
                    proj_drop=0.0,
                    dropout_layer=None,
                    batch_first=True),
                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.0,
                    dropout_layer=None,
                    add_identity=True)),
            init_cfg=None),
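        # Nine decoder layers cycle three times over the three feature
        # levels (num_transformer_feat_level=3); in each layer the 100
        # queries cross-attend to one scale under Mask2Former's masked
        # attention.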
        loss_cls=dict(
            type='mmdet.CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=2.0,
            reduction='mean',
            class_weight=[1.0] * num_classes + [0.1]),
        loss_mask=dict(
            type='mmdet.CrossEntropyLoss',
            use_sigmoid=True,
            reduction='mean',
            loss_weight=5.0),
        loss_dice=dict(
            type='mmdet.DiceLoss',
            use_sigmoid=True,
            activate=True,
            reduction='mean',
            naive_dice=True,
            eps=1.0,
            loss_weight=5.0),
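        # The trailing 0.1 in class_weight down-weights the implicit
        # "no object" class. The loss weights (cls 2.0, mask 5.0, dice 5.0)
        # mirror the match costs in the assigner below, so Hungarian
        # matching and the training loss optimize the same objective.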
        train_cfg=dict(
            num_points=12544,
            oversample_ratio=3.0,
            importance_sample_ratio=0.75,
            assigner=dict(
                type='mmdet.HungarianAssigner',
                match_costs=[
                    dict(type='mmdet.ClassificationCost', weight=2.0),
                    dict(
                        type='mmdet.CrossEntropyLossCost',
                        weight=5.0,
                        use_sigmoid=True),
                    dict(
                        type='mmdet.DiceCost',
                        weight=5.0,
                        pred_act=True,
                        eps=1.0)
                ]),
            sampler=dict(type='mmdet.MaskPseudoSampler'))),
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
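
# Mask losses above are evaluated on num_points=12544 (112 x 112) sampled
# points per mask rather than on full-resolution masks, with PointRend-style
# importance sampling (oversample_ratio=3.0, importance_sample_ratio=0.75)
# to keep memory in check.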

# dataset config
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(
        type='RandomChoiceResize',
        scales=[int(x * 0.1 * 640) for x in range(5, 21)],
        resize_type='ResizeShortestEdge',
        max_size=2560),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='PackSegInputs')
]
train_dataloader = dict(batch_size=2, dataset=dict(pipeline=train_pipeline))
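
# RandomChoiceResize draws a short-edge target from 320 to 1280 in steps of
# 64 (0.5x to 2.0x of the 640 crop), capping the long edge at 2560;
# cat_max_ratio=0.75 rejects crops where one category covers more than 75%
# of the pixels.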

# set all layers in backbone to lr_mult=0.1
# set all norm layers, position embeddings,
# query embeddings and level embeddings to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
    'backbone': dict(lr_mult=0.1, decay_mult=1.0),
    'backbone.patch_embed.norm': backbone_norm_multi,
    'backbone.norm': backbone_norm_multi,
    'absolute_pos_embed': backbone_embed_multi,
    'relative_position_bias_table': backbone_embed_multi,
    'query_embed': embed_multi,
    'query_feat': embed_multi,
    'level_embed': embed_multi
}
custom_keys.update({
    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
    for stage_id, num_blocks in enumerate(depths)
    for block_id in range(num_blocks)
})
custom_keys.update({
    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
    for stage_id in range(len(depths) - 1)
})
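
# custom_keys entries are matched as substrings of parameter names (longest
# key first), so e.g. 'backbone.stages.2.blocks.17.norm' picks up
# lr_mult=0.1, decay_mult=0.0 while 'query_embed' keeps the full LR with no
# weight decay.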

# optimizer
optimizer = dict(
    type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999))
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=optimizer,
    clip_grad=dict(max_norm=0.01, norm_type=2),
    paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
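
# AdamW at lr=1e-4 with weight_decay=0.05; the tight gradient clipping
# (max_norm=0.01) follows the Mask2Former training recipe.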

# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=0,
        power=0.9,
        begin=0,
        end=160000,
        by_epoch=False)
]
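
# Polynomial decay (power 0.9) from the base LR to eta_min=0 across the full
# 160k iterations, stepped per iteration (by_epoch=False).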

# training schedule for 160k
train_cfg = dict(
    type='IterBasedTrainLoop', max_iters=160000, val_interval=5000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(
        type='CheckpointHook', by_epoch=False, interval=5000,
        save_best='mIoU'),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
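
# Checkpointing every 5000 iterations lines up with val_interval above;
# save_best='mIoU' additionally keeps a copy of the checkpoint with the best
# validation mIoU.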

# Default setting for scaling LR automatically
# - `enable`: whether to scale the learning rate automatically.
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)
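
# When enable=True, mmengine scales the LR by
# (actual total batch size) / base_batch_size; it is off here, so lr=0.0001
# is used as-is for the 8x2 setup.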