# MaskFormer with a ResNet-50 backbone on ADE20K (mmsegmentation config).
_base_ = [
|
|
'../_base_/datasets/ade20k.py', '../_base_/default_runtime.py',
|
|
'../_base_/schedules/schedule_160k.py'
|
|
]
|
|
# SyncBN keeps batch-norm statistics consistent across GPUs in distributed
# training.
norm_cfg = dict(type='SyncBN', requires_grad=True)
crop_size = (512, 512)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    size=crop_size,
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255)

# model_cfg
num_classes = 150  # ADE20K has 150 semantic classes.
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 1, 1),
        strides=(1, 2, 2, 2),
        norm_cfg=norm_cfg,
        norm_eval=True,
        style='pytorch',
        contract_dilation=True,
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    decode_head=dict(
        type='MaskFormerHead',
        in_channels=[256, 512, 1024,
                     2048],  # input channels of pixel_decoder modules
        feat_channels=256,
        in_index=[0, 1, 2, 3],
        # Use the module-level constant (was a hard-coded 150) so the head
        # can never fall out of sync with the class_weight list below.
        num_classes=num_classes,
        out_channels=256,
        num_queries=100,
        pixel_decoder=dict(
            type='mmdet.PixelDecoder',
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU')),
        enforce_decoder_input_project=False,
        positional_encoding=dict(  # SinePositionalEncoding
            num_feats=128, normalize=True),
        transformer_decoder=dict(  # DetrTransformerDecoder
            return_intermediate=True,
            num_layers=6,
            layer_cfg=dict(  # DetrTransformerDecoderLayer
                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.1,
                    proj_drop=0.1,
                    dropout_layer=None,
                    batch_first=True),
                cross_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.1,
                    proj_drop=0.1,
                    dropout_layer=None,
                    batch_first=True),
                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.1,
                    dropout_layer=None,
                    add_identity=True)),
            init_cfg=None),
        loss_cls=dict(
            type='mmdet.CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0,
            reduction='mean',
            # One weight per class plus a trailing 0.1 entry that
            # down-weights the "no object" (background) query class.
            class_weight=[1.0] * num_classes + [0.1]),
        loss_mask=dict(
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            reduction='mean',
            loss_weight=20.0),
        loss_dice=dict(
            type='mmdet.DiceLoss',
            use_sigmoid=True,
            activate=True,
            reduction='mean',
            naive_dice=True,
            eps=1.0,
            loss_weight=1.0),
        # Head-internal training config: Hungarian matching between the 100
        # queries and ground-truth masks, using the same cost weights as the
        # corresponding losses above (cls 1.0, mask 20.0, dice 1.0).
        train_cfg=dict(
            assigner=dict(
                type='mmdet.HungarianAssigner',
                match_costs=[
                    dict(type='mmdet.ClassificationCost', weight=1.0),
                    dict(
                        type='mmdet.FocalLossCost',
                        weight=20.0,
                        binary_input=True),
                    dict(
                        type='mmdet.DiceCost',
                        weight=1.0,
                        pred_act=True,
                        eps=1.0)
                ]),
            sampler=dict(type='mmdet.MaskPseudoSampler'))),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'),
)
# Optimizer: AdamW, with gradients clipped to a small max norm for the
# transformer decoder's stability.
optimizer = dict(
    type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.0001)
optim_wrapper = dict(
    _delete_=True,  # discard any optim_wrapper inherited from _base_
    type='OptimWrapper',
    optimizer=optimizer,
    clip_grad=dict(max_norm=0.01, norm_type=2),
    # The pretrained backbone trains at a 10x smaller learning rate.
    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}))

# Learning policy: polynomial decay over the full 160k iterations,
# stepped per iteration rather than per epoch.
param_scheduler = [
    dict(
        type='PolyLR',
        power=0.9,
        eta_min=0,
        begin=0,
        end=160000,
        by_epoch=False)
]
# MaskFormer's reference recipe trains with 2 images per GPU; evaluation
# runs single-image batches, and testing reuses the validation loader.
train_dataloader = dict(batch_size=2, num_workers=2)
val_dataloader = dict(batch_size=1, num_workers=4)
test_dataloader = val_dataloader