# File-viewer header (not part of the config): 33 lines, 861 B, Python
# Model config override built on top of the ViT-B/16 VOC12Aug SAN config.
# NOTE(review): `_base_` is presumably resolved by the config loader, which
# merges the keys below over the base file -- confirm against the loader used.
_base_ = ['./san-vit-b16_voc12aug-640x640.py']

model = dict(
    type='MultimodalEncoderDecoder',
    # NOTE(review): the checkpoint filename reads like a ViT-Base/16 @ 224
    # weight file, while `image_encoder` below is sized with embed_dims=1024,
    # patch_size=14 and img_size=(336, 336) -- confirm this is the intended
    # checkpoint for this variant.
    pretrained='pretrain/jx_vit_base_p16_224-80ecf9dd.pth',
    encoder_resolution=0.7,
    # Image branch settings.
    image_encoder=dict(
        type='VisionTransformer',
        img_size=(336, 336),
        patch_size=14,
        patch_pad=0,
        embed_dims=1024,
        num_layers=18,
        num_heads=16,
        out_indices=(5, 11, 17),
    ),
    # Text branch settings.
    text_encoder=dict(
        type='CLIPTextEncoder',
        embed_dims=768,
        num_layers=12,
        num_heads=12,
        output_dims=768,
    ),
    # Decode head settings; `clip_channels` and `maskgen_cfg.embed_dims` use
    # the same width (1024) as `image_encoder.embed_dims`, and
    # `maskgen_cfg.out_dims` the same value (768) as
    # `text_encoder.output_dims`.
    decode_head=dict(
        type='SideAdapterCLIPHead',
        san_cfg=dict(
            clip_channels=1024,
            cfg_decoder=dict(num_heads=16),
        ),
        maskgen_cfg=dict(
            num_layers=6,
            embed_dims=1024,
            num_heads=16,
            out_dims=768,
        ),
    ),
)