Improve kwarg passthrough for swin, vit, deit, beit, eva

pull/1760/head
Ross Wightman 2023-04-05 21:37:16 -07:00
parent a09e240cd6
commit 1bb3989b61
9 changed files with 345 additions and 373 deletions

View File

@ -513,62 +513,62 @@ def _create_beit(variant, pretrained=False, **kwargs):
@register_model
def beit_base_patch16_224(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
model = _create_beit('beit_base_patch16_224', pretrained=pretrained, **model_kwargs)
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1)
model = _create_beit('beit_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def beit_base_patch16_384(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12,
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
model = _create_beit('beit_base_patch16_384', pretrained=pretrained, **model_kwargs)
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1)
model = _create_beit('beit_base_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def beit_large_patch16_224(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=1024, depth=24, num_heads=16,
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
model = _create_beit('beit_large_patch16_224', pretrained=pretrained, **model_kwargs)
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5)
model = _create_beit('beit_large_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def beit_large_patch16_384(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16,
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
model = _create_beit('beit_large_patch16_384', pretrained=pretrained, **model_kwargs)
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5)
model = _create_beit('beit_large_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def beit_large_patch16_512(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=512, patch_size=16, embed_dim=1024, depth=24, num_heads=16,
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
model = _create_beit('beit_large_patch16_512', pretrained=pretrained, **model_kwargs)
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5)
model = _create_beit('beit_large_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def beitv2_base_patch16_224(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
model = _create_beit('beitv2_base_patch16_224', pretrained=pretrained, **model_kwargs)
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5)
model = _create_beit('beitv2_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def beitv2_large_patch16_224(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=1024, depth=24, num_heads=16,
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
model = _create_beit('beitv2_large_patch16_224', pretrained=pretrained, **model_kwargs)
use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5)
model = _create_beit('beitv2_large_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model

View File

@ -226,8 +226,8 @@ def deit_tiny_patch16_224(pretrained=False, **kwargs):
""" DeiT-tiny model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs)
model = _create_deit('deit_tiny_patch16_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3)
model = _create_deit('deit_tiny_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -236,8 +236,8 @@ def deit_small_patch16_224(pretrained=False, **kwargs):
""" DeiT-small model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs)
model = _create_deit('deit_small_patch16_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6)
model = _create_deit('deit_small_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -246,8 +246,8 @@ def deit_base_patch16_224(pretrained=False, **kwargs):
""" DeiT base model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
model = _create_deit('deit_base_patch16_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12)
model = _create_deit('deit_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -256,8 +256,8 @@ def deit_base_patch16_384(pretrained=False, **kwargs):
""" DeiT base model @ 384x384 from paper (https://arxiv.org/abs/2012.12877).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
model = _create_deit('deit_base_patch16_384', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12)
model = _create_deit('deit_base_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -266,9 +266,9 @@ def deit_tiny_distilled_patch16_224(pretrained=False, **kwargs):
""" DeiT-tiny distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs)
model_args = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3)
model = _create_deit(
'deit_tiny_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs)
'deit_tiny_distilled_patch16_224', pretrained=pretrained, distilled=True, **dict(model_args, **kwargs))
return model
@ -277,9 +277,9 @@ def deit_small_distilled_patch16_224(pretrained=False, **kwargs):
""" DeiT-small distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs)
model_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6)
model = _create_deit(
'deit_small_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs)
'deit_small_distilled_patch16_224', pretrained=pretrained, distilled=True, **dict(model_args, **kwargs))
return model
@ -288,9 +288,9 @@ def deit_base_distilled_patch16_224(pretrained=False, **kwargs):
""" DeiT-base distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12)
model = _create_deit(
'deit_base_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs)
'deit_base_distilled_patch16_224', pretrained=pretrained, distilled=True, **dict(model_args, **kwargs))
return model
@ -299,9 +299,9 @@ def deit_base_distilled_patch16_384(pretrained=False, **kwargs):
""" DeiT-base distilled model @ 384x384 from paper (https://arxiv.org/abs/2012.12877).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12)
model = _create_deit(
'deit_base_distilled_patch16_384', pretrained=pretrained, distilled=True, **model_kwargs)
'deit_base_distilled_patch16_384', pretrained=pretrained, distilled=True, **dict(model_args, **kwargs))
return model
@ -310,9 +310,8 @@ def deit3_small_patch16_224(pretrained=False, **kwargs):
""" DeiT-3 small model @ 224x224 from paper (https://arxiv.org/abs/2204.07118).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(
patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6, **kwargs)
model = _create_deit('deit3_small_patch16_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6)
model = _create_deit('deit3_small_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -321,9 +320,8 @@ def deit3_small_patch16_384(pretrained=False, **kwargs):
""" DeiT-3 small model @ 384x384 from paper (https://arxiv.org/abs/2204.07118).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(
patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6, **kwargs)
model = _create_deit('deit3_small_patch16_384', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6)
model = _create_deit('deit3_small_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -332,9 +330,8 @@ def deit3_medium_patch16_224(pretrained=False, **kwargs):
""" DeiT-3 medium model @ 224x224 (https://arxiv.org/abs/2204.07118).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(
patch_size=16, embed_dim=512, depth=12, num_heads=8, no_embed_class=True, init_values=1e-6, **kwargs)
model = _create_deit('deit3_medium_patch16_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=512, depth=12, num_heads=8, no_embed_class=True, init_values=1e-6)
model = _create_deit('deit3_medium_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -343,9 +340,8 @@ def deit3_base_patch16_224(pretrained=False, **kwargs):
""" DeiT-3 base model @ 224x224 from paper (https://arxiv.org/abs/2204.07118).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6, **kwargs)
model = _create_deit('deit3_base_patch16_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6)
model = _create_deit('deit3_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -354,9 +350,8 @@ def deit3_base_patch16_384(pretrained=False, **kwargs):
""" DeiT-3 base model @ 384x384 from paper (https://arxiv.org/abs/2204.07118).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6, **kwargs)
model = _create_deit('deit3_base_patch16_384', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6)
model = _create_deit('deit3_base_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -365,9 +360,8 @@ def deit3_large_patch16_224(pretrained=False, **kwargs):
""" DeiT-3 large model @ 224x224 from paper (https://arxiv.org/abs/2204.07118).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(
patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs)
model = _create_deit('deit3_large_patch16_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6)
model = _create_deit('deit3_large_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -376,9 +370,8 @@ def deit3_large_patch16_384(pretrained=False, **kwargs):
""" DeiT-3 large model @ 384x384 from paper (https://arxiv.org/abs/2204.07118).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(
patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs)
model = _create_deit('deit3_large_patch16_384', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6)
model = _create_deit('deit3_large_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -387,9 +380,8 @@ def deit3_huge_patch14_224(pretrained=False, **kwargs):
""" DeiT-3 huge model @ 224x224 from paper (https://arxiv.org/abs/2204.07118).
ImageNet-1k weights from https://github.com/facebookresearch/deit.
"""
model_kwargs = dict(
patch_size=14, embed_dim=1280, depth=32, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs)
model = _create_deit('deit3_huge_patch14_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, no_embed_class=True, init_values=1e-6)
model = _create_deit('deit3_huge_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model

View File

@ -774,33 +774,30 @@ default_cfgs = generate_default_cfgs({
@register_model
def eva_giant_patch14_224(pretrained=False, **kwargs):
""" EVA-g model https://arxiv.org/abs/2211.07636 """
model_kwargs = dict(
patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
model = _create_eva('eva_giant_patch14_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
model = _create_eva('eva_giant_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva_giant_patch14_336(pretrained=False, **kwargs):
""" EVA-g model https://arxiv.org/abs/2211.07636 """
model_kwargs = dict(
patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
model = _create_eva('eva_giant_patch14_336', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
model = _create_eva('eva_giant_patch14_336', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva_giant_patch14_560(pretrained=False, **kwargs):
""" EVA-g model https://arxiv.org/abs/2211.07636 """
model_kwargs = dict(
patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
model = _create_eva('eva_giant_patch14_560', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408)
model = _create_eva('eva_giant_patch14_560', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_tiny_patch14_224(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=224,
patch_size=14,
embed_dim=192,
@ -811,13 +808,13 @@ def eva02_tiny_patch14_224(pretrained=False, **kwargs):
use_rot_pos_emb=True,
ref_feat_shape=(16, 16), # 224/14
)
model = _create_eva('eva02_tiny_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_tiny_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_small_patch14_224(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=224,
patch_size=14,
embed_dim=384,
@ -828,13 +825,13 @@ def eva02_small_patch14_224(pretrained=False, **kwargs):
use_rot_pos_emb=True,
ref_feat_shape=(16, 16), # 224/14
)
model = _create_eva('eva02_small_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_small_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_base_patch14_224(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=224,
patch_size=14,
embed_dim=768,
@ -847,13 +844,13 @@ def eva02_base_patch14_224(pretrained=False, **kwargs):
use_rot_pos_emb=True,
ref_feat_shape=(16, 16), # 224/14
)
model = _create_eva('eva02_base_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_base_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_large_patch14_224(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=224,
patch_size=14,
embed_dim=1024,
@ -866,13 +863,13 @@ def eva02_large_patch14_224(pretrained=False, **kwargs):
use_rot_pos_emb=True,
ref_feat_shape=(16, 16), # 224/14
)
model = _create_eva('eva02_large_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_large_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_tiny_patch14_336(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=336,
patch_size=14,
embed_dim=192,
@ -883,13 +880,13 @@ def eva02_tiny_patch14_336(pretrained=False, **kwargs):
use_rot_pos_emb=True,
ref_feat_shape=(16, 16), # 224/14
)
model = _create_eva('eva02_tiny_patch14_336', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_tiny_patch14_336', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_small_patch14_336(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=336,
patch_size=14,
embed_dim=384,
@ -900,13 +897,13 @@ def eva02_small_patch14_336(pretrained=False, **kwargs):
use_rot_pos_emb=True,
ref_feat_shape=(16, 16), # 224/14
)
model = _create_eva('eva02_small_patch14_336', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_small_patch14_336', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_base_patch14_448(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=448,
patch_size=14,
embed_dim=768,
@ -919,13 +916,13 @@ def eva02_base_patch14_448(pretrained=False, **kwargs):
use_rot_pos_emb=True,
ref_feat_shape=(16, 16), # 224/14
)
model = _create_eva('eva02_base_patch14_448', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_base_patch14_448', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_large_patch14_448(pretrained=False, **kwargs):
model_kwargs = dict(
model_args = dict(
img_size=448,
patch_size=14,
embed_dim=1024,
@ -938,14 +935,14 @@ def eva02_large_patch14_448(pretrained=False, **kwargs):
use_rot_pos_emb=True,
ref_feat_shape=(16, 16), # 224/14
)
model = _create_eva('eva02_large_patch14_448', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_large_patch14_448', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_base_patch16_clip_224(pretrained=False, **kwargs):
# An EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_base
model_kwargs = dict(
model_args = dict(
img_size=224,
patch_size=16,
embed_dim=768,
@ -960,14 +957,14 @@ def eva02_base_patch16_clip_224(pretrained=False, **kwargs):
ref_feat_shape=(16, 16), # 224/14
global_pool='token',
)
model = _create_eva('eva02_base_patch16_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_base_patch16_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_large_patch14_clip_224(pretrained=False, **kwargs):
# An EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_large
model_kwargs = dict(
model_args = dict(
img_size=224,
patch_size=14,
embed_dim=1024,
@ -982,14 +979,14 @@ def eva02_large_patch14_clip_224(pretrained=False, **kwargs):
ref_feat_shape=(16, 16), # 224/14
global_pool='token',
)
model = _create_eva('eva02_large_patch14_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_large_patch14_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva02_enormous_patch14_clip_224(pretrained=False, **kwargs):
# An EVA-CLIP specific variant that uses residual post-norm in blocks
model_kwargs = dict(
model_args = dict(
img_size=224,
patch_size=14,
embed_dim=1792,
@ -999,5 +996,5 @@ def eva02_enormous_patch14_clip_224(pretrained=False, **kwargs):
use_post_norm=True,
global_pool='token',
)
model = _create_eva('eva02_enormous_patch14_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model = _create_eva('eva02_enormous_patch14_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model

View File

@ -701,84 +701,81 @@ default_cfgs = generate_default_cfgs({
def swin_tiny_patch4_window7_224(pretrained=False, **kwargs):
""" Swin-T @ 224x224, trained ImageNet-1k
"""
model_kwargs = dict(
patch_size=4, window_size=7, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs)
return _create_swin_transformer('swin_tiny_patch4_window7_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=4, window_size=7, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24))
return _create_swin_transformer(
'swin_tiny_patch4_window7_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swin_small_patch4_window7_224(pretrained=False, **kwargs):
""" Swin-S @ 224x224
"""
model_kwargs = dict(
patch_size=4, window_size=7, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs)
return _create_swin_transformer('swin_small_patch4_window7_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=4, window_size=7, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24))
return _create_swin_transformer(
'swin_small_patch4_window7_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swin_base_patch4_window7_224(pretrained=False, **kwargs):
""" Swin-B @ 224x224
"""
model_kwargs = dict(
patch_size=4, window_size=7, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
return _create_swin_transformer('swin_base_patch4_window7_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=4, window_size=7, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32))
return _create_swin_transformer(
'swin_base_patch4_window7_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swin_base_patch4_window12_384(pretrained=False, **kwargs):
""" Swin-B @ 384x384
"""
model_kwargs = dict(
patch_size=4, window_size=12, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
return _create_swin_transformer('swin_base_patch4_window12_384', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=4, window_size=12, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32))
return _create_swin_transformer(
'swin_base_patch4_window12_384', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swin_large_patch4_window7_224(pretrained=False, **kwargs):
""" Swin-L @ 224x224
"""
model_kwargs = dict(
patch_size=4, window_size=7, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs)
return _create_swin_transformer('swin_large_patch4_window7_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=4, window_size=7, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48))
return _create_swin_transformer(
'swin_large_patch4_window7_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swin_large_patch4_window12_384(pretrained=False, **kwargs):
""" Swin-L @ 384x384
"""
model_kwargs = dict(
patch_size=4, window_size=12, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs)
return _create_swin_transformer('swin_large_patch4_window12_384', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=4, window_size=12, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48))
return _create_swin_transformer(
'swin_large_patch4_window12_384', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swin_s3_tiny_224(pretrained=False, **kwargs):
""" Swin-S3-T @ 224x224, https://arxiv.org/abs/2111.14725
"""
model_kwargs = dict(
patch_size=4, window_size=(7, 7, 14, 7), embed_dim=96, depths=(2, 2, 6, 2),
num_heads=(3, 6, 12, 24), **kwargs)
return _create_swin_transformer('swin_s3_tiny_224', pretrained=pretrained, **model_kwargs)
model_args = dict(
patch_size=4, window_size=(7, 7, 14, 7), embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24))
return _create_swin_transformer('swin_s3_tiny_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swin_s3_small_224(pretrained=False, **kwargs):
""" Swin-S3-S @ 224x224, https://arxiv.org/abs/2111.14725
"""
model_kwargs = dict(
patch_size=4, window_size=(14, 14, 14, 7), embed_dim=96, depths=(2, 2, 18, 2),
num_heads=(3, 6, 12, 24), **kwargs)
return _create_swin_transformer('swin_s3_small_224', pretrained=pretrained, **model_kwargs)
model_args = dict(
patch_size=4, window_size=(14, 14, 14, 7), embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24))
return _create_swin_transformer('swin_s3_small_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swin_s3_base_224(pretrained=False, **kwargs):
""" Swin-S3-B @ 224x224, https://arxiv.org/abs/2111.14725
"""
model_kwargs = dict(
patch_size=4, window_size=(7, 7, 14, 7), embed_dim=96, depths=(2, 2, 30, 2),
num_heads=(3, 6, 12, 24), **kwargs)
return _create_swin_transformer('swin_s3_base_224', pretrained=pretrained, **model_kwargs)
model_args = dict(
patch_size=4, window_size=(7, 7, 14, 7), embed_dim=96, depths=(2, 2, 30, 2), num_heads=(3, 6, 12, 24))
return _create_swin_transformer('swin_s3_base_224', pretrained=pretrained, **dict(model_args, **kwargs))
register_model_deprecations(__name__, {

View File

@ -709,116 +709,116 @@ default_cfgs = generate_default_cfgs({
def swinv2_tiny_window16_256(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
window_size=16, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs)
return _create_swin_transformer_v2('swinv2_tiny_window16_256', pretrained=pretrained, **model_kwargs)
model_args = dict(window_size=16, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24))
return _create_swin_transformer_v2(
'swinv2_tiny_window16_256', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_tiny_window8_256(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
window_size=8, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs)
return _create_swin_transformer_v2('swinv2_tiny_window8_256', pretrained=pretrained, **model_kwargs)
model_args = dict(window_size=8, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24))
return _create_swin_transformer_v2(
'swinv2_tiny_window8_256', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_small_window16_256(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
window_size=16, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs)
return _create_swin_transformer_v2('swinv2_small_window16_256', pretrained=pretrained, **model_kwargs)
model_args = dict(window_size=16, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24))
return _create_swin_transformer_v2(
'swinv2_small_window16_256', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_small_window8_256(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
window_size=8, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs)
return _create_swin_transformer_v2('swinv2_small_window8_256', pretrained=pretrained, **model_kwargs)
model_args = dict(window_size=8, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24))
return _create_swin_transformer_v2(
'swinv2_small_window8_256', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_base_window16_256(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
window_size=16, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
return _create_swin_transformer_v2('swinv2_base_window16_256', pretrained=pretrained, **model_kwargs)
model_args = dict(window_size=16, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32))
return _create_swin_transformer_v2(
'swinv2_base_window16_256', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_base_window8_256(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
window_size=8, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
return _create_swin_transformer_v2('swinv2_base_window8_256', pretrained=pretrained, **model_kwargs)
model_args = dict(window_size=8, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32))
return _create_swin_transformer_v2(
'swinv2_base_window8_256', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_base_window12_192(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
window_size=12, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs)
return _create_swin_transformer_v2('swinv2_base_window12_192', pretrained=pretrained, **model_kwargs)
model_args = dict(window_size=12, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32))
return _create_swin_transformer_v2(
'swinv2_base_window12_192', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_base_window12to16_192to256(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
model_args = dict(
window_size=16, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32),
pretrained_window_sizes=(12, 12, 12, 6), **kwargs)
pretrained_window_sizes=(12, 12, 12, 6))
return _create_swin_transformer_v2(
'swinv2_base_window12to16_192to256', pretrained=pretrained, **model_kwargs)
'swinv2_base_window12to16_192to256', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_base_window12to24_192to384(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
model_args = dict(
window_size=24, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32),
pretrained_window_sizes=(12, 12, 12, 6), **kwargs)
pretrained_window_sizes=(12, 12, 12, 6))
return _create_swin_transformer_v2(
'swinv2_base_window12to24_192to384', pretrained=pretrained, **model_kwargs)
'swinv2_base_window12to24_192to384', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_large_window12_192(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
window_size=12, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs)
return _create_swin_transformer_v2('swinv2_large_window12_192', pretrained=pretrained, **model_kwargs)
model_args = dict(window_size=12, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48))
return _create_swin_transformer_v2(
'swinv2_large_window12_192', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_large_window12to16_192to256(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
model_args = dict(
window_size=16, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48),
pretrained_window_sizes=(12, 12, 12, 6), **kwargs)
pretrained_window_sizes=(12, 12, 12, 6))
return _create_swin_transformer_v2(
'swinv2_large_window12to16_192to256', pretrained=pretrained, **model_kwargs)
'swinv2_large_window12to16_192to256', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_large_window12to24_192to384(pretrained=False, **kwargs):
"""
"""
model_kwargs = dict(
model_args = dict(
window_size=24, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48),
pretrained_window_sizes=(12, 12, 12, 6), **kwargs)
pretrained_window_sizes=(12, 12, 12, 6))
return _create_swin_transformer_v2(
'swinv2_large_window12to24_192to384', pretrained=pretrained, **model_kwargs)
'swinv2_large_window12to24_192to384', pretrained=pretrained, **dict(model_args, **kwargs))
register_model_deprecations(__name__, {

View File

@ -838,25 +838,23 @@ default_cfgs = generate_default_cfgs({
@register_model
def swinv2_cr_tiny_384(pretrained=False, **kwargs):
"""Swin-T V2 CR @ 384x384, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=96,
depths=(2, 2, 6, 2),
num_heads=(3, 6, 12, 24),
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_tiny_384', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_tiny_384', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_tiny_224(pretrained=False, **kwargs):
"""Swin-T V2 CR @ 224x224, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=96,
depths=(2, 2, 6, 2),
num_heads=(3, 6, 12, 24),
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_tiny_224', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_tiny_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
@ -864,176 +862,160 @@ def swinv2_cr_tiny_ns_224(pretrained=False, **kwargs):
"""Swin-T V2 CR @ 224x224, trained ImageNet-1k w/ extra stage norms.
** Experimental, may make default if results are improved. **
"""
model_kwargs = dict(
model_args = dict(
embed_dim=96,
depths=(2, 2, 6, 2),
num_heads=(3, 6, 12, 24),
extra_norm_stage=True,
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_tiny_ns_224', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_tiny_ns_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_small_384(pretrained=False, **kwargs):
"""Swin-S V2 CR @ 384x384, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=96,
depths=(2, 2, 18, 2),
num_heads=(3, 6, 12, 24),
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_small_384', pretrained=pretrained, **model_kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_small_384', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_small_224(pretrained=False, **kwargs):
"""Swin-S V2 CR @ 224x224, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=96,
depths=(2, 2, 18, 2),
num_heads=(3, 6, 12, 24),
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_small_224', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_small_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_small_ns_224(pretrained=False, **kwargs):
"""Swin-S V2 CR @ 224x224, trained ImageNet-1k w/ extra stage norms."""
model_kwargs = dict(
model_args = dict(
embed_dim=96,
depths=(2, 2, 18, 2),
num_heads=(3, 6, 12, 24),
extra_norm_stage=True,
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_small_ns_224', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_small_ns_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_small_ns_256(pretrained=False, **kwargs):
"""Swin-S V2 CR @ 256x256, trained ImageNet-1k w/ extra stage norms."""
model_kwargs = dict(
model_args = dict(
embed_dim=96,
depths=(2, 2, 18, 2),
num_heads=(3, 6, 12, 24),
extra_norm_stage=True,
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_small_ns_256', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_small_ns_256', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_base_384(pretrained=False, **kwargs):
"""Swin-B V2 CR @ 384x384, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=128,
depths=(2, 2, 18, 2),
num_heads=(4, 8, 16, 32),
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_base_384', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_base_384', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_base_224(pretrained=False, **kwargs):
"""Swin-B V2 CR @ 224x224, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=128,
depths=(2, 2, 18, 2),
num_heads=(4, 8, 16, 32),
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_base_224', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_base_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_base_ns_224(pretrained=False, **kwargs):
"""Swin-B V2 CR @ 224x224, trained ImageNet-1k w/ extra stage norms."""
model_kwargs = dict(
model_args = dict(
embed_dim=128,
depths=(2, 2, 18, 2),
num_heads=(4, 8, 16, 32),
extra_norm_stage=True,
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_base_ns_224', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_base_ns_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_large_384(pretrained=False, **kwargs):
"""Swin-L V2 CR @ 384x384, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=192,
depths=(2, 2, 18, 2),
num_heads=(6, 12, 24, 48),
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_large_384', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_large_384', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_large_224(pretrained=False, **kwargs):
"""Swin-L V2 CR @ 224x224, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=192,
depths=(2, 2, 18, 2),
num_heads=(6, 12, 24, 48),
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_large_224', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_large_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_huge_384(pretrained=False, **kwargs):
"""Swin-H V2 CR @ 384x384, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=352,
depths=(2, 2, 18, 2),
num_heads=(11, 22, 44, 88), # head count not certain for Huge, 384 & 224 trying diff values
extra_norm_period=6,
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_huge_384', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_huge_384', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_huge_224(pretrained=False, **kwargs):
"""Swin-H V2 CR @ 224x224, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=352,
depths=(2, 2, 18, 2),
num_heads=(8, 16, 32, 64), # head count not certain for Huge, 384 & 224 trying diff values
extra_norm_period=6,
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_huge_224', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_huge_224', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_giant_384(pretrained=False, **kwargs):
"""Swin-G V2 CR @ 384x384, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=512,
depths=(2, 2, 42, 2),
num_heads=(16, 32, 64, 128),
extra_norm_period=6,
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_giant_384', pretrained=pretrained, **model_kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_giant_384', pretrained=pretrained, **dict(model_args, **kwargs))
@register_model
def swinv2_cr_giant_224(pretrained=False, **kwargs):
"""Swin-G V2 CR @ 224x224, trained ImageNet-1k"""
model_kwargs = dict(
model_args = dict(
embed_dim=512,
depths=(2, 2, 42, 2),
num_heads=(16, 32, 64, 128),
extra_norm_period=6,
**kwargs
)
return _create_swin_transformer_v2_cr('swinv2_cr_giant_224', pretrained=pretrained, **model_kwargs)
return _create_swin_transformer_v2_cr('swinv2_cr_giant_224', pretrained=pretrained, **dict(model_args, **kwargs))

View File

@ -1319,8 +1319,8 @@ def _create_vision_transformer(variant, pretrained=False, **kwargs):
def vit_tiny_patch16_224(pretrained=False, **kwargs):
""" ViT-Tiny (ViT-Ti/16)
"""
model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3)
model = _create_vision_transformer('vit_tiny_patch16_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3)
model = _create_vision_transformer('vit_tiny_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1328,8 +1328,8 @@ def vit_tiny_patch16_224(pretrained=False, **kwargs):
def vit_tiny_patch16_384(pretrained=False, **kwargs):
""" ViT-Tiny (ViT-Ti/16) @ 384x384.
"""
model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3)
model = _create_vision_transformer('vit_tiny_patch16_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3)
model = _create_vision_transformer('vit_tiny_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1337,8 +1337,8 @@ def vit_tiny_patch16_384(pretrained=False, **kwargs):
def vit_small_patch32_224(pretrained=False, **kwargs):
""" ViT-Small (ViT-S/32)
"""
model_kwargs = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer('vit_small_patch32_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer('vit_small_patch32_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1346,8 +1346,8 @@ def vit_small_patch32_224(pretrained=False, **kwargs):
def vit_small_patch32_384(pretrained=False, **kwargs):
""" ViT-Small (ViT-S/32) at 384x384.
"""
model_kwargs = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer('vit_small_patch32_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer('vit_small_patch32_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1355,8 +1355,8 @@ def vit_small_patch32_384(pretrained=False, **kwargs):
def vit_small_patch16_224(pretrained=False, **kwargs):
""" ViT-Small (ViT-S/16)
"""
model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1364,8 +1364,8 @@ def vit_small_patch16_224(pretrained=False, **kwargs):
def vit_small_patch16_384(pretrained=False, **kwargs):
""" ViT-Small (ViT-S/16) @ 384x384.
"""
model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer('vit_small_patch16_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer('vit_small_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1373,8 +1373,8 @@ def vit_small_patch16_384(pretrained=False, **kwargs):
def vit_small_patch8_224(pretrained=False, **kwargs):
""" ViT-Small (ViT-S/8)
"""
model_kwargs = dict(patch_size=8, embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer('vit_small_patch8_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=8, embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer('vit_small_patch8_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1383,8 +1383,8 @@ def vit_base_patch32_224(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
ImageNet-1k weights fine-tuned from in21k, source https://github.com/google-research/vision_transformer.
"""
model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer('vit_base_patch32_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer('vit_base_patch32_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1393,8 +1393,8 @@ def vit_base_patch32_384(pretrained=False, **kwargs):
""" ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
"""
model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer('vit_base_patch32_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer('vit_base_patch32_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1403,8 +1403,8 @@ def vit_base_patch16_224(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
"""
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer('vit_base_patch16_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer('vit_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1413,8 +1413,8 @@ def vit_base_patch16_384(pretrained=False, **kwargs):
""" ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
"""
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer('vit_base_patch16_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer('vit_base_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1423,8 +1423,8 @@ def vit_base_patch8_224(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929).
ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
"""
model_kwargs = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer('vit_base_patch8_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer('vit_base_patch8_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1432,8 +1432,8 @@ def vit_base_patch8_224(pretrained=False, **kwargs):
def vit_large_patch32_224(pretrained=False, **kwargs):
""" ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights.
"""
model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer('vit_large_patch32_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer('vit_large_patch32_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1442,8 +1442,8 @@ def vit_large_patch32_384(pretrained=False, **kwargs):
""" ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
"""
model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer('vit_large_patch32_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer('vit_large_patch32_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1452,8 +1452,8 @@ def vit_large_patch16_224(pretrained=False, **kwargs):
""" ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
"""
model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer('vit_large_patch16_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer('vit_large_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1462,8 +1462,8 @@ def vit_large_patch16_384(pretrained=False, **kwargs):
""" ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
"""
model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer('vit_large_patch16_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer('vit_large_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1471,8 +1471,8 @@ def vit_large_patch16_384(pretrained=False, **kwargs):
def vit_large_patch14_224(pretrained=False, **kwargs):
""" ViT-Large model (ViT-L/14)
"""
model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer('vit_large_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer('vit_large_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1480,8 +1480,8 @@ def vit_large_patch14_224(pretrained=False, **kwargs):
def vit_huge_patch14_224(pretrained=False, **kwargs):
""" ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
"""
model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16)
model = _create_vision_transformer('vit_huge_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16)
model = _create_vision_transformer('vit_huge_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1489,8 +1489,8 @@ def vit_huge_patch14_224(pretrained=False, **kwargs):
def vit_giant_patch14_224(pretrained=False, **kwargs):
""" ViT-Giant (little-g) model (ViT-g/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
"""
model_kwargs = dict(patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16)
model = _create_vision_transformer('vit_giant_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16)
model = _create_vision_transformer('vit_giant_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1498,9 +1498,9 @@ def vit_giant_patch14_224(pretrained=False, **kwargs):
def vit_gigantic_patch14_224(pretrained=False, **kwargs):
""" ViT-Gigantic (big-G) model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
"""
model_kwargs = dict(patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16)
model_args = dict(patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16)
model = _create_vision_transformer(
'vit_gigantic_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_gigantic_patch14_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1509,9 +1509,9 @@ def vit_base_patch16_224_miil(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
"""
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False)
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False)
model = _create_vision_transformer(
'vit_base_patch16_224_miil', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch16_224_miil', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1519,11 +1519,11 @@ def vit_base_patch16_224_miil(pretrained=False, **kwargs):
def vit_medium_patch16_gap_240(pretrained=False, **kwargs):
""" ViT-Medium (ViT-M/16) w/o class token, w/ avg-pool @ 240x240
"""
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=512, depth=12, num_heads=8, class_token=False,
global_pool='avg', qkv_bias=False, init_values=1e-6, fc_norm=False)
model = _create_vision_transformer(
'vit_medium_patch16_gap_240', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_medium_patch16_gap_240', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1531,11 +1531,11 @@ def vit_medium_patch16_gap_240(pretrained=False, **kwargs):
def vit_medium_patch16_gap_256(pretrained=False, **kwargs):
""" ViT-Medium (ViT-M/16) w/o class token, w/ avg-pool @ 256x256
"""
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=512, depth=12, num_heads=8, class_token=False,
global_pool='avg', qkv_bias=False, init_values=1e-6, fc_norm=False)
model = _create_vision_transformer(
'vit_medium_patch16_gap_256', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_medium_patch16_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1543,11 +1543,11 @@ def vit_medium_patch16_gap_256(pretrained=False, **kwargs):
def vit_medium_patch16_gap_384(pretrained=False, **kwargs):
""" ViT-Medium (ViT-M/16) w/o class token, w/ avg-pool @ 384x384
"""
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=512, depth=12, num_heads=8, class_token=False,
global_pool='avg', qkv_bias=False, init_values=1e-6, fc_norm=False)
model = _create_vision_transformer(
'vit_medium_patch16_gap_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_medium_patch16_gap_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1555,10 +1555,10 @@ def vit_medium_patch16_gap_384(pretrained=False, **kwargs):
def vit_base_patch16_gap_224(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/16) w/o class token, w/ avg-pool @ 224x224
"""
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=16, class_token=False, global_pool='avg', fc_norm=False)
model = _create_vision_transformer(
'vit_base_patch16_gap_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch16_gap_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1566,10 +1566,10 @@ def vit_base_patch16_gap_224(pretrained=False, **kwargs):
def vit_base_patch32_clip_224(pretrained=False, **kwargs):
""" ViT-B/32 CLIP image tower @ 224x224
"""
model_kwargs = dict(
model_args = dict(
patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_base_patch32_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch32_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1577,10 +1577,10 @@ def vit_base_patch32_clip_224(pretrained=False, **kwargs):
def vit_base_patch32_clip_384(pretrained=False, **kwargs):
""" ViT-B/32 CLIP image tower @ 384x384
"""
model_kwargs = dict(
model_args = dict(
patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_base_patch32_clip_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch32_clip_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1588,10 +1588,10 @@ def vit_base_patch32_clip_384(pretrained=False, **kwargs):
def vit_base_patch32_clip_448(pretrained=False, **kwargs):
""" ViT-B/32 CLIP image tower @ 448x448
"""
model_kwargs = dict(
model_args = dict(
patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_base_patch32_clip_448', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch32_clip_448', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1599,9 +1599,9 @@ def vit_base_patch32_clip_448(pretrained=False, **kwargs):
def vit_base_patch16_clip_224(pretrained=False, **kwargs):
""" ViT-B/16 CLIP image tower
"""
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_base_patch16_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch16_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1609,9 +1609,9 @@ def vit_base_patch16_clip_224(pretrained=False, **kwargs):
def vit_base_patch16_clip_384(pretrained=False, **kwargs):
""" ViT-B/16 CLIP image tower @ 384x384
"""
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_base_patch16_clip_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch16_clip_384', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1619,9 +1619,9 @@ def vit_base_patch16_clip_384(pretrained=False, **kwargs):
def vit_large_patch14_clip_224(pretrained=False, **kwargs):
""" ViT-Large model (ViT-L/14) CLIP image tower
"""
model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
model_args = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_large_patch14_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_large_patch14_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1629,9 +1629,9 @@ def vit_large_patch14_clip_224(pretrained=False, **kwargs):
def vit_large_patch14_clip_336(pretrained=False, **kwargs):
""" ViT-Large model (ViT-L/14) CLIP image tower @ 336x336
"""
model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
model_args = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_large_patch14_clip_336', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_large_patch14_clip_336', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1639,9 +1639,9 @@ def vit_large_patch14_clip_336(pretrained=False, **kwargs):
def vit_huge_patch14_clip_224(pretrained=False, **kwargs):
""" ViT-Huge model (ViT-H/14) CLIP image tower.
"""
model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
model_args = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_huge_patch14_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_huge_patch14_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1649,9 +1649,9 @@ def vit_huge_patch14_clip_224(pretrained=False, **kwargs):
def vit_huge_patch14_clip_336(pretrained=False, **kwargs):
""" ViT-Huge model (ViT-H/14) CLIP image tower @ 336x336
"""
model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
model_args = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_huge_patch14_clip_336', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_huge_patch14_clip_336', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1660,10 +1660,10 @@ def vit_giant_patch14_clip_224(pretrained=False, **kwargs):
""" ViT-Giant (little-g) model (ViT-g/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
Pretrained weights from CLIP image tower.
"""
model_kwargs = dict(
model_args = dict(
patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_giant_patch14_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_giant_patch14_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1672,10 +1672,10 @@ def vit_gigantic_patch14_clip_224(pretrained=False, **kwargs):
""" ViT-bigG model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
Pretrained weights from CLIP image tower.
"""
model_kwargs = dict(
model_args = dict(
patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
model = _create_vision_transformer(
'vit_gigantic_patch14_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_gigantic_patch14_clip_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
# Experimental models below
@ -1684,9 +1684,9 @@ def vit_gigantic_patch14_clip_224(pretrained=False, **kwargs):
def vit_base_patch32_plus_256(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/32+)
"""
model_kwargs = dict(patch_size=32, embed_dim=896, depth=12, num_heads=14, init_values=1e-5)
model_args = dict(patch_size=32, embed_dim=896, depth=12, num_heads=14, init_values=1e-5)
model = _create_vision_transformer(
'vit_base_patch32_plus_256', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch32_plus_256', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1694,9 +1694,9 @@ def vit_base_patch32_plus_256(pretrained=False, **kwargs):
def vit_base_patch16_plus_240(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/16+)
"""
model_kwargs = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, init_values=1e-5)
model_args = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, init_values=1e-5)
model = _create_vision_transformer(
'vit_base_patch16_plus_240', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch16_plus_240', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1704,11 +1704,11 @@ def vit_base_patch16_plus_240(pretrained=False, **kwargs):
def vit_base_patch16_rpn_224(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/16) w/ residual post-norm
"""
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, init_values=1e-5,
class_token=False, block_fn=ResPostBlock, global_pool='avg')
model = _create_vision_transformer(
'vit_base_patch16_rpn_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch16_rpn_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1718,9 +1718,9 @@ def vit_small_patch16_36x1_224(pretrained=False, **kwargs):
Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
Paper focuses on 24x2 + 48x1 for 'Small' width but those are extremely slow.
"""
model_kwargs = dict(patch_size=16, embed_dim=384, depth=36, num_heads=6, init_values=1e-5)
model_args = dict(patch_size=16, embed_dim=384, depth=36, num_heads=6, init_values=1e-5)
model = _create_vision_transformer(
'vit_small_patch16_36x1_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_small_patch16_36x1_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1730,10 +1730,10 @@ def vit_small_patch16_18x2_224(pretrained=False, **kwargs):
Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
Paper focuses on 24x2 + 48x1 for 'Small' width but those are extremely slow.
"""
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=384, depth=18, num_heads=6, init_values=1e-5, block_fn=ParallelThingsBlock)
model = _create_vision_transformer(
'vit_small_patch16_18x2_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_small_patch16_18x2_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1742,27 +1742,27 @@ def vit_base_patch16_18x2_224(pretrained=False, **kwargs):
""" ViT-Base w/ LayerScale + 18 x 2 (36 block parallel) config. Experimental, may remove.
Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
"""
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=768, depth=18, num_heads=12, init_values=1e-5, block_fn=ParallelThingsBlock)
model = _create_vision_transformer(
'vit_base_patch16_18x2_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch16_18x2_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva_large_patch14_196(pretrained=False, **kwargs):
""" EVA-large model https://arxiv.org/abs/2211.07636 via MAE MIM pretrain

`pretrained` is passed to `_create_vision_transformer`. Caller `**kwargs` are merged over the
defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped, so removed (model_kwargs) and added (model_args)
# lines appear interleaved below — confirm.
model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, global_pool='avg')
model_args = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, global_pool='avg')
model = _create_vision_transformer(
'eva_large_patch14_196', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'eva_large_patch14_196', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@register_model
def eva_large_patch14_336(pretrained=False, **kwargs):
""" EVA-large model https://arxiv.org/abs/2211.07636 via MAE MIM pretrain

`pretrained` is passed to `_create_vision_transformer`. Caller `**kwargs` are merged over the
defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped, so the removed pair (model_kwargs) and the added
# pair (model_args) of lines both appear below — confirm.
model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, global_pool='avg')
model = _create_vision_transformer('eva_large_patch14_336', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, global_pool='avg')
model = _create_vision_transformer('eva_large_patch14_336', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1770,8 +1770,8 @@ def eva_large_patch14_336(pretrained=False, **kwargs):
def flexivit_small(pretrained=False, **kwargs):
""" FlexiViT-Small

`pretrained` is passed to `_create_vision_transformer`. Caller `**kwargs` are merged over the
defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped, so the removed pair (model_kwargs) and the added
# pair (model_args) of lines both appear below — confirm.
model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True)
model = _create_vision_transformer('flexivit_small', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True)
model = _create_vision_transformer('flexivit_small', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1779,8 +1779,8 @@ def flexivit_small(pretrained=False, **kwargs):
def flexivit_base(pretrained=False, **kwargs):
""" FlexiViT-Base

`pretrained` is passed to `_create_vision_transformer`. Caller `**kwargs` are merged over the
defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped, so the removed pair (model_kwargs) and the added
# pair (model_args) of lines both appear below — confirm.
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True)
model = _create_vision_transformer('flexivit_base', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True)
model = _create_vision_transformer('flexivit_base', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1788,8 +1788,8 @@ def flexivit_base(pretrained=False, **kwargs):
def flexivit_large(pretrained=False, **kwargs):
""" FlexiViT-Large

`pretrained` is passed to `_create_vision_transformer`. Caller `**kwargs` are merged over the
defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped, so the removed pair (model_kwargs) and the added
# pair (model_args) of lines both appear below — confirm.
model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True)
model = _create_vision_transformer('flexivit_large', pretrained=pretrained, **dict(model_kwargs, **kwargs))
model_args = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True)
model = _create_vision_transformer('flexivit_large', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1797,12 +1797,12 @@ def flexivit_large(pretrained=False, **kwargs):
def vit_base_patch16_xp_224(pretrained=False, **kwargs):
""" ViT-Base model (ViT-B/16) w/ parallel blocks and qk norm enabled.

`pretrained` is passed to `_create_vision_transformer`. Caller `**kwargs` are merged over the
defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped, so removed (model_kwargs) and added (model_args)
# lines appear interleaved below — confirm.
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, pre_norm=True, no_embed_class=True,
norm_layer=RmsNorm, block_fn=ParallelScalingBlock, qkv_bias=False, qk_norm=True,
)
model = _create_vision_transformer(
'vit_base_patch16_xp_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_base_patch16_xp_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1810,12 +1810,12 @@ def vit_base_patch16_xp_224(pretrained=False, **kwargs):
def vit_large_patch14_xp_224(pretrained=False, **kwargs):
""" ViT-Large model (ViT-L/14) w/ parallel blocks and qk norm enabled.

`pretrained` is passed to `_create_vision_transformer`. Caller `**kwargs` are merged over the
defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped, so removed (model_kwargs) and added (model_args)
# lines appear interleaved below — confirm.
model_kwargs = dict(
model_args = dict(
patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, no_embed_class=True,
norm_layer=RmsNorm, block_fn=ParallelScalingBlock, qkv_bias=False, qk_norm=True,
)
model = _create_vision_transformer(
'vit_large_patch14_xp_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_large_patch14_xp_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -1823,12 +1823,12 @@ def vit_large_patch14_xp_224(pretrained=False, **kwargs):
def vit_huge_patch14_xp_224(pretrained=False, **kwargs):
""" ViT-Huge model (ViT-H/14) w/ parallel blocks and qk norm enabled.

`pretrained` is passed to `_create_vision_transformer`. Caller `**kwargs` are merged over the
defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped, so removed (model_kwargs) and added (model_args)
# lines appear interleaved below — confirm.
model_kwargs = dict(
model_args = dict(
patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, no_embed_class=True,
norm_layer=RmsNorm, block_fn=ParallelScalingBlock, qkv_bias=False, qk_norm=True,
)
model = _create_vision_transformer(
'vit_huge_patch14_xp_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
'vit_huge_patch14_xp_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model

View File

@ -181,9 +181,9 @@ def vit_tiny_r_s16_p8_224(pretrained=False, **kwargs):
""" R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 224 x 224.
"""
backbone = _resnetv2(layers=(), **kwargs)
model_kwargs = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3, **kwargs)
model_args = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3)
model = _create_vision_transformer_hybrid(
'vit_tiny_r_s16_p8_224', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_tiny_r_s16_p8_224', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -192,9 +192,9 @@ def vit_tiny_r_s16_p8_384(pretrained=False, **kwargs):
""" R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 384 x 384.
"""
backbone = _resnetv2(layers=(), **kwargs)
model_kwargs = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3, **kwargs)
model_args = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3)
model = _create_vision_transformer_hybrid(
'vit_tiny_r_s16_p8_384', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_tiny_r_s16_p8_384', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -203,9 +203,9 @@ def vit_small_r26_s32_224(pretrained=False, **kwargs):
""" R26+ViT-S/S32 hybrid.
"""
backbone = _resnetv2((2, 2, 2, 2), **kwargs)
model_kwargs = dict(embed_dim=384, depth=12, num_heads=6, **kwargs)
model_args = dict(embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer_hybrid(
'vit_small_r26_s32_224', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_small_r26_s32_224', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -214,9 +214,9 @@ def vit_small_r26_s32_384(pretrained=False, **kwargs):
""" R26+ViT-S/S32 hybrid.
"""
backbone = _resnetv2((2, 2, 2, 2), **kwargs)
model_kwargs = dict(embed_dim=384, depth=12, num_heads=6, **kwargs)
model_args = dict(embed_dim=384, depth=12, num_heads=6)
model = _create_vision_transformer_hybrid(
'vit_small_r26_s32_384', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_small_r26_s32_384', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -225,9 +225,9 @@ def vit_base_r26_s32_224(pretrained=False, **kwargs):
""" R26+ViT-B/S32 hybrid.
"""
backbone = _resnetv2((2, 2, 2, 2), **kwargs)
model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
model_args = dict(embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer_hybrid(
'vit_base_r26_s32_224', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_base_r26_s32_224', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -236,9 +236,9 @@ def vit_base_r50_s16_224(pretrained=False, **kwargs):
""" R50+ViT-B/S16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
"""
backbone = _resnetv2((3, 4, 9), **kwargs)
model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
model_args = dict(embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer_hybrid(
'vit_base_r50_s16_224', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_base_r50_s16_224', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -248,9 +248,9 @@ def vit_base_r50_s16_384(pretrained=False, **kwargs):
ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
"""
backbone = _resnetv2((3, 4, 9), **kwargs)
model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
model_args = dict(embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer_hybrid(
'vit_base_r50_s16_384', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_base_r50_s16_384', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -259,9 +259,9 @@ def vit_large_r50_s32_224(pretrained=False, **kwargs):
""" R50+ViT-L/S32 hybrid.
"""
backbone = _resnetv2((3, 4, 6, 3), **kwargs)
model_kwargs = dict(embed_dim=1024, depth=24, num_heads=16, **kwargs)
model_args = dict(embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer_hybrid(
'vit_large_r50_s32_224', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_large_r50_s32_224', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -270,9 +270,9 @@ def vit_large_r50_s32_384(pretrained=False, **kwargs):
""" R50+ViT-L/S32 hybrid.
"""
backbone = _resnetv2((3, 4, 6, 3), **kwargs)
model_kwargs = dict(embed_dim=1024, depth=24, num_heads=16, **kwargs)
model_args = dict(embed_dim=1024, depth=24, num_heads=16)
model = _create_vision_transformer_hybrid(
'vit_large_r50_s32_384', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_large_r50_s32_384', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -281,9 +281,9 @@ def vit_small_resnet26d_224(pretrained=False, **kwargs):
""" Custom ViT small hybrid w/ ResNet26D stride 32. No pretrained weights.
"""
backbone = resnet26d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[4])
model_kwargs = dict(embed_dim=768, depth=8, num_heads=8, mlp_ratio=3, **kwargs)
model_args = dict(embed_dim=768, depth=8, num_heads=8, mlp_ratio=3)
model = _create_vision_transformer_hybrid(
'vit_small_resnet26d_224', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_small_resnet26d_224', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -292,9 +292,9 @@ def vit_small_resnet50d_s16_224(pretrained=False, **kwargs):
""" Custom ViT small hybrid w/ ResNet50D 3-stages, stride 16. No pretrained weights.
"""
backbone = resnet50d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[3])
model_kwargs = dict(embed_dim=768, depth=8, num_heads=8, mlp_ratio=3, **kwargs)
model_args = dict(embed_dim=768, depth=8, num_heads=8, mlp_ratio=3)
model = _create_vision_transformer_hybrid(
'vit_small_resnet50d_s16_224', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_small_resnet50d_s16_224', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -303,9 +303,9 @@ def vit_base_resnet26d_224(pretrained=False, **kwargs):
""" Custom ViT base hybrid w/ ResNet26D stride 32. No pretrained weights.
"""
backbone = resnet26d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[4])
model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
model_args = dict(embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer_hybrid(
'vit_base_resnet26d_224', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_base_resnet26d_224', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -314,9 +314,9 @@ def vit_base_resnet50d_224(pretrained=False, **kwargs):
""" Custom ViT base hybrid w/ ResNet50D stride 32. No pretrained weights.
"""
backbone = resnet50d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[4])
model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs)
model_args = dict(embed_dim=768, depth=12, num_heads=12)
model = _create_vision_transformer_hybrid(
'vit_base_resnet50d_224', backbone=backbone, pretrained=pretrained, **model_kwargs)
'vit_base_resnet50d_224', backbone=backbone, pretrained=pretrained, **dict(model_args, **kwargs))
return model

View File

@ -451,10 +451,9 @@ default_cfgs = generate_default_cfgs({
def vit_relpos_base_patch32_plus_rpn_256(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/32+) w/ relative log-coord position and residual post-norm, no class token

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
patch_size=32, embed_dim=896, depth=12, num_heads=14, block_fn=ResPostRelPosBlock, **kwargs)
model_args = dict(patch_size=32, embed_dim=896, depth=12, num_heads=14, block_fn=ResPostRelPosBlock)
model = _create_vision_transformer_relpos(
'vit_relpos_base_patch32_plus_rpn_256', pretrained=pretrained, **model_kwargs)
'vit_relpos_base_patch32_plus_rpn_256', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -462,8 +461,9 @@ def vit_relpos_base_patch32_plus_rpn_256(pretrained=False, **kwargs):
def vit_relpos_base_patch16_plus_240(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/16+) w/ relative log-coord position, no class token

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, **kwargs)
model = _create_vision_transformer_relpos('vit_relpos_base_patch16_plus_240', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14)
model = _create_vision_transformer_relpos(
'vit_relpos_base_patch16_plus_240', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -471,9 +471,9 @@ def vit_relpos_base_patch16_plus_240(pretrained=False, **kwargs):
def vit_relpos_small_patch16_224(pretrained=False, **kwargs):
""" ViT-Small (ViT-S/16) w/ relative log-coord position, no class token

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, fc_norm=True, **kwargs)
model = _create_vision_transformer_relpos('vit_relpos_small_patch16_224', pretrained=pretrained, **model_kwargs)
model_args = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, fc_norm=True)
model = _create_vision_transformer_relpos(
'vit_relpos_small_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -481,9 +481,10 @@ def vit_relpos_small_patch16_224(pretrained=False, **kwargs):
def vit_relpos_medium_patch16_224(pretrained=False, **kwargs):
""" ViT-Medium (ViT-M/16) w/ relative log-coord position, no class token

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, fc_norm=True, **kwargs)
model = _create_vision_transformer_relpos('vit_relpos_medium_patch16_224', pretrained=pretrained, **model_kwargs)
model_args = dict(
patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, fc_norm=True)
model = _create_vision_transformer_relpos(
'vit_relpos_medium_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -491,9 +492,10 @@ def vit_relpos_medium_patch16_224(pretrained=False, **kwargs):
def vit_relpos_base_patch16_224(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/16) w/ relative log-coord position, no class token

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, fc_norm=True, **kwargs)
model = _create_vision_transformer_relpos('vit_relpos_base_patch16_224', pretrained=pretrained, **model_kwargs)
model_args = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, fc_norm=True)
model = _create_vision_transformer_relpos(
'vit_relpos_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -501,10 +503,11 @@ def vit_relpos_base_patch16_224(pretrained=False, **kwargs):
def vit_srelpos_small_patch16_224(pretrained=False, **kwargs):
""" ViT-Small (ViT-S/16) w/ shared relative log-coord position, no class token

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, fc_norm=False,
rel_pos_dim=384, shared_rel_pos=True, **kwargs)
model = _create_vision_transformer_relpos('vit_srelpos_small_patch16_224', pretrained=pretrained, **model_kwargs)
rel_pos_dim=384, shared_rel_pos=True)
model = _create_vision_transformer_relpos(
'vit_srelpos_small_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -512,11 +515,11 @@ def vit_srelpos_small_patch16_224(pretrained=False, **kwargs):
def vit_srelpos_medium_patch16_224(pretrained=False, **kwargs):
""" ViT-Medium (ViT-M/16) w/ shared relative log-coord position, no class token

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, fc_norm=False,
rel_pos_dim=512, shared_rel_pos=True, **kwargs)
rel_pos_dim=512, shared_rel_pos=True)
model = _create_vision_transformer_relpos(
'vit_srelpos_medium_patch16_224', pretrained=pretrained, **model_kwargs)
'vit_srelpos_medium_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -524,11 +527,11 @@ def vit_srelpos_medium_patch16_224(pretrained=False, **kwargs):
def vit_relpos_medium_patch16_cls_224(pretrained=False, **kwargs):
""" ViT-Medium (ViT-M/16) w/ relative log-coord position, class token present

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
model_args = dict(
patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, fc_norm=False,
rel_pos_dim=256, class_token=True, global_pool='token', **kwargs)
rel_pos_dim=256, class_token=True, global_pool='token')
model = _create_vision_transformer_relpos(
'vit_relpos_medium_patch16_cls_224', pretrained=pretrained, **model_kwargs)
'vit_relpos_medium_patch16_cls_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -536,10 +539,10 @@ def vit_relpos_medium_patch16_cls_224(pretrained=False, **kwargs):
def vit_relpos_base_patch16_cls_224(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/16) w/ relative log-coord position, class token present

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False,
class_token=True, global_pool='token', **kwargs)
model = _create_vision_transformer_relpos('vit_relpos_base_patch16_cls_224', pretrained=pretrained, **model_kwargs)
model_args = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, class_token=True, global_pool='token')
model = _create_vision_transformer_relpos(
'vit_relpos_base_patch16_cls_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -549,9 +552,10 @@ def vit_relpos_base_patch16_clsgap_224(pretrained=False, **kwargs):
NOTE this config is a bit of a mistake, class token was enabled but global avg-pool w/ fc-norm was not disabled
Leaving here for comparisons w/ a future re-train as it performs quite well.
"""
model_kwargs = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, fc_norm=True, class_token=True, **kwargs)
model = _create_vision_transformer_relpos('vit_relpos_base_patch16_clsgap_224', pretrained=pretrained, **model_kwargs)
model_args = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, fc_norm=True, class_token=True)
model = _create_vision_transformer_relpos(
'vit_relpos_base_patch16_clsgap_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -559,10 +563,10 @@ def vit_relpos_base_patch16_clsgap_224(pretrained=False, **kwargs):
def vit_relpos_small_patch16_rpn_224(pretrained=False, **kwargs):
""" ViT-Small (ViT-S/16) w/ relative log-coord position and residual post-norm, no class token

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, block_fn=ResPostRelPosBlock, **kwargs)
model_args = dict(
patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, block_fn=ResPostRelPosBlock)
model = _create_vision_transformer_relpos(
'vit_relpos_small_patch16_rpn_224', pretrained=pretrained, **model_kwargs)
'vit_relpos_small_patch16_rpn_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -570,10 +574,10 @@ def vit_relpos_small_patch16_rpn_224(pretrained=False, **kwargs):
def vit_relpos_medium_patch16_rpn_224(pretrained=False, **kwargs):
""" ViT-Medium (ViT-M/16) w/ relative log-coord position and residual post-norm, no class token

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, block_fn=ResPostRelPosBlock, **kwargs)
model_args = dict(
patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, block_fn=ResPostRelPosBlock)
model = _create_vision_transformer_relpos(
'vit_relpos_medium_patch16_rpn_224', pretrained=pretrained, **model_kwargs)
'vit_relpos_medium_patch16_rpn_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model
@ -581,8 +585,8 @@ def vit_relpos_medium_patch16_rpn_224(pretrained=False, **kwargs):
def vit_relpos_base_patch16_rpn_224(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/16) w/ relative log-coord position and residual post-norm, no class token

`pretrained` is passed to `_create_vision_transformer_relpos`. Caller `**kwargs` are merged over
the defaults with `dict(model_args, **kwargs)`, so a key given by the caller replaces the default.
"""
# NOTE(review): diff markers were stripped; model_kwargs lines look like the removed side and
# model_args lines the added side — confirm.
# The model_kwargs form spreads **kwargs inside the dict(...) call itself, where a key that
# repeats a default raises TypeError; dict(model_args, **kwargs) overrides the default instead.
model_kwargs = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, block_fn=ResPostRelPosBlock, **kwargs)
model_args = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, block_fn=ResPostRelPosBlock)
model = _create_vision_transformer_relpos(
'vit_relpos_base_patch16_rpn_224', pretrained=pretrained, **model_kwargs)
'vit_relpos_base_patch16_rpn_224', pretrained=pretrained, **dict(model_args, **kwargs))
return model