From c5cf0e00497943b967ce3df294c417e2b1a1d668 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Sat, 18 Jan 2025 14:02:04 -0800 Subject: [PATCH] Add the 256x256 in1k ft of the so150m, add an alternate so150m def --- timm/models/vision_transformer.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index 2672e1ad..8261aa00 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -2152,15 +2152,20 @@ default_cfgs = { 'vit_base_patch16_reg4_gap_256.untrained': _cfg( input_size=(3, 256, 256)), - 'vit_so150m_patch16_reg4_gap_384.sbb_e250_in12k_ft_in1k': _cfg( + 'vit_so150m_patch16_reg4_gap_256.sbb_e250_in12k_ft_in1k': _cfg( hf_hub_id='timm/', - input_size=(3, 384, 384), crop_pct=1.0), + input_size=(3, 256, 256), crop_pct=0.95), 'vit_so150m_patch16_reg4_gap_256.sbb_e250_in12k': _cfg( hf_hub_id='timm/', num_classes=11821, input_size=(3, 256, 256), crop_pct=0.95), + 'vit_so150m_patch16_reg4_gap_384.sbb_e250_in12k_ft_in1k': _cfg( + hf_hub_id='timm/', + input_size=(3, 384, 384), crop_pct=1.0), 'vit_so150m_patch16_reg4_map_256.untrained': _cfg( input_size=(3, 256, 256)), + 'vit_so150m2_patch16_reg1_gap_256.untrained': _cfg( + input_size=(3, 256, 256), crop_pct=0.95), 'vit_intern300m_patch14_448.ogvl_dist': _cfg( hf_hub_id='timm/', @@ -3467,6 +3472,7 @@ def vit_base_patch16_reg4_gap_256(pretrained: bool = False, **kwargs) -> VisionT @register_model def vit_so150m_patch16_reg4_map_256(pretrained: bool = False, **kwargs) -> VisionTransformer: + """ SO150M (shape optimized, but diff than paper def, optimized for GPU) """ model_args = dict( patch_size=16, embed_dim=896, depth=18, num_heads=14, mlp_ratio=2.572, class_token=False, reg_tokens=4, global_pool='map', @@ -3478,6 +3484,7 @@ def vit_so150m_patch16_reg4_map_256(pretrained: bool = False, **kwargs) -> Visio @register_model def vit_so150m_patch16_reg4_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer: + """ SO150M (shape optimized, but diff than paper def, optimized for GPU) """ model_args = dict( patch_size=16, embed_dim=896, depth=18, num_heads=14, mlp_ratio=2.572, class_token=False, reg_tokens=4, global_pool='avg', fc_norm=False, @@ -3489,6 +3496,7 @@ def vit_so150m_patch16_reg4_gap_256(pretrained: bool = False, **kwargs) -> Visio @register_model def vit_so150m_patch16_reg4_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer: + """ SO150M (shape optimized, but diff than paper def, optimized for GPU) """ model_args = dict( patch_size=16, embed_dim=896, depth=18, num_heads=14, mlp_ratio=2.572, class_token=False, reg_tokens=4, global_pool='avg', fc_norm=False, @@ -3498,6 +3506,18 @@ def vit_so150m_patch16_reg4_gap_384(pretrained: bool = False, **kwargs) -> Visio return model +@register_model +def vit_so150m2_patch16_reg1_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer: + """ SO150M v2 (shape optimized, but diff than paper def, optimized for GPU) """ + model_args = dict( + patch_size=16, embed_dim=896, depth=20, num_heads=14, mlp_ratio=2.429, init_values=1e-5, + qkv_bias=False, class_token=False, reg_tokens=1, global_pool='avg', + ) + model = _create_vision_transformer( + 'vit_so150m2_patch16_reg1_gap_256', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + @register_model def vit_intern300m_patch14_448(pretrained: bool = False, **kwargs) -> VisionTransformer: model_args = dict(