From 7234f5c6c55ba4e538ad339dcb804ebd72707511 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Mon, 17 Feb 2025 12:59:10 -0800 Subject: [PATCH] Add 448 so150m2 weight/model, add updated internvit 300m weight --- timm/models/_hub.py | 2 +- timm/models/vision_transformer.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/timm/models/_hub.py b/timm/models/_hub.py index 809705ed..1da0942b 100644 --- a/timm/models/_hub.py +++ b/timm/models/_hub.py @@ -395,7 +395,7 @@ def push_to_hf_hub( def generate_readme(model_card: dict, model_name: str): - tags = model_card.get('tags', None) or ['image-classification', 'timm'] + tags = model_card.get('tags', None) or ['image-classification', 'timm', 'transformers'] readme_text = "---\n" if tags: readme_text += "tags:\n" diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index 6dbd758e..bba7afac 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -2174,12 +2174,20 @@ default_cfgs = { 'vit_so150m2_patch16_reg1_gap_384.sbb_e200_in12k_ft_in1k': _cfg( hf_hub_id='timm/', input_size=(3, 384, 384), crop_pct=1.0), + 'vit_so150m2_patch16_reg1_gap_448.sbb_e200_in12k_ft_in1k': _cfg( + hf_hub_id='timm/', + input_size=(3, 448, 448), crop_pct=1.0, crop_mode='squash'), 'vit_intern300m_patch14_448.ogvl_dist': _cfg( hf_hub_id='timm/', mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, input_size=(3, 448, 448), crop_pct=1.0, num_classes=0, ), + 'vit_intern300m_patch14_448.ogvl_2pt5': _cfg( + hf_hub_id='timm/', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, + input_size=(3, 448, 448), crop_pct=1.0, num_classes=0, + ), 'aimv2_large_patch14_224.apple_pt': _cfg( hf_hub_id='timm/', @@ -3538,6 +3546,18 @@ def vit_so150m2_patch16_reg1_gap_384(pretrained: bool = False, **kwargs) -> Visi return model +@register_model +def vit_so150m2_patch16_reg1_gap_448(pretrained: bool = False, **kwargs) -> VisionTransformer: + """ SO150M v2 (shape optimized, but diff than paper def, optimized for GPU) """ + model_args = dict( + patch_size=16, embed_dim=832, depth=21, num_heads=13, mlp_ratio=34/13, init_values=1e-5, + qkv_bias=False, class_token=False, reg_tokens=1, global_pool='avg', + ) + model = _create_vision_transformer( + 'vit_so150m2_patch16_reg1_gap_448', pretrained=pretrained, **dict(model_args, **kwargs)) + return model + + @register_model def vit_intern300m_patch14_448(pretrained: bool = False, **kwargs) -> VisionTransformer: model_args = dict(