From 1b5cae681cee88fab71f0d31eb4115c514b61039 Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Wed, 23 Oct 2024 15:26:03 -0700
Subject: [PATCH] Update some clip pretrained weights to point to new hub locations, add a few missing weights

---
 timm/models/byobnet.py            |  39 +++++++++++
 timm/models/vision_transformer.py | 109 ++++++++++++++++++++++--------
 2 files changed, 120 insertions(+), 28 deletions(-)

diff --git a/timm/models/byobnet.py b/timm/models/byobnet.py
index e999dd7b..a368a4b0 100644
--- a/timm/models/byobnet.py
+++ b/timm/models/byobnet.py
@@ -2315,6 +2315,27 @@ default_cfgs = generate_default_cfgs({
         fixed_input_size=True, input_size=(3, 448, 448), pool_size=(14, 14),
         classifier='head.proj',
     ),
+    'resnet50_clip.cc12m': _cfgr(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
+        classifier='head.proj',
+    ),
+    'resnet50_clip.yfcc15m': _cfgr(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
+        classifier='head.proj',
+    ),
+    'resnet101_clip.yfcc15m': _cfgr(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
+        classifier='head.proj',
+    ),
 
     # avg-pool w/ optional standard classifier head variants
     'resnet50_clip_gap.openai': _cfgr(
@@ -2347,6 +2368,24 @@ default_cfgs = generate_default_cfgs({
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 448, 448), pool_size=(14, 14),
     ),
+    'resnet50_clip_gap.cc12m': _cfgr(
+        hf_hub_id='timm/resnet50_clip.cc12m',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 224, 224), pool_size=(7, 7),
+    ),
+    'resnet50_clip_gap.yfcc15m': _cfgr(
+        hf_hub_id='timm/resnet50_clip.yfcc15m',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 224, 224), pool_size=(7, 7),
+    ),
+    'resnet101_clip_gap.yfcc15m': _cfgr(
+        hf_hub_id='timm/resnet101_clip.yfcc15m',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 224, 224), pool_size=(7, 7),
+    ),
 
     'resnet50_mlp.untrained': _cfgr(
         input_size=(3, 256, 256), pool_size=(8, 8),
diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index ae2ae3b8..ba5d1958 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -23,6 +23,7 @@ Acknowledgments:
 
 Hacked together by / Copyright 2020, Ross Wightman
 """
+import copy
 import logging
 import math
 from collections import OrderedDict
@@ -1601,6 +1602,21 @@ default_cfgs = {
         hf_hub_filename='open_clip_pytorch_model.bin',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, num_classes=1280),
+    'vit_base_patch32_clip_224.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_base_patch16_plus_clip_240.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 240, 240), crop_pct=1.0, num_classes=512),
+    'vit_large_patch14_clip_224.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+
     'vit_base_patch32_clip_224.datacompxl': _cfg(
         hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
         hf_hub_filename='open_clip_pytorch_model.bin',
@@ -1641,44 +1657,68 @@ default_cfgs = {
         crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),
 
     'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-b32-fullcc2.5b',
-        hf_hub_filename='metaclip_b32_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-b16-fullcc2.5b',
-        hf_hub_filename='metaclip_b16_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-l14-fullcc2.5b',
-        hf_hub_filename='metaclip_l14_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-h14-fullcc2.5b',
-        hf_hub_filename='metaclip_h14_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_gigantic_patch14_clip_224.metaclip_2pt5b': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
+    'vit_base_patch32_clip_224.metaclip_400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_base_patch16_clip_224.metaclip_400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_large_patch14_clip_224.metaclip_400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
 
     'vit_base_patch32_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_base_patch32_clip_224.openai',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_base_patch16_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_base_patch16_clip_224.openai',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_large_patch14_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_large_patch14_clip_224.openai',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_large_patch14_clip_336.openai': _cfg(
-        hf_hub_id='timm/vit_large_patch14_clip_336.openai', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0,
         input_size=(3, 336, 336), num_classes=768),
@@ -2071,22 +2111,13 @@ default_cfgs = {
         input_size=(3, 160, 160), crop_pct=0.95),
 }
 
-_quick_gelu_cfgs = [
-    'vit_large_patch14_clip_224.dfn2b',
-    'vit_huge_patch14_clip_224.dfn5b',
-    'vit_huge_patch14_clip_378.dfn5b',
-    'vit_base_patch32_clip_224.metaclip_2pt5b',
-    'vit_base_patch16_clip_224.metaclip_2pt5b',
-    'vit_large_patch14_clip_224.metaclip_2pt5b',
-    'vit_huge_patch14_clip_224.metaclip_2pt5b',
-    'vit_base_patch32_clip_224.openai',
-    'vit_base_patch16_clip_224.openai',
-    'vit_large_patch14_clip_224.openai',
-    'vit_large_patch14_clip_336.openai',
-]
-default_cfgs.update({
-    n.replace('_clip_', '_clip_quickgelu_'): default_cfgs[n] for n in _quick_gelu_cfgs
-})
+_quick_gelu_cfgs = [n for n, c in default_cfgs.items() if c.get('notes', ()) and 'quickgelu' in c['notes'][0]]
+for n in _quick_gelu_cfgs:
+    # generate quickgelu default cfgs based on contents of notes field
+    c = copy.deepcopy(default_cfgs[n])
+    if c['hf_hub_id'] == 'timm/':
+        c['hf_hub_id'] = 'timm/' + n  # need to use non-quickgelu model name for hub id
+    default_cfgs[n.replace('_clip_', '_clip_quickgelu_')] = c
 
 default_cfgs = generate_default_cfgs(default_cfgs)
 
@@ -2510,6 +2541,16 @@ def vit_base_patch16_clip_384(pretrained: bool = False, **kwargs) -> VisionTrans
     return model
 
 
+@register_model
+def vit_base_patch16_plus_clip_240(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ ViT-Base (ViT-B/16+) CLIP image tower @ 240x240
+    """
+    model_args = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_base_patch16_plus_clip_240', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_large_patch14_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ ViT-Large model (ViT-L/14) CLIP image tower
@@ -2656,6 +2697,18 @@ def vit_huge_patch14_clip_quickgelu_378(pretrained: bool = False, **kwargs) -> V
     return model
 
 
+@register_model
+def vit_gigantic_patch14_clip_quickgelu_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ ViT-bigG model (ViT-G/14) w/ QuickGELU act
+    """
+    model_args = dict(
+        patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, pre_norm=True,
+        norm_layer=nn.LayerNorm, act_layer='quick_gelu')
+    model = _create_vision_transformer(
+        'vit_gigantic_patch14_clip_quickgelu_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 # Experimental models below
 
 @register_model
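
Usage sketch (reviewer note, not part of the patch): a minimal smoke test of one of the pretrained tags added above, assuming the referenced timm/ Hugging Face Hub repositories are published and this branch of timm is installed. The model/tag name and the 512-dim output come from the configs in the diff; everything else is illustrative.

    import timm
    import torch

    # One of the tags added by this patch; its pretrained cfg declares
    # input_size=(3, 240, 240) and num_classes=512 (the CLIP projection width).
    model = timm.create_model('vit_base_patch16_plus_clip_240.laion400m_e32', pretrained=True)
    model.eval()

    # Run a dummy image at the cfg's native resolution.
    x = torch.randn(1, *model.pretrained_cfg['input_size'])
    with torch.no_grad():
        out = model(x)
    print(out.shape)  # expected torch.Size([1, 512]) -- CLIP image embedding, not ImageNet logits

For the QuickGELU variants generated from the notes field, the loop rewrites hf_hub_id to the non-quickgelu name, so a call such as timm.create_model('vit_base_patch16_clip_quickgelu_224.metaclip_400m', pretrained=True) should resolve its weights from timm/vit_base_patch16_clip_224.metaclip_400m.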