diff --git a/timm/models/convnext.py b/timm/models/convnext.py
index e682379f..a6d1999b 100644
--- a/timm/models/convnext.py
+++ b/timm/models/convnext.py
@@ -916,53 +916,43 @@ default_cfgs = generate_default_cfgs({
     # CLIP original image tower weights
     'convnext_base.clip_laion2b': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laion2b_augreg': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laiona': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laiona_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640),
     'convnext_base.clip_laiona_augreg_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640),
     'convnext_large_mlp.clip_laion2b_augreg': _cfg(
-        hf_hub_id='laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=768),
     'convnext_large_mlp.clip_laion2b_ft_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768),
     'convnext_large_mlp.clip_laion2b_ft_soup_320': _cfg(
-        hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768),
     'convnext_xxlarge.clip_laion2b_soup': _cfg(
-        hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
     'convnext_xxlarge.clip_laion2b_rewind': _cfg(
-        hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index b3b0ddca..63526c93 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -1556,9 +1556,6 @@ default_cfgs = {
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',  # FIXME weight exists, need to push
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
     'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
@@ -1569,9 +1566,6 @@ default_cfgs = {
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
-    'vit_base_patch32_clip_224.openai_ft_in12k': _cfg(
-        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',  # FIXME weight exists, need to push
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
     'vit_base_patch16_clip_224.openai_ft_in12k': _cfg(
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
@@ -1580,28 +1574,22 @@ default_cfgs = {
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
     'vit_base_patch32_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_base_patch16_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
     'vit_giant_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
     'vit_gigantic_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
     'vit_base_patch32_clip_224.laion400m_e32': _cfg(
@@ -1620,21 +1608,17 @@ default_cfgs = {
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_base_patch32_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch32_clip_256.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 256, 256), num_classes=512),
     'vit_base_patch16_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-16-DataComp.XL-s13B-b90K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_base_patch16_clip_224.dfn2b': _cfg(
@@ -1659,42 +1643,46 @@ default_cfgs = {
         crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),
     'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_huge_patch14_clip_224.metaclip_altogether': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
     'vit_gigantic_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
     'vit_base_patch32_clip_224.metaclip_400m': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_clip_224.metaclip_400m': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.metaclip_400m': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),