Update some clip pretrained weights to point to new hub locations, add a few missing weights
parent 310ffa32c5
commit eb2964b178
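The configs touched below are all consumed through timm's normal factory path. As a minimal usage sketch (model name and sizes are taken from the diff below; that the weights resolve from the `timm/` hub organisation is an assumption based on the new `hf_hub_id` values), one of the newly added ResNet CLIP image towers can be loaded like this:

import torch
import timm

# Sketch: load one of the CLIP image towers added in this commit.
# pretrained=True assumes the 'resnet50_clip.cc12m' weights are published under the timm/ hub org,
# per the hf_hub_id='timm/' entries in the hunk below.
model = timm.create_model('resnet50_clip.cc12m', pretrained=True)
model.eval()

x = torch.randn(1, 3, 224, 224)  # fixed_input_size=True, input_size=(3, 224, 224) in the cfg
with torch.no_grad():
    out = model(x)
print(out.shape)  # (1, 1024) expected: num_classes=1024 is the CLIP projection width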
@@ -2315,6 +2315,27 @@ default_cfgs = generate_default_cfgs({
         fixed_input_size=True, input_size=(3, 448, 448), pool_size=(14, 14),
         classifier='head.proj',
     ),
+    'resnet50_clip.cc12m': _cfgr(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
+        classifier='head.proj',
+    ),
+    'resnet50_clip.yfcc15m': _cfgr(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
+        classifier='head.proj',
+    ),
+    'resnet101_clip.yfcc15m': _cfgr(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
+        classifier='head.proj',
+    ),

     # avg-pool w/ optional standard classifier head variants
     'resnet50_clip_gap.openai': _cfgr(
@@ -2347,6 +2368,24 @@ default_cfgs = generate_default_cfgs({
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 448, 448), pool_size=(14, 14),
     ),
+    'resnet50_clip_gap.cc12m': _cfgr(
+        hf_hub_id='timm/resnet50_clip.cc12m',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 224, 224), pool_size=(7, 7),
+    ),
+    'resnet50_clip_gap.yfcc15m': _cfgr(
+        hf_hub_id='timm/resnet50_clip.yfcc15m',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 224, 224), pool_size=(7, 7),
+    ),
+    'resnet101_clip_gap.yfcc15m': _cfgr(
+        hf_hub_id='timm/resnet101_clip.yfcc15m',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 224, 224), pool_size=(7, 7),
+    ),

     'resnet50_mlp.untrained': _cfgr(
         input_size=(3, 256, 256), pool_size=(8, 8),
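The `*_clip_gap` entries above configure the same trunks with average pooling and `num_classes=0`, i.e. plain feature extractors that pull their weights from the corresponding non-gap hub repos. A hedged sketch (the 2048-dim output is an assumption based on the standard ResNet-50 channel progression):

import torch
import timm

# Sketch: the gap variant loads the resnet50_clip.cc12m weights (per hf_hub_id above) but returns pooled features.
model = timm.create_model('resnet50_clip_gap.cc12m', pretrained=True)
model.eval()

with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224))
print(feats.shape)  # (1, 2048) expected for a ResNet-50 trunk with global average pooling (assumption)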
@@ -23,6 +23,7 @@ Acknowledgments:

 Hacked together by / Copyright 2020, Ross Wightman
 """
+import copy
 import logging
 import math
 from collections import OrderedDict
@@ -1601,6 +1602,21 @@ default_cfgs = {
         hf_hub_filename='open_clip_pytorch_model.bin',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),

+    'vit_base_patch32_clip_224.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_base_patch16_plus_clip_240.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 240, 240), crop_pct=1.0, num_classes=512),
+    'vit_large_patch14_clip_224.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+
     'vit_base_patch32_clip_224.datacompxl': _cfg(
         hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
         hf_hub_filename='open_clip_pytorch_model.bin',
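The mean/std, crop_pct, and input_size fields in these _cfg entries are what timm's preprocessing helpers read back out. A small sketch using the standard data-config pattern (the model tag is one of the entries added above; pretrained=False avoids a weight download):

import timm
from timm.data import resolve_model_data_config, create_transform

model = timm.create_model('vit_base_patch16_clip_224.laion400m_e32', pretrained=False)
data_cfg = resolve_model_data_config(model)        # picks up input_size, mean/std, crop_pct from the default cfg
transform = create_transform(**data_cfg, is_training=False)
print(data_cfg['input_size'], data_cfg['mean'], data_cfg['crop_pct'])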
@@ -1641,44 +1657,68 @@ default_cfgs = {
         crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),

     'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-b32-fullcc2.5b',
-        hf_hub_filename='metaclip_b32_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-b16-fullcc2.5b',
-        hf_hub_filename='metaclip_b16_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-l14-fullcc2.5b',
-        hf_hub_filename='metaclip_l14_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-h14-fullcc2.5b',
-        hf_hub_filename='metaclip_h14_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_gigantic_patch14_clip_224.metaclip_2pt5b': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
+    'vit_base_patch32_clip_224.metaclip_400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_base_patch16_clip_224.metaclip_400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_large_patch14_clip_224.metaclip_400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),

     'vit_base_patch32_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_base_patch32_clip_224.openai',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_base_patch16_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_base_patch16_clip_224.openai',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_large_patch14_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_large_patch14_clip_224.openai',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_large_patch14_clip_336.openai': _cfg(
-        hf_hub_id='timm/vit_large_patch14_clip_336.openai', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 336, 336), num_classes=768),
@@ -2071,22 +2111,13 @@ default_cfgs = {
         input_size=(3, 160, 160), crop_pct=0.95),
 }

-_quick_gelu_cfgs = [
-    'vit_large_patch14_clip_224.dfn2b',
-    'vit_huge_patch14_clip_224.dfn5b',
-    'vit_huge_patch14_clip_378.dfn5b',
-    'vit_base_patch32_clip_224.metaclip_2pt5b',
-    'vit_base_patch16_clip_224.metaclip_2pt5b',
-    'vit_large_patch14_clip_224.metaclip_2pt5b',
-    'vit_huge_patch14_clip_224.metaclip_2pt5b',
-    'vit_base_patch32_clip_224.openai',
-    'vit_base_patch16_clip_224.openai',
-    'vit_large_patch14_clip_224.openai',
-    'vit_large_patch14_clip_336.openai',
-]
-default_cfgs.update({
-    n.replace('_clip_', '_clip_quickgelu_'): default_cfgs[n] for n in _quick_gelu_cfgs
-})
+_quick_gelu_cfgs = [n for n, c in default_cfgs.items() if c.get('notes', ()) and 'quickgelu' in c['notes'][0]]
+for n in _quick_gelu_cfgs:
+    # generate quickgelu default cfgs based on contents of notes field
+    c = copy.deepcopy(default_cfgs[n])
+    if c['hf_hub_id'] == 'timm/':
+        c['hf_hub_id'] = 'timm/' + n  # need to use non-quickgelu model name for hub id
+    default_cfgs[n.replace('_clip_', '_clip_quickgelu_')] = c
 default_cfgs = generate_default_cfgs(default_cfgs)
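The replacement above derives the quickgelu cfg names from the notes field instead of maintaining a hard-coded list. A standalone toy version of the same transformation (plain dicts, no timm imports) to illustrate what it produces:

import copy

# Toy stand-in for default_cfgs: only the fields the loop above inspects.
default_cfgs = {
    'vit_base_patch32_clip_224.openai': dict(
        hf_hub_id='timm/',
        notes=('natively QuickGELU, use quickgelu model variant for original results',)),
    'vit_base_patch16_clip_224.laion2b': dict(hf_hub_id='timm/', notes=()),
}

quick_gelu_cfgs = [n for n, c in default_cfgs.items() if c.get('notes', ()) and 'quickgelu' in c['notes'][0]]
for n in quick_gelu_cfgs:
    c = copy.deepcopy(default_cfgs[n])
    if c['hf_hub_id'] == 'timm/':
        c['hf_hub_id'] = 'timm/' + n  # quickgelu variant still downloads from the non-quickgelu hub repo
    default_cfgs[n.replace('_clip_', '_clip_quickgelu_')] = c

print(sorted(default_cfgs))
# ['vit_base_patch16_clip_224.laion2b',
#  'vit_base_patch32_clip_224.openai',
#  'vit_base_patch32_clip_quickgelu_224.openai']  <- derived; its hf_hub_id is 'timm/vit_base_patch32_clip_224.openai'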
@@ -2510,6 +2541,16 @@ def vit_base_patch16_clip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_base_patch16_plus_clip_240(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ ViT-Base (ViT-B/16+) CLIP image tower @ 240x240
+    """
+    model_args = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_base_patch16_plus_clip_240', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_large_patch14_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ ViT-Large model (ViT-L/14) CLIP image tower
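The new entrypoint above matches the OpenCLIP ViT-B-16-plus-240 image tower shape (embed_dim 896, 14 heads). A quick shape sanity check without downloading weights (the 896-dim feature output is inferred from the model_args above, not verified here):

import torch
import timm

# num_classes=0 returns pooled features; img_size=240 is passed explicitly to match the 240x240 input.
model = timm.create_model('vit_base_patch16_plus_clip_240', pretrained=False, num_classes=0, img_size=240)
model.eval()

with torch.no_grad():
    feats = model(torch.randn(1, 3, 240, 240))
print(feats.shape)  # expected (1, 896): embed_dim=896 per the model_args above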
@@ -2656,6 +2697,18 @@ def vit_huge_patch14_clip_quickgelu_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_gigantic_patch14_clip_quickgelu_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ ViT-bigG model (ViT-G/14) w/ QuickGELU act
+    """
+    model_args = dict(
+        patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, pre_norm=True,
+        norm_layer=nn.LayerNorm, act_layer='quick_gelu')
+    model = _create_vision_transformer(
+        'vit_gigantic_patch14_clip_quickgelu_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 # Experimental models below

 @register_model