From 6894ec7edcd66a137468f6ebf729850802b5c098 Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Thu, 2 Nov 2023 20:12:04 -0700
Subject: [PATCH] Forgot about datacomp b32 models

---
 timm/models/vision_transformer.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index a07b27e8..4a14afba 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -1435,11 +1435,11 @@ default_cfgs = generate_default_cfgs({
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
     'vit_base_patch32_clip_224.datacompxl': _cfg(
-        hf_hub_id='laion/',
+        hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
         hf_hub_filename='open_clip_pytorch_model.bin',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch32_clip_256.datacompxl': _cfg(
-        hf_hub_id='laion/',
+        hf_hub_id='laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K',
         hf_hub_filename='open_clip_pytorch_model.bin',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0,
         input_size=(3, 256, 256), num_classes=512),
@@ -1994,6 +1994,17 @@ def vit_base_patch32_clip_224(pretrained=False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_base_patch32_clip_256(pretrained=False, **kwargs) -> VisionTransformer:
+    """ ViT-B/32 CLIP image tower @ 256x256
+    """
+    model_args = dict(
+        patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_base_patch32_clip_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_base_patch32_clip_384(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-B/32 CLIP image tower @ 384x384
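
Usage sketch (not part of the patch): a minimal example of how the newly registered vit_base_patch32_clip_256 variant could be instantiated once a timm build containing this change is installed, assuming the laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K weights referenced above are reachable on the Hugging Face Hub. The '.datacompxl' tag selects the default_cfg entry added in this diff.

    # Sketch only; assumes timm with this patch and Hub access to the
    # laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K open_clip weights.
    import timm
    import torch

    # Create the ViT-B/32 CLIP image tower @ 256x256 with the datacompxl pretrained tag.
    model = timm.create_model('vit_base_patch32_clip_256.datacompxl', pretrained=True)
    model.eval()

    # Resolve preprocessing from the pretrained config above
    # (input_size=(3, 256, 256), OpenAI CLIP mean/std, crop_pct=1.0).
    data_cfg = timm.data.resolve_model_data_config(model)
    transform = timm.data.create_transform(**data_cfg, is_training=False)

    # Dummy forward pass through the image tower.
    with torch.no_grad():
        out = model(torch.randn(1, 3, 256, 256))
    print(out.shape)  # expected torch.Size([1, 512]), given num_classes=512 in the cfg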