def vit_huge(patch_size=16, num_register_tokens=0, **kwargs):
    """
    DinoVisionTransformer with embed-dim 1280, depth 32 and 16 heads => embed-dim per head 80

    patch_size and num_register_tokens are passed through to the constructor unchanged,
    as are any extra keyword arguments.
    """
    # Every transformer block is a Block using the memory-efficient attention class.
    mem_eff_block = partial(Block, attn_class=MemEffAttention)
    return DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1280,
        depth=32,
        num_heads=16,
        mlp_ratio=4,
        block_fn=mem_eff_block,
        num_register_tokens=num_register_tokens,
        **kwargs,
    )