Parametrizable attn_class in models/ViT
parent ace39d1b53
commit 235f50669e
@@ -337,49 +337,49 @@ def init_weights_vit_timm(module: nn.Module, name: str = ""):
         nn.init.zeros_(module.bias)


-def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
+def vit_small(patch_size=16, attn_class: nn.Module = MemEffAttention, num_register_tokens=0, **kwargs):
     model = DinoVisionTransformer(
         patch_size=patch_size,
         embed_dim=384,
         depth=12,
         num_heads=6,
         mlp_ratio=4,
-        block_fn=partial(Block, attn_class=MemEffAttention),
+        block_fn=partial(Block, attn_class=attn_class),
         num_register_tokens=num_register_tokens,
         **kwargs,
     )
     return model


-def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
+def vit_base(patch_size=16, attn_class: nn.Module = MemEffAttention, num_register_tokens=0, **kwargs):
     model = DinoVisionTransformer(
         patch_size=patch_size,
         embed_dim=768,
         depth=12,
         num_heads=12,
         mlp_ratio=4,
-        block_fn=partial(Block, attn_class=MemEffAttention),
+        block_fn=partial(Block, attn_class=attn_class),
         num_register_tokens=num_register_tokens,
         **kwargs,
     )
     return model


-def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
+def vit_large(patch_size=16, attn_class: nn.Module = MemEffAttention, num_register_tokens=0, **kwargs):
     model = DinoVisionTransformer(
         patch_size=patch_size,
         embed_dim=1024,
         depth=24,
         num_heads=16,
         mlp_ratio=4,
-        block_fn=partial(Block, attn_class=MemEffAttention),
+        block_fn=partial(Block, attn_class=attn_class),
         num_register_tokens=num_register_tokens,
         **kwargs,
     )
     return model


-def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
+def vit_giant2(patch_size=16, attn_class: nn.Module = MemEffAttention, num_register_tokens=0, **kwargs):
     """
     Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
     """
@@ -389,7 +389,7 @@ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
         depth=40,
         num_heads=24,
         mlp_ratio=4,
-        block_fn=partial(Block, attn_class=MemEffAttention),
+        block_fn=partial(Block, attn_class=attn_class),
         num_register_tokens=num_register_tokens,
         **kwargs,
     )
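A minimal usage sketch of the new parameter, assuming the upstream DINOv2 layout (factories in dinov2/models/vision_transformer.py, attention classes in dinov2/layers/attention.py; paths may differ in this fork): any nn.Module with the same constructor and forward signature as MemEffAttention can now be injected, e.g. the plain Attention class when xFormers is unavailable.

    # Sketch only: import paths follow the upstream DINOv2 repo layout
    # and are an assumption, not verified against this fork.
    from dinov2.layers.attention import Attention, MemEffAttention
    from dinov2.models.vision_transformer import vit_small

    # Default behaviour is unchanged: memory-efficient attention.
    model_default = vit_small(patch_size=16, num_register_tokens=0)

    # New: swap in a different attention implementation via attn_class.
    model_plain = vit_small(patch_size=16, attn_class=Attention)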