mirror of https://github.com/huggingface/pytorch-image-models.git
synced 2025-06-03 15:01:08 +08:00

commit 434a03937d
Merge branch 'rwightman:main' into main

README.md (13)
@@ -21,6 +21,18 @@ And a big thanks to all GitHub sponsors who helped with some of my costs before

## What's New

### Dec 6, 2022
* Add 'EVA g', BEiT style ViT-g/14 model weights w/ both MIM pretrain and CLIP pretrain to `beit.py`.
  * original source: https://github.com/baaivision/EVA
  * paper: https://arxiv.org/abs/2211.07636

| model | top1 | param_count | gmac | macts | hub |
|:-----------------------------------------|-------:|--------------:|-------:|--------:|:----------------------------------------|
| eva_giant_patch14_560.m30m_ft_in22k_in1k | 89.8 | 1014.4 | 1906.8 | 2577.2 | [link](https://huggingface.co/BAAI/EVA) |
| eva_giant_patch14_336.m30m_ft_in22k_in1k | 89.6 | 1013 | 620.6 | 550.7 | [link](https://huggingface.co/BAAI/EVA) |
| eva_giant_patch14_336.clip_ft_in1k | 89.4 | 1013 | 620.6 | 550.7 | [link](https://huggingface.co/BAAI/EVA) |
| eva_giant_patch14_224.clip_ft_in1k | 89.1 | 1012.6 | 267.2 | 192.6 | [link](https://huggingface.co/BAAI/EVA) |

### Dec 5, 2022
* Pre-release (`0.8.0dev0`) of multi-weight support (`model_arch.pretrained_tag`). Install with `pip install --pre timm`
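A minimal usage sketch for the two items above (not part of the diff; assumes the `0.8.0dev0` pre-release and a working `torch` install, and uses one of the tag names listed in the table):

```python
import torch
import timm

# 'architecture.pretrained_tag' selects a specific weight set under the new multi-weight scheme
model = timm.create_model('eva_giant_patch14_224.clip_ft_in1k', pretrained=True)
model.eval()

# dummy forward pass at this config's 224x224 input size
with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 1000])
```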
@@ -376,6 +388,7 @@ A full version of the list below with source links can be found in the [document

* MobileNet-V2 - https://arxiv.org/abs/1801.04381
* Single-Path NAS - https://arxiv.org/abs/1904.02877
* TinyNet - https://arxiv.org/abs/2010.14819
* EVA - https://arxiv.org/abs/2211.07636
* GCViT (Global Context Vision Transformer) - https://arxiv.org/abs/2206.09959
* GhostNet - https://arxiv.org/abs/1911.11907
* gMLP - https://arxiv.org/abs/2105.08050
@@ -80,9 +80,11 @@ parser.add_argument('--results-file', default='', type=str,
parser.add_argument('--results-format', default='csv', type=str,
                    help='Format for results file one of (csv, json) (default: csv).')
parser.add_argument('--num-warm-iter', default=10, type=int,
                    metavar='N', help='Number of warmup iterations (default: 10)')
                    help='Number of warmup iterations (default: 10)')
parser.add_argument('--num-bench-iter', default=40, type=int,
                    metavar='N', help='Number of benchmark iterations (default: 40)')
                    help='Number of benchmark iterations (default: 40)')
parser.add_argument('--device', default='cuda', type=str,
                    help="device to run benchmark on")

# common inference / train args
parser.add_argument('--model', '-m', metavar='NAME', default='resnet50',
@@ -27,7 +27,7 @@ NON_STD_FILTERS = [
    'vit_*', 'tnt_*', 'pit_*', 'swin_*', 'coat_*', 'cait_*', '*mixer_*', 'gmlp_*', 'resmlp_*', 'twins_*',
    'convit_*', 'levit*', 'visformer*', 'deit*', 'jx_nest_*', 'nest_*', 'xcit_*', 'crossvit_*', 'beit*',
    'poolformer_*', 'volo_*', 'sequencer2d_*', 'swinv2_*', 'pvt_v2*', 'mvitv2*', 'gcvit*', 'efficientformer*',
    'coatnet*', 'coatnext*', 'maxvit*', 'maxxvit*',
    'coatnet*', 'coatnext*', 'maxvit*', 'maxxvit*', 'eva_*'
]
NUM_NON_STD = len(NON_STD_FILTERS)

@@ -39,7 +39,7 @@ if 'GITHUB_ACTIONS' in os.environ:
        '*nfnet_f3*', '*nfnet_f4*', '*nfnet_f5*', '*nfnet_f6*', '*nfnet_f7*', '*efficientnetv2_xl*',
        '*resnetrs350*', '*resnetrs420*', 'xcit_large_24_p8*', 'vit_huge*', 'vit_gi*', 'swin*huge*',
        'swin*giant*']
    NON_STD_EXCLUDE_FILTERS = ['vit_huge*', 'vit_gi*', 'swin*giant*']
    NON_STD_EXCLUDE_FILTERS = ['vit_huge*', 'vit_gi*', 'swin*giant*', 'eva_giant*']
else:
    EXCLUDE_FILTERS = []
    NON_STD_EXCLUDE_FILTERS = ['vit_gi*']
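These filter entries are shell-style glob patterns (the test harness matching them with `fnmatch`-style semantics is an assumption here, not something shown in this hunk). A quick sketch of what the new `eva_giant*` exclusion catches:

```python
import fnmatch

# the new pattern excludes the very large EVA giant variants from GitHub Actions runs...
assert fnmatch.fnmatch('eva_giant_patch14_560', 'eva_giant*')
# ...while other model names are unaffected
assert not fnmatch.fnmatch('beit_large_patch16_512', 'eva_giant*')
```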
@@ -1,8 +1,6 @@
""" BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)
""" BEiT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)

Model from official source: https://github.com/microsoft/unilm/tree/master/beit
and
https://github.com/microsoft/unilm/tree/master/beit2

@inproceedings{beit,
title={{BEiT}: {BERT} Pre-Training of Image Transformers},

@@ -12,6 +10,8 @@ year={2022},
url={https://openreview.net/forum?id=p-BhZSz59o4}
}

BEiT-v2 from https://github.com/microsoft/unilm/tree/master/beit2

@article{beitv2,
title={{BEiT v2}: Masked Image Modeling with Vector-Quantized Visual Tokenizers},
author={Zhiliang Peng and Li Dong and Hangbo Bao and Qixiang Ye and Furu Wei},

@@ -21,6 +21,17 @@ archivePrefix={arXiv},
primaryClass={cs.CV}
}

EVA from https://github.com/baaivision/EVA , paper: https://arxiv.org/abs/2211.07636

@article{EVA,
title={EVA: Exploring the Limits of Masked Visual Representation Learning at Scale},
author={Fang, Yuxin and Wang, Wen and Xie, Binhui and Sun, Quan and Wu, Ledell and Wang, Xinggang and Huang, Tiejun and Wang, Xinlong and Cao, Yue},
journal={arXiv preprint arXiv:2211.07636},
year={2022}
}

At this point only the 1k fine-tuned classification weights and model configs have been added,
see original source above for pre-training models and procedure.

@@ -37,6 +48,9 @@ Modifications by / Copyright 2021 Ross Wightman, original copyrights below
# https://github.com/facebookresearch/deit/
# https://github.com/facebookresearch/dino
# --------------------------------------------------------'

# EVA models Copyright (c) 2022 BAAI-Vision

import math
from functools import partial
from typing import Optional, Tuple
@@ -46,72 +60,14 @@ import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint

from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from .helpers import build_model_with_cfg
from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_
from .pretrained import generate_default_cfgs
from .registry import register_model
from .vision_transformer import checkpoint_filter_fn


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True,
        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
        'first_conv': 'patch_embed.proj', 'classifier': 'head',
        **kwargs
    }


default_cfgs = {
    'beit_base_patch16_224': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth'),
    'beit_base_patch16_384': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_384_pt22k_ft22kto1k.pth',
        input_size=(3, 384, 384), crop_pct=1.0,
    ),
    'beit_base_patch16_224_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22k.pth',
        num_classes=21841,
    ),
    'beit_large_patch16_224': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22kto1k.pth'),
    'beit_large_patch16_384': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_384_pt22k_ft22kto1k.pth',
        input_size=(3, 384, 384), crop_pct=1.0,
    ),
    'beit_large_patch16_512': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_512_pt22k_ft22kto1k.pth',
        input_size=(3, 512, 512), crop_pct=1.0,
    ),
    'beit_large_patch16_224_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth',
        num_classes=21841,
    ),

    'beitv2_base_patch16_224': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21kto1k.pth',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
    'beitv2_base_patch16_224_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21k.pth',
        num_classes=21841,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
    'beitv2_large_patch16_224': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21kto1k.pth',
        crop_pct=0.95,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
    'beitv2_large_patch16_224_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21k.pth',
        num_classes=21841,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
}


def gen_relative_position_index(window_size: Tuple[int, int]) -> torch.Tensor:
    num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
    # cls to token & token 2 cls & cls to cls
@@ -384,6 +340,82 @@ class Beit(nn.Module):
        return x


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True,
        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
        'first_conv': 'patch_embed.proj', 'classifier': 'head',
        **kwargs
    }


default_cfgs = generate_default_cfgs({
    'beit_base_patch16_224.in22k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth'),
    'beit_base_patch16_384.in22k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_384_pt22k_ft22kto1k.pth',
        input_size=(3, 384, 384), crop_pct=1.0,
    ),
    'beit_base_patch16_224.in22k_ft_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22k.pth',
        num_classes=21841,
    ),
    'beit_large_patch16_224.in22k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22kto1k.pth'),
    'beit_large_patch16_384.in22k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_384_pt22k_ft22kto1k.pth',
        input_size=(3, 384, 384), crop_pct=1.0,
    ),
    'beit_large_patch16_512.in22k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_512_pt22k_ft22kto1k.pth',
        input_size=(3, 512, 512), crop_pct=1.0,
    ),
    'beit_large_patch16_224.in22k_ft_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth',
        num_classes=21841,
    ),

    'beitv2_base_patch16_224.in1k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21kto1k.pth',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
    'beitv2_base_patch16_224.in1k_ft_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21k.pth',
        num_classes=21841,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
    'beitv2_large_patch16_224.in1k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21kto1k.pth',
        crop_pct=0.95,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
    'beitv2_large_patch16_224.in1k_ft_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21k.pth',
        num_classes=21841,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),

    'eva_giant_patch14_224.clip_ft_in1k': _cfg(
        hf_hub_id='BAAI/EVA', hf_hub_filename='eva_clip_vis_enc_sz224_ftcls_89p1.pt',
        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0,
    ),
    'eva_giant_patch14_336.clip_ft_in1k': _cfg(
        hf_hub_id='BAAI/EVA', hf_hub_filename='eva_clip_vis_enc_sz336_ftcls_89p4.pt',
        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
        input_size=(3, 336, 336), crop_pct=1.0, crop_mode='squash'),
    'eva_giant_patch14_336.m30m_ft_in22k_in1k': _cfg(
        hf_hub_id='BAAI/EVA', hf_hub_filename='eva_21k_1k_336px_psz14_ema_89p6.pt',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
        input_size=(3, 336, 336), crop_pct=1.0, crop_mode='squash'),
    'eva_giant_patch14_560.m30m_ft_in22k_in1k': _cfg(
        hf_hub_id='BAAI/EVA', hf_hub_filename='eva_21k_1k_560px_psz14_ema_89p7.pt',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
        input_size=(3, 560, 560), crop_pct=1.0, crop_mode='squash'),
})


def _beit_checkpoint_filter_fn(state_dict, model):
    if 'module' in state_dict:
        # beit v2 didn't strip module
@@ -393,7 +425,7 @@ def _beit_checkpoint_filter_fn(state_dict, model):

def _create_beit(variant, pretrained=False, **kwargs):
    if kwargs.get('features_only', None):
        raise RuntimeError('features_only not implemented for Beit models.')
        raise RuntimeError('features_only not implemented for BEiT models.')

    model = build_model_with_cfg(
        Beit, variant, pretrained,
@@ -415,25 +447,16 @@ def beit_base_patch16_224(pretrained=False, **kwargs):
@register_model
def beit_base_patch16_384(pretrained=False, **kwargs):
    model_kwargs = dict(
        img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
        img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
    model = _create_beit('beit_base_patch16_384', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def beit_base_patch16_224_in22k(pretrained=False, **kwargs):
    model_kwargs = dict(
        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
    model = _create_beit('beit_base_patch16_224_in22k', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def beit_large_patch16_224(pretrained=False, **kwargs):
    model_kwargs = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beit_large_patch16_224', pretrained=pretrained, **model_kwargs)
    return model
@@ -442,7 +465,7 @@ def beit_large_patch16_224(pretrained=False, **kwargs):
@register_model
def beit_large_patch16_384(pretrained=False, **kwargs):
    model_kwargs = dict(
        img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
        img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beit_large_patch16_384', pretrained=pretrained, **model_kwargs)
    return model
@@ -451,21 +474,12 @@ def beit_large_patch16_384(pretrained=False, **kwargs):
@register_model
def beit_large_patch16_512(pretrained=False, **kwargs):
    model_kwargs = dict(
        img_size=512, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
        img_size=512, patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beit_large_patch16_512', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def beit_large_patch16_224_in22k(pretrained=False, **kwargs):
    model_kwargs = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beit_large_patch16_224_in22k', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def beitv2_base_patch16_224(pretrained=False, **kwargs):
    model_kwargs = dict(
@@ -475,28 +489,37 @@ def beitv2_base_patch16_224(pretrained=False, **kwargs):
    return model


@register_model
def beitv2_base_patch16_224_in22k(pretrained=False, **kwargs):
    model_kwargs = dict(
        patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beitv2_base_patch16_224_in22k', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def beitv2_large_patch16_224(pretrained=False, **kwargs):
    model_kwargs = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beitv2_large_patch16_224', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def beitv2_large_patch16_224_in22k(pretrained=False, **kwargs):
def eva_giant_patch14_224(pretrained=False, **kwargs):
    """ EVA-g model https://arxiv.org/abs/2211.07636 """
    model_kwargs = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beitv2_large_patch16_224_in22k', pretrained=pretrained, **model_kwargs)
        patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
    model = _create_beit('eva_giant_patch14_224', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def eva_giant_patch14_336(pretrained=False, **kwargs):
    """ EVA-g model https://arxiv.org/abs/2211.07636 """
    model_kwargs = dict(
        patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
    model = _create_beit('eva_giant_patch14_336', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def eva_giant_patch14_560(pretrained=False, **kwargs):
    """ EVA-g model https://arxiv.org/abs/2211.07636 """
    model_kwargs = dict(
        patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
    model = _create_beit('eva_giant_patch14_560', pretrained=pretrained, **model_kwargs)
    return model
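One note on the `mlp_ratio=6144 / 1408` used above: EVA-g follows the ViT-g/14 sizing (embed_dim 1408, depth 40, 16 heads), where the MLP hidden width is 6144, so the ratio works out to roughly 4.36 rather than the usual 4.0. A quick sanity check (illustrative only, not part of the diff):

```python
embed_dim = 1408
mlp_ratio = 6144 / 1408  # ~4.3636

# the hidden width recovered from the ratio matches EVA-g's 6144-dim MLP
assert round(embed_dim * mlp_ratio) == 6144
```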
@@ -59,10 +59,11 @@ class PretrainedCfg:

def filter_pretrained_cfg(cfg, remove_source=False, remove_null=True):
    filtered_cfg = {}
    keep_none = {'pool_size', 'first_conv', 'classifier'}  # always keep these keys, even if none
    for k, v in cfg.items():
        if remove_source and k in {'url', 'file', 'hf_hub_id', 'hf_hub_id', 'hf_hub_filename', 'source'}:
            continue
        if remove_null and v is None:
        if remove_null and v is None and k not in keep_none:
            continue
        filtered_cfg[k] = v
    return filtered_cfg
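A minimal sketch of what the `keep_none` change does (an illustrative call against the function above, not part of the commit):

```python
cfg = {'url': '', 'crop_pct': 0.9, 'pool_size': None, 'interpolation': None}

# 'pool_size' survives despite being None because it is in keep_none;
# 'interpolation' (None and unprotected) is dropped, as is the source key 'url'
print(filter_pretrained_cfg(cfg, remove_source=True))
# -> {'crop_pct': 0.9, 'pool_size': None}
```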