Add MAE version ViT-base training results and fix some errors in configs.
parent
d8f556668e
commit
c0f3ba68a8
|
@ -21,12 +21,7 @@ train_pipeline = [
|
|||
|
||||
test_pipeline = [
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(
|
||||
type='ResizeEdge',
|
||||
scale=256,
|
||||
edge='short',
|
||||
backend='pillow',
|
||||
interpolation='bicubic'),
|
||||
dict(type='Resize', scale=384, backend='pillow', interpolation='bicubic'),
|
||||
dict(type='PackClsInputs'),
|
||||
]
|
||||
|
||||
|
|
|
@ -36,8 +36,7 @@ env_cfg = dict(
|
|||
|
||||
# set visualizer
|
||||
vis_backends = [dict(type='LocalVisBackend')]
|
||||
visualizer = dict(
|
||||
type='ClsVisualizer', vis_backends=vis_backends, name='visualizer')
|
||||
visualizer = dict(type='ClsVisualizer', vis_backends=vis_backends)
|
||||
|
||||
# set log level
|
||||
log_level = 'INFO'
|
||||
|
|
|
@ -20,13 +20,14 @@ param_scheduler = [
|
|||
dict(
|
||||
type='LinearLR',
|
||||
start_factor=1e-3,
|
||||
by_epoch=False,
|
||||
by_epoch=True,
|
||||
begin=0,
|
||||
end=5 * 1252),
|
||||
end=5,
|
||||
convert_to_iter_based=True),
|
||||
dict(
|
||||
type='CosineAnnealingLR',
|
||||
T_max=295,
|
||||
eta_min=1e-2,
|
||||
eta_min=1e-5,
|
||||
by_epoch=True,
|
||||
begin=5,
|
||||
end=300)
|
||||
|
|
|
@ -31,7 +31,7 @@ param_scheduler = [
|
|||
dict(
|
||||
type='CosineAnnealingLR',
|
||||
T_max=280,
|
||||
eta_min=1e-2,
|
||||
eta_min=1e-5,
|
||||
by_epoch=True,
|
||||
begin=20,
|
||||
end=300)
|
||||
|
|
|
@ -6,19 +6,16 @@ optim_wrapper = dict(
|
|||
|
||||
# learning policy
|
||||
param_scheduler = [
|
||||
dict(type='ConstantLR', factor=0.1, by_epoch=False, begin=0, end=5 * 1252),
|
||||
dict(
|
||||
type='ConstantLR',
|
||||
factor=0.1,
|
||||
by_epoch=True,
|
||||
begin=0,
|
||||
end=5,
|
||||
convert_to_iter_based=True),
|
||||
dict(type='PolyLR', eta_min=0, by_epoch=True, begin=5, end=300)
|
||||
]
|
||||
|
||||
# old learning policy
|
||||
# lr_config = dict(
|
||||
# policy='poly',
|
||||
# min_lr=0,
|
||||
# by_epoch=False,
|
||||
# warmup='constant',
|
||||
# warmup_iters=5000,
|
||||
# )
|
||||
|
||||
# train, val, test setting
|
||||
train_cfg = dict(by_epoch=True, max_epochs=300)
|
||||
val_cfg = dict(interval=1) # validate every epoch
|
||||
|
|
|
@ -26,7 +26,7 @@ param_scheduler = [
|
|||
dict(
|
||||
type='CosineAnnealingLR',
|
||||
T_max=285,
|
||||
eta_min=1e-2,
|
||||
eta_min=1e-5,
|
||||
by_epoch=True,
|
||||
begin=15,
|
||||
end=300)
|
||||
|
|
|
@ -11,4 +11,24 @@ optim_wrapper = dict(
|
|||
clip_grad=dict(max_norm=5.0),
|
||||
)
|
||||
|
||||
param_scheduler = [
|
||||
# warm up learning rate scheduler
|
||||
dict(
|
||||
type='LinearLR',
|
||||
start_factor=1e-3,
|
||||
by_epoch=True,
|
||||
begin=0,
|
||||
end=20,
|
||||
# update by iter
|
||||
convert_to_iter_based=True),
|
||||
# main learning rate scheduler
|
||||
dict(
|
||||
type='CosineAnnealingLR',
|
||||
T_max=130,
|
||||
eta_min=1e-5,
|
||||
by_epoch=True,
|
||||
begin=20,
|
||||
end=150)
|
||||
]
|
||||
|
||||
train_cfg = dict(by_epoch=True, max_epochs=150)
|
||||
|
|
|
@ -11,4 +11,24 @@ optim_wrapper = dict(
|
|||
clip_grad=dict(max_norm=5.0),
|
||||
)
|
||||
|
||||
param_scheduler = [
|
||||
# warm up learning rate scheduler
|
||||
dict(
|
||||
type='LinearLR',
|
||||
start_factor=1e-3,
|
||||
by_epoch=True,
|
||||
begin=0,
|
||||
end=20,
|
||||
# update by iter
|
||||
convert_to_iter_based=True),
|
||||
# main learning rate scheduler
|
||||
dict(
|
||||
type='CosineAnnealingLR',
|
||||
T_max=130,
|
||||
eta_min=1e-5,
|
||||
by_epoch=True,
|
||||
begin=20,
|
||||
end=150)
|
||||
]
|
||||
|
||||
train_cfg = dict(by_epoch=True, max_epochs=150)
|
||||
|
|
|
@ -31,7 +31,7 @@ train_pipeline = [
|
|||
test_pipeline = [
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(
|
||||
type='Resize',
|
||||
type='ResizeEdge',
|
||||
scale=288,
|
||||
edge='short',
|
||||
backend='pillow',
|
||||
|
|
|
@ -6,4 +6,7 @@ _base_ = [
|
|||
]
|
||||
|
||||
# schedule settings
|
||||
param_scheduler = dict(
|
||||
type='CosineAnnealingLR', T_max=120, by_epoch=True, begin=0, end=120)
|
||||
|
||||
train_cfg = dict(by_epoch=True, max_epochs=120)
|
||||
|
|
|
@ -32,4 +32,24 @@ optim_wrapper = dict(
|
|||
paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.),
|
||||
)
|
||||
|
||||
param_scheduler = [
|
||||
# warm up learning rate scheduler
|
||||
dict(
|
||||
type='LinearLR',
|
||||
start_factor=0.0001,
|
||||
by_epoch=True,
|
||||
begin=0,
|
||||
end=5,
|
||||
# update by iter
|
||||
convert_to_iter_based=True),
|
||||
# main learning rate scheduler
|
||||
dict(
|
||||
type='CosineAnnealingLR',
|
||||
T_max=595,
|
||||
eta_min=1.0e-6,
|
||||
by_epoch=True,
|
||||
begin=5,
|
||||
end=600)
|
||||
]
|
||||
|
||||
train_cfg = dict(by_epoch=True, max_epochs=600)
|
||||
|
|
|
@ -24,4 +24,23 @@ train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True))
|
|||
optim_wrapper = dict(
|
||||
paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.))
|
||||
|
||||
param_scheduler = [
|
||||
# warm up learning rate scheduler
|
||||
dict(
|
||||
type='LinearLR',
|
||||
start_factor=0.0001,
|
||||
by_epoch=True,
|
||||
begin=0,
|
||||
end=5,
|
||||
# update by iter
|
||||
convert_to_iter_based=True),
|
||||
# main learning rate scheduler
|
||||
dict(
|
||||
type='CosineAnnealingLR',
|
||||
T_max=295,
|
||||
eta_min=1.0e-6,
|
||||
by_epoch=True,
|
||||
begin=5,
|
||||
end=300)
|
||||
]
|
||||
train_cfg = dict(by_epoch=True, max_epochs=300)
|
||||
|
|
|
@ -34,7 +34,7 @@ param_scheduler = [
|
|||
dict(
|
||||
type='CosineAnnealingLR',
|
||||
T_max=295,
|
||||
eta_min=1e-2,
|
||||
eta_min=1e-5,
|
||||
by_epoch=True,
|
||||
begin=5,
|
||||
end=300)
|
||||
|
|
|
@ -34,7 +34,7 @@ param_scheduler = [
|
|||
dict(
|
||||
type='CosineAnnealingLR',
|
||||
T_max=295,
|
||||
eta_min=1e-2,
|
||||
eta_min=1e-5,
|
||||
by_epoch=True,
|
||||
begin=5,
|
||||
end=300)
|
||||
|
|
|
@ -36,10 +36,11 @@ The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don'
|
|||
|
||||
| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
|
||||
| :-----------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------------------------: | :----------------------------------------------------------------: |
|
||||
| ViT-B16 | From scratch | 224x224 | 86.86 | 33.03 | 82.37 | 96.15 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.log)|
|
||||
| ViT-B16\* | ImageNet-21k | 384x384 | 86.86 | 33.03 | 85.43 | 97.77 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth) |
|
||||
| ViT-B16 (IPU) | ImageNet-21k | 224x224 | 86.86 | 33.03 | 81.22 | 95.56 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-4xb544-ipu_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k_20220603-c215811a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k.log) |
|
||||
| ViT-B32\* | ImageNet-21k | 384x384 | 88.30 | 8.56 | 84.01 | 97.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth) |
|
||||
| ViT-L16\* | ImageNet-21k | 384x384 | 304.72 | 116.68 | 85.63 | 97.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth) |
|
||||
| ViT-B16 (IPU) | ImageNet-21k | 224x224 | 86.86 | 33.03 | 81.22 | 95.56 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-4xb544-ipu_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k_20220603-c215811a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k.log) |
|
||||
|
||||
*Models with \* are converted from the [official repo](https://github.com/google-research/vision_transformer#available-vit-models). The config files of these models are only for validation. We don't guarantee the training accuracy of these config files and welcome you to contribute your reproduction results.*
|
||||
|
||||
|
|
|
@ -77,3 +77,18 @@ Models:
|
|||
Weights: https://console.cloud.google.com/storage/browser/_details/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz
|
||||
Code: https://github.com/google-research/vision_transformer/blob/88a52f8892c80c10de99194990a517b4d80485fd/vit_jax/models.py#L208
|
||||
Config: configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py
|
||||
- Name: vit-base-p16_pt-32xb128-mae_in1k
|
||||
In Collection: Vision Transformer
|
||||
Metadata:
|
||||
FLOPs: 33030000000
|
||||
Parameters: 86860000
|
||||
Training Data:
|
||||
- ImageNet-1k
|
||||
Results:
|
||||
- Dataset: ImageNet-1k
|
||||
Task: Image Classification
|
||||
Metrics:
|
||||
Top 1 Accuracy: 82.37
|
||||
Top 5 Accuracy: 96.15
|
||||
Weights: https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.pth
|
||||
Config: configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
_base_ = [
|
||||
'../_base_/datasets/imagenet_bs64_swin_224.py',
|
||||
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||
'../_base_/default_runtime.py'
|
||||
]
|
||||
|
||||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='VisionTransformer',
|
||||
arch='base',
|
||||
img_size=224,
|
||||
patch_size=16,
|
||||
drop_path_rate=0.1),
|
||||
neck=None,
|
||||
head=dict(
|
||||
type='VisionTransformerClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=768,
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=.02),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='Mixup', alpha=0.8, num_classes=1000),
|
||||
dict(type='CutMix', alpha=1.0, num_classes=1000)
|
||||
]))
|
||||
|
||||
# dataset settings
|
||||
train_dataloader = dict(batch_size=128)
|
||||
|
||||
# schedule settings
|
||||
optim_wrapper = dict(
|
||||
optimizer=dict(
|
||||
type='AdamW',
|
||||
lr=1e-4 * 4096 / 256,
|
||||
weight_decay=0.3,
|
||||
eps=1e-8,
|
||||
betas=(0.9, 0.95)),
|
||||
paramwise_cfg=dict(
|
||||
norm_decay_mult=0.0,
|
||||
bias_decay_mult=0.0,
|
||||
custom_keys={
|
||||
'.cls_token': dict(decay_mult=0.0),
|
||||
'.pos_embed': dict(decay_mult=0.0)
|
||||
}))
|
||||
|
||||
# runtime settings
|
||||
custom_hooks = [dict(type='EMAHook', momentum=1e-4)]
|
Loading…
Reference in New Issue