Add MAE version ViT-base training results and fix some errors in configs.
parent
d8f556668e
commit
c0f3ba68a8
|
@ -21,12 +21,7 @@ train_pipeline = [
|
||||||
|
|
||||||
test_pipeline = [
|
test_pipeline = [
|
||||||
dict(type='LoadImageFromFile'),
|
dict(type='LoadImageFromFile'),
|
||||||
dict(
|
dict(type='Resize', scale=384, backend='pillow', interpolation='bicubic'),
|
||||||
type='ResizeEdge',
|
|
||||||
scale=256,
|
|
||||||
edge='short',
|
|
||||||
backend='pillow',
|
|
||||||
interpolation='bicubic'),
|
|
||||||
dict(type='PackClsInputs'),
|
dict(type='PackClsInputs'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -36,8 +36,7 @@ env_cfg = dict(
|
||||||
|
|
||||||
# set visualizer
|
# set visualizer
|
||||||
vis_backends = [dict(type='LocalVisBackend')]
|
vis_backends = [dict(type='LocalVisBackend')]
|
||||||
visualizer = dict(
|
visualizer = dict(type='ClsVisualizer', vis_backends=vis_backends)
|
||||||
type='ClsVisualizer', vis_backends=vis_backends, name='visualizer')
|
|
||||||
|
|
||||||
# set log level
|
# set log level
|
||||||
log_level = 'INFO'
|
log_level = 'INFO'
|
||||||
|
|
|
@ -20,13 +20,14 @@ param_scheduler = [
|
||||||
dict(
|
dict(
|
||||||
type='LinearLR',
|
type='LinearLR',
|
||||||
start_factor=1e-3,
|
start_factor=1e-3,
|
||||||
by_epoch=False,
|
by_epoch=True,
|
||||||
begin=0,
|
begin=0,
|
||||||
end=5 * 1252),
|
end=5,
|
||||||
|
convert_to_iter_based=True),
|
||||||
dict(
|
dict(
|
||||||
type='CosineAnnealingLR',
|
type='CosineAnnealingLR',
|
||||||
T_max=295,
|
T_max=295,
|
||||||
eta_min=1e-2,
|
eta_min=1e-5,
|
||||||
by_epoch=True,
|
by_epoch=True,
|
||||||
begin=5,
|
begin=5,
|
||||||
end=300)
|
end=300)
|
||||||
|
|
|
@ -31,7 +31,7 @@ param_scheduler = [
|
||||||
dict(
|
dict(
|
||||||
type='CosineAnnealingLR',
|
type='CosineAnnealingLR',
|
||||||
T_max=280,
|
T_max=280,
|
||||||
eta_min=1e-2,
|
eta_min=1e-5,
|
||||||
by_epoch=True,
|
by_epoch=True,
|
||||||
begin=20,
|
begin=20,
|
||||||
end=300)
|
end=300)
|
||||||
|
|
|
@ -6,19 +6,16 @@ optim_wrapper = dict(
|
||||||
|
|
||||||
# learning policy
|
# learning policy
|
||||||
param_scheduler = [
|
param_scheduler = [
|
||||||
dict(type='ConstantLR', factor=0.1, by_epoch=False, begin=0, end=5 * 1252),
|
dict(
|
||||||
|
type='ConstantLR',
|
||||||
|
factor=0.1,
|
||||||
|
by_epoch=True,
|
||||||
|
begin=0,
|
||||||
|
end=5,
|
||||||
|
convert_to_iter_based=True),
|
||||||
dict(type='PolyLR', eta_min=0, by_epoch=True, begin=5, end=300)
|
dict(type='PolyLR', eta_min=0, by_epoch=True, begin=5, end=300)
|
||||||
]
|
]
|
||||||
|
|
||||||
# old learning policy
|
|
||||||
# lr_config = dict(
|
|
||||||
# policy='poly',
|
|
||||||
# min_lr=0,
|
|
||||||
# by_epoch=False,
|
|
||||||
# warmup='constant',
|
|
||||||
# warmup_iters=5000,
|
|
||||||
# )
|
|
||||||
|
|
||||||
# train, val, test setting
|
# train, val, test setting
|
||||||
train_cfg = dict(by_epoch=True, max_epochs=300)
|
train_cfg = dict(by_epoch=True, max_epochs=300)
|
||||||
val_cfg = dict(interval=1) # validate every epoch
|
val_cfg = dict(interval=1) # validate every epoch
|
||||||
|
|
|
@ -26,7 +26,7 @@ param_scheduler = [
|
||||||
dict(
|
dict(
|
||||||
type='CosineAnnealingLR',
|
type='CosineAnnealingLR',
|
||||||
T_max=285,
|
T_max=285,
|
||||||
eta_min=1e-2,
|
eta_min=1e-5,
|
||||||
by_epoch=True,
|
by_epoch=True,
|
||||||
begin=15,
|
begin=15,
|
||||||
end=300)
|
end=300)
|
||||||
|
|
|
@ -11,4 +11,24 @@ optim_wrapper = dict(
|
||||||
clip_grad=dict(max_norm=5.0),
|
clip_grad=dict(max_norm=5.0),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
param_scheduler = [
|
||||||
|
# warm up learning rate scheduler
|
||||||
|
dict(
|
||||||
|
type='LinearLR',
|
||||||
|
start_factor=1e-3,
|
||||||
|
by_epoch=True,
|
||||||
|
begin=0,
|
||||||
|
end=20,
|
||||||
|
# update by iter
|
||||||
|
convert_to_iter_based=True),
|
||||||
|
# main learning rate scheduler
|
||||||
|
dict(
|
||||||
|
type='CosineAnnealingLR',
|
||||||
|
T_max=130,
|
||||||
|
eta_min=1e-5,
|
||||||
|
by_epoch=True,
|
||||||
|
begin=20,
|
||||||
|
end=150)
|
||||||
|
]
|
||||||
|
|
||||||
train_cfg = dict(by_epoch=True, max_epochs=150)
|
train_cfg = dict(by_epoch=True, max_epochs=150)
|
||||||
|
|
|
@ -11,4 +11,24 @@ optim_wrapper = dict(
|
||||||
clip_grad=dict(max_norm=5.0),
|
clip_grad=dict(max_norm=5.0),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
param_scheduler = [
|
||||||
|
# warm up learning rate scheduler
|
||||||
|
dict(
|
||||||
|
type='LinearLR',
|
||||||
|
start_factor=1e-3,
|
||||||
|
by_epoch=True,
|
||||||
|
begin=0,
|
||||||
|
end=20,
|
||||||
|
# update by iter
|
||||||
|
convert_to_iter_based=True),
|
||||||
|
# main learning rate scheduler
|
||||||
|
dict(
|
||||||
|
type='CosineAnnealingLR',
|
||||||
|
T_max=130,
|
||||||
|
eta_min=1e-5,
|
||||||
|
by_epoch=True,
|
||||||
|
begin=20,
|
||||||
|
end=150)
|
||||||
|
]
|
||||||
|
|
||||||
train_cfg = dict(by_epoch=True, max_epochs=150)
|
train_cfg = dict(by_epoch=True, max_epochs=150)
|
||||||
|
|
|
@ -31,7 +31,7 @@ train_pipeline = [
|
||||||
test_pipeline = [
|
test_pipeline = [
|
||||||
dict(type='LoadImageFromFile'),
|
dict(type='LoadImageFromFile'),
|
||||||
dict(
|
dict(
|
||||||
type='Resize',
|
type='ResizeEdge',
|
||||||
scale=288,
|
scale=288,
|
||||||
edge='short',
|
edge='short',
|
||||||
backend='pillow',
|
backend='pillow',
|
||||||
|
|
|
@ -6,4 +6,7 @@ _base_ = [
|
||||||
]
|
]
|
||||||
|
|
||||||
# schedule settings
|
# schedule settings
|
||||||
|
param_scheduler = dict(
|
||||||
|
type='CosineAnnealingLR', T_max=120, by_epoch=True, begin=0, end=120)
|
||||||
|
|
||||||
train_cfg = dict(by_epoch=True, max_epochs=120)
|
train_cfg = dict(by_epoch=True, max_epochs=120)
|
||||||
|
|
|
@ -32,4 +32,24 @@ optim_wrapper = dict(
|
||||||
paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.),
|
paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
param_scheduler = [
|
||||||
|
# warm up learning rate scheduler
|
||||||
|
dict(
|
||||||
|
type='LinearLR',
|
||||||
|
start_factor=0.0001,
|
||||||
|
by_epoch=True,
|
||||||
|
begin=0,
|
||||||
|
end=5,
|
||||||
|
# update by iter
|
||||||
|
convert_to_iter_based=True),
|
||||||
|
# main learning rate scheduler
|
||||||
|
dict(
|
||||||
|
type='CosineAnnealingLR',
|
||||||
|
T_max=595,
|
||||||
|
eta_min=1.0e-6,
|
||||||
|
by_epoch=True,
|
||||||
|
begin=5,
|
||||||
|
end=600)
|
||||||
|
]
|
||||||
|
|
||||||
train_cfg = dict(by_epoch=True, max_epochs=600)
|
train_cfg = dict(by_epoch=True, max_epochs=600)
|
||||||
|
|
|
@ -24,4 +24,23 @@ train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True))
|
||||||
optim_wrapper = dict(
|
optim_wrapper = dict(
|
||||||
paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.))
|
paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.))
|
||||||
|
|
||||||
|
param_scheduler = [
|
||||||
|
# warm up learning rate scheduler
|
||||||
|
dict(
|
||||||
|
type='LinearLR',
|
||||||
|
start_factor=0.0001,
|
||||||
|
by_epoch=True,
|
||||||
|
begin=0,
|
||||||
|
end=5,
|
||||||
|
# update by iter
|
||||||
|
convert_to_iter_based=True),
|
||||||
|
# main learning rate scheduler
|
||||||
|
dict(
|
||||||
|
type='CosineAnnealingLR',
|
||||||
|
T_max=295,
|
||||||
|
eta_min=1.0e-6,
|
||||||
|
by_epoch=True,
|
||||||
|
begin=5,
|
||||||
|
end=300)
|
||||||
|
]
|
||||||
train_cfg = dict(by_epoch=True, max_epochs=300)
|
train_cfg = dict(by_epoch=True, max_epochs=300)
|
||||||
|
|
|
@ -34,7 +34,7 @@ param_scheduler = [
|
||||||
dict(
|
dict(
|
||||||
type='CosineAnnealingLR',
|
type='CosineAnnealingLR',
|
||||||
T_max=295,
|
T_max=295,
|
||||||
eta_min=1e-2,
|
eta_min=1e-5,
|
||||||
by_epoch=True,
|
by_epoch=True,
|
||||||
begin=5,
|
begin=5,
|
||||||
end=300)
|
end=300)
|
||||||
|
|
|
@ -34,7 +34,7 @@ param_scheduler = [
|
||||||
dict(
|
dict(
|
||||||
type='CosineAnnealingLR',
|
type='CosineAnnealingLR',
|
||||||
T_max=295,
|
T_max=295,
|
||||||
eta_min=1e-2,
|
eta_min=1e-5,
|
||||||
by_epoch=True,
|
by_epoch=True,
|
||||||
begin=5,
|
begin=5,
|
||||||
end=300)
|
end=300)
|
||||||
|
|
|
@ -36,10 +36,11 @@ The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don'
|
||||||
|
|
||||||
| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
|
| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
|
||||||
| :-----------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------------------------: | :----------------------------------------------------------------: |
|
| :-----------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------------------------: | :----------------------------------------------------------------: |
|
||||||
|
| ViT-B16 | From scratch | 224x224 | 86.86 | 33.03 | 82.37 | 96.15 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.log)|
|
||||||
| ViT-B16\* | ImageNet-21k | 384x384 | 86.86 | 33.03 | 85.43 | 97.77 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth) |
|
| ViT-B16\* | ImageNet-21k | 384x384 | 86.86 | 33.03 | 85.43 | 97.77 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth) |
|
||||||
|
| ViT-B16 (IPU) | ImageNet-21k | 224x224 | 86.86 | 33.03 | 81.22 | 95.56 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-4xb544-ipu_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k_20220603-c215811a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k.log) |
|
||||||
| ViT-B32\* | ImageNet-21k | 384x384 | 88.30 | 8.56 | 84.01 | 97.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth) |
|
| ViT-B32\* | ImageNet-21k | 384x384 | 88.30 | 8.56 | 84.01 | 97.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth) |
|
||||||
| ViT-L16\* | ImageNet-21k | 384x384 | 304.72 | 116.68 | 85.63 | 97.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth) |
|
| ViT-L16\* | ImageNet-21k | 384x384 | 304.72 | 116.68 | 85.63 | 97.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth) |
|
||||||
| ViT-B16 (IPU) | ImageNet-21k | 224x224 | 86.86 | 33.03 | 81.22 | 95.56 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-4xb544-ipu_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k_20220603-c215811a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k.log) |
|
|
||||||
|
|
||||||
*Models with * are converted from the [official repo](https://github.com/google-research/vision_transformer#available-vit-models). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*
|
*Models with * are converted from the [official repo](https://github.com/google-research/vision_transformer#available-vit-models). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*
|
||||||
|
|
||||||
|
|
|
@ -77,3 +77,18 @@ Models:
|
||||||
Weights: https://console.cloud.google.com/storage/browser/_details/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz
|
Weights: https://console.cloud.google.com/storage/browser/_details/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz
|
||||||
Code: https://github.com/google-research/vision_transformer/blob/88a52f8892c80c10de99194990a517b4d80485fd/vit_jax/models.py#L208
|
Code: https://github.com/google-research/vision_transformer/blob/88a52f8892c80c10de99194990a517b4d80485fd/vit_jax/models.py#L208
|
||||||
Config: configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py
|
Config: configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py
|
||||||
|
- Name: vit-base-p16_pt-32xb128-mae_in1k
|
||||||
|
In Collection: Vision Transformer
|
||||||
|
Metadata:
|
||||||
|
FLOPs: 33030000000
|
||||||
|
Parameters: 86860000
|
||||||
|
Training Data:
|
||||||
|
- ImageNet-1k
|
||||||
|
Results:
|
||||||
|
- Dataset: ImageNet-1k
|
||||||
|
Task: Image Classification
|
||||||
|
Metrics:
|
||||||
|
Top 1 Accuracy: 82.37
|
||||||
|
Top 5 Accuracy: 96.15
|
||||||
|
Weights: https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.pth
|
||||||
|
Config: configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
_base_ = [
|
||||||
|
'../_base_/datasets/imagenet_bs64_swin_224.py',
|
||||||
|
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||||
|
'../_base_/default_runtime.py'
|
||||||
|
]
|
||||||
|
|
||||||
|
# model settings
|
||||||
|
model = dict(
|
||||||
|
type='ImageClassifier',
|
||||||
|
backbone=dict(
|
||||||
|
type='VisionTransformer',
|
||||||
|
arch='base',
|
||||||
|
img_size=224,
|
||||||
|
patch_size=16,
|
||||||
|
drop_path_rate=0.1),
|
||||||
|
neck=None,
|
||||||
|
head=dict(
|
||||||
|
type='VisionTransformerClsHead',
|
||||||
|
num_classes=1000,
|
||||||
|
in_channels=768,
|
||||||
|
loss=dict(
|
||||||
|
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||||
|
),
|
||||||
|
init_cfg=[
|
||||||
|
dict(type='TruncNormal', layer='Linear', std=.02),
|
||||||
|
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||||
|
],
|
||||||
|
train_cfg=dict(augments=[
|
||||||
|
dict(type='Mixup', alpha=0.8, num_classes=1000),
|
||||||
|
dict(type='CutMix', alpha=1.0, num_classes=1000)
|
||||||
|
]))
|
||||||
|
|
||||||
|
# dataset settings
|
||||||
|
train_dataloader = dict(batch_size=128)
|
||||||
|
|
||||||
|
# schedule settings
|
||||||
|
optim_wrapper = dict(
|
||||||
|
optimizer=dict(
|
||||||
|
type='AdamW',
|
||||||
|
lr=1e-4 * 4096 / 256,
|
||||||
|
weight_decay=0.3,
|
||||||
|
eps=1e-8,
|
||||||
|
betas=(0.9, 0.95)),
|
||||||
|
paramwise_cfg=dict(
|
||||||
|
norm_decay_mult=0.0,
|
||||||
|
bias_decay_mult=0.0,
|
||||||
|
custom_keys={
|
||||||
|
'.cls_token': dict(decay_mult=0.0),
|
||||||
|
'.pos_embed': dict(decay_mult=0.0)
|
||||||
|
}))
|
||||||
|
|
||||||
|
# runtime settings
|
||||||
|
custom_hooks = [dict(type='EMAHook', momentum=1e-4)]
|
Loading…
Reference in New Issue