Add training results for the MAE-style ViT-base and fix some errors in configs.

pull/913/head
mzr1996 2022-06-30 08:21:10 +00:00
parent d8f556668e
commit c0f3ba68a8
17 changed files with 170 additions and 27 deletions

@@ -21,12 +21,7 @@ train_pipeline = [
 test_pipeline = [
     dict(type='LoadImageFromFile'),
-    dict(
-        type='ResizeEdge',
-        scale=256,
-        edge='short',
-        backend='pillow',
-        interpolation='bicubic'),
+    dict(type='Resize', scale=384, backend='pillow', interpolation='bicubic'),
     dict(type='PackClsInputs'),
 ]
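A note on the fix above: ResizeEdge with scale=256 resizes only the short edge, which is the standard 224x224 evaluation recipe; a model fine-tuned at 384x384 needs 384 on both sides, which the single Resize now guarantees. A minimal sketch of the arithmetic (plain Python; the 500x375 image size and the follow-up 384 center crop are assumptions for illustration):

# ResizeEdge(scale=256) on a 500x375 image leaves the short edge at 256,
# so a subsequent 384x384 center crop (assumed from the usual eval
# pipelines) could not fit inside the image.
h, w = 375, 500
scale = 256 / min(h, w)
print(round(h * scale), round(w * scale))  # -> 256 341, short edge < 384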

@@ -36,8 +36,7 @@ env_cfg = dict(
 # set visualizer
 vis_backends = [dict(type='LocalVisBackend')]
-visualizer = dict(
-    type='ClsVisualizer', vis_backends=vis_backends, name='visualizer')
+visualizer = dict(type='ClsVisualizer', vis_backends=vis_backends)
 # set log level
 log_level = 'INFO'

@@ -20,13 +20,14 @@ param_scheduler = [
     dict(
         type='LinearLR',
         start_factor=1e-3,
-        by_epoch=False,
+        by_epoch=True,
         begin=0,
-        end=5 * 1252),
+        end=5,
+        convert_to_iter_based=True),
     dict(
         type='CosineAnnealingLR',
         T_max=295,
-        eta_min=1e-2,
+        eta_min=1e-5,
         by_epoch=True,
         begin=5,
         end=300)
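Two separate errors are fixed here. First, the warmup length: end=5 * 1252 hardcoded the iteration count (presumably ceil(1281167 / 1024) iterations per epoch for ImageNet-1k at total batch 1024); by_epoch=True with convert_to_iter_based=True expresses the same 5-epoch warmup without hardcoding the dataset size while still updating every iteration. Second, eta_min in CosineAnnealingLR is an absolute learning-rate floor, not a factor, so 1e-2 would have clamped the whole schedule far above the intended final LR. A minimal sketch of the resulting schedule (plain Python, not MMEngine; the base LR of 1e-3 is an assumption for illustration):

import math

ITERS_PER_EPOCH = 1252  # assumed: ceil(1281167 / 1024) for ImageNet-1k
BASE_LR = 1e-3          # assumed base LR, for illustration only

def lr_at_iter(it, warmup_epochs=5, total_epochs=300,
               start_factor=1e-3, eta_min=1e-5):
    """LR under LinearLR warmup followed by per-epoch cosine annealing."""
    warmup_iters = warmup_epochs * ITERS_PER_EPOCH  # 6260 == the old 5 * 1252
    if it < warmup_iters:
        # LinearLR ramps the multiplier from start_factor up to 1.
        factor = start_factor + (1 - start_factor) * it / warmup_iters
        return BASE_LR * factor
    # CosineAnnealingLR with by_epoch=True steps once per epoch;
    # eta_min is the absolute floor the LR decays to at T_max.
    t = it // ITERS_PER_EPOCH - warmup_epochs  # epochs into the cosine phase
    T_max = total_epochs - warmup_epochs       # 295, matching the config
    return eta_min + (BASE_LR - eta_min) * (1 + math.cos(math.pi * t / T_max)) / 2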

@@ -31,7 +31,7 @@ param_scheduler = [
     dict(
         type='CosineAnnealingLR',
         T_max=280,
-        eta_min=1e-2,
+        eta_min=1e-5,
         by_epoch=True,
         begin=20,
         end=300)

@@ -6,19 +6,16 @@ optim_wrapper = dict(
 # learning policy
 param_scheduler = [
-    dict(type='ConstantLR', factor=0.1, by_epoch=False, begin=0, end=5 * 1252),
+    dict(
+        type='ConstantLR',
+        factor=0.1,
+        by_epoch=True,
+        begin=0,
+        end=5,
+        convert_to_iter_based=True),
     dict(type='PolyLR', eta_min=0, by_epoch=True, begin=5, end=300)
 ]
-# old learning policy
-# lr_config = dict(
-#     policy='poly',
-#     min_lr=0,
-#     by_epoch=False,
-#     warmup='constant',
-#     warmup_iters=5000,
-# )
 # train, val, test setting
 train_cfg = dict(by_epoch=True, max_epochs=300)
 val_cfg = dict(interval=1)  # validate every epoch
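The stale commented-out lr_config block (legacy MMCls 0.x syntax) is dropped now that param_scheduler expresses the policy; note the old policy warmed up for 5000 iterations, while the new spec warms up for 5 full epochs, about 6260 iterations at 1252 iterations per epoch. A sketch of the resulting semantics (plain Python, not MMEngine; base_lr and the PolyLR power of 1.0 are assumptions for illustration):

def lr_at_epoch(epoch, base_lr=0.01, factor=0.1, warmup=5, total=300,
                power=1.0):
    # ConstantLR holds the LR at factor * base_lr for the warmup epochs...
    if epoch < warmup:
        return base_lr * factor
    # ...then PolyLR decays it polynomially from base_lr down to eta_min=0.
    return base_lr * (1 - (epoch - warmup) / (total - warmup)) ** power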

@@ -26,7 +26,7 @@ param_scheduler = [
     dict(
         type='CosineAnnealingLR',
         T_max=285,
-        eta_min=1e-2,
+        eta_min=1e-5,
         by_epoch=True,
         begin=15,
         end=300)

@@ -11,4 +11,24 @@ optim_wrapper = dict(
     clip_grad=dict(max_norm=5.0),
 )
 param_scheduler = [
+    # warm up learning rate scheduler
+    dict(
+        type='LinearLR',
+        start_factor=1e-3,
+        by_epoch=True,
+        begin=0,
+        end=20,
+        # update by iter
+        convert_to_iter_based=True),
+    # main learning rate scheduler
+    dict(
+        type='CosineAnnealingLR',
+        T_max=130,
+        eta_min=1e-5,
+        by_epoch=True,
+        begin=20,
+        end=150)
 ]
+train_cfg = dict(by_epoch=True, max_epochs=150)
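Schedules like the one above have to satisfy a few invariants that several hunks in this commit restore: contiguous begin/end phases, T_max equal to the cosine phase length, and the last end equal to max_epochs. A hypothetical sanity check, not part of MMClassification or MMEngine:

def check_schedulers(param_scheduler, max_epochs):
    assert param_scheduler[0]['begin'] == 0
    for prev, nxt in zip(param_scheduler, param_scheduler[1:]):
        assert prev['end'] == nxt['begin'], 'phases must be contiguous'
    assert param_scheduler[-1]['end'] == max_epochs
    for sched in param_scheduler:
        if sched['type'] == 'CosineAnnealingLR':
            assert sched['T_max'] == sched['end'] - sched['begin']

# e.g. for the schedule above: T_max=130 == 150 - 20
check_schedulers(
    [dict(type='LinearLR', begin=0, end=20),
     dict(type='CosineAnnealingLR', T_max=130, begin=20, end=150)],
    max_epochs=150)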

@@ -11,4 +11,24 @@ optim_wrapper = dict(
     clip_grad=dict(max_norm=5.0),
 )
 param_scheduler = [
+    # warm up learning rate scheduler
+    dict(
+        type='LinearLR',
+        start_factor=1e-3,
+        by_epoch=True,
+        begin=0,
+        end=20,
+        # update by iter
+        convert_to_iter_based=True),
+    # main learning rate scheduler
+    dict(
+        type='CosineAnnealingLR',
+        T_max=130,
+        eta_min=1e-5,
+        by_epoch=True,
+        begin=20,
+        end=150)
 ]
+train_cfg = dict(by_epoch=True, max_epochs=150)

@@ -31,7 +31,7 @@ train_pipeline = [
 test_pipeline = [
     dict(type='LoadImageFromFile'),
     dict(
-        type='Resize',
+        type='ResizeEdge',
         scale=288,
         edge='short',
         backend='pillow',

@@ -6,4 +6,7 @@ _base_ = [
 ]
 # schedule settings
+param_scheduler = dict(
+    type='CosineAnnealingLR', T_max=120, by_epoch=True, begin=0, end=120)
+train_cfg = dict(by_epoch=True, max_epochs=120)

@@ -32,4 +32,24 @@ optim_wrapper = dict(
     paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.),
 )
 param_scheduler = [
+    # warm up learning rate scheduler
+    dict(
+        type='LinearLR',
+        start_factor=0.0001,
+        by_epoch=True,
+        begin=0,
+        end=5,
+        # update by iter
+        convert_to_iter_based=True),
+    # main learning rate scheduler
+    dict(
+        type='CosineAnnealingLR',
+        T_max=595,
+        eta_min=1.0e-6,
+        by_epoch=True,
+        begin=5,
+        end=600)
 ]
+train_cfg = dict(by_epoch=True, max_epochs=600)

@@ -24,4 +24,23 @@ train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True))
 optim_wrapper = dict(
     paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.))
 param_scheduler = [
+    # warm up learning rate scheduler
+    dict(
+        type='LinearLR',
+        start_factor=0.0001,
+        by_epoch=True,
+        begin=0,
+        end=5,
+        # update by iter
+        convert_to_iter_based=True),
+    # main learning rate scheduler
+    dict(
+        type='CosineAnnealingLR',
+        T_max=295,
+        eta_min=1.0e-6,
+        by_epoch=True,
+        begin=5,
+        end=300)
 ]
+train_cfg = dict(by_epoch=True, max_epochs=300)

@@ -34,7 +34,7 @@ param_scheduler = [
     dict(
         type='CosineAnnealingLR',
         T_max=295,
-        eta_min=1e-2,
+        eta_min=1e-5,
         by_epoch=True,
         begin=5,
         end=300)

@@ -34,7 +34,7 @@ param_scheduler = [
     dict(
         type='CosineAnnealingLR',
         T_max=295,
-        eta_min=1e-2,
+        eta_min=1e-5,
         by_epoch=True,
         begin=5,
         end=300)

@@ -36,10 +36,11 @@ The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don'
 | Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
 | :-----------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------------------------: | :----------------------------------------------------------------: |
+| ViT-B16 | From scratch | 224x224 | 86.86 | 33.03 | 82.37 | 96.15 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.log) |
 | ViT-B16\* | ImageNet-21k | 384x384 | 86.86 | 33.03 | 85.43 | 97.77 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth) |
+| ViT-B16 (IPU) | ImageNet-21k | 224x224 | 86.86 | 33.03 | 81.22 | 95.56 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-4xb544-ipu_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k_20220603-c215811a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k.log) |
 | ViT-B32\* | ImageNet-21k | 384x384 | 88.30 | 8.56 | 84.01 | 97.08 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p32_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth) |
 | ViT-L16\* | ImageNet-21k | 384x384 | 304.72 | 116.68 | 85.63 | 97.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth) |
-| ViT-B16 (IPU) | ImageNet-21k | 224x224 | 86.86 | 33.03 | 81.22 | 95.56 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-base-p16_ft-4xb544-ipu_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k_20220603-c215811a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_ft-4xb544-ipu_in1k.log) |
 
 *Models with \* are converted from the [official repo](https://github.com/google-research/vision_transformer#available-vit-models). The config files of these models are only for validation. We don't guarantee the training accuracy of these config files and welcome you to contribute your reproduction results.*
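For anyone who wants to try the new checkpoint directly, a quick sketch (assuming the usual mmcls.apis helpers and the demo image shipped in the repository; the config and checkpoint paths are the ones from the table above):

from mmcls.apis import inference_model, init_model

config = 'configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py'
checkpoint = 'https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.pth'
model = init_model(config, checkpoint, device='cpu')
result = inference_model(model, 'demo/demo.JPEG')  # demo image in the repo
print(result['pred_class'], result['pred_score'])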

@@ -77,3 +77,18 @@ Models:
     Weights: https://console.cloud.google.com/storage/browser/_details/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_strong1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz
     Code: https://github.com/google-research/vision_transformer/blob/88a52f8892c80c10de99194990a517b4d80485fd/vit_jax/models.py#L208
     Config: configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py
+  - Name: vit-base-p16_pt-32xb128-mae_in1k
+    In Collection: Vision Transformer
+    Metadata:
+      FLOPs: 33030000000
+      Parameters: 86860000
+      Training Data:
+        - ImageNet-1k
+    Results:
+      - Dataset: ImageNet-1k
+        Task: Image Classification
+        Metrics:
+          Top 1 Accuracy: 82.37
+          Top 5 Accuracy: 96.15
+    Weights: https://download.openmmlab.com/mmclassification/v0/vit/vit-base-p16_pt-32xb128-mae_in1k_20220623-4c544545.pth
+    Config: configs/vision_transformer/vit-base-p16_pt-32xb128-mae_in1k-224.py

@@ -0,0 +1,53 @@
+_base_ = [
+    '../_base_/datasets/imagenet_bs64_swin_224.py',
+    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
+    '../_base_/default_runtime.py'
+]
+
+# model settings
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='VisionTransformer',
+        arch='base',
+        img_size=224,
+        patch_size=16,
+        drop_path_rate=0.1),
+    neck=None,
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=1000,
+        in_channels=768,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+    ),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=.02),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8, num_classes=1000),
+        dict(type='CutMix', alpha=1.0, num_classes=1000)
+    ]))
+
+# dataset settings
+train_dataloader = dict(batch_size=128)
+
+# schedule settings
+optim_wrapper = dict(
+    optimizer=dict(
+        type='AdamW',
+        lr=1e-4 * 4096 / 256,
+        weight_decay=0.3,
+        eps=1e-8,
+        betas=(0.9, 0.95)),
+    paramwise_cfg=dict(
+        norm_decay_mult=0.0,
+        bias_decay_mult=0.0,
+        custom_keys={
+            '.cls_token': dict(decay_mult=0.0),
+            '.pos_embed': dict(decay_mult=0.0)
+        }))
+
+# runtime settings
+custom_hooks = [dict(type='EMAHook', momentum=1e-4)]
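The config name encodes the setup: 32 GPUs with 128 images each, and the learning rate follows the linear scaling rule against the 256-image reference batch. A quick check of the arithmetic in the lr expression above:

total_batch = 32 * 128              # 4096, as in the '32xb128' config name
base_lr = 1e-4 * total_batch / 256  # the expression from optim_wrapper
assert abs(base_lr - 1.6e-3) < 1e-12
print(base_lr)  # 0.0016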