[CodeCamp2023-335] New-format configs adapting the BEiTV2 algorithm (#1755)
parent 827a216155
commit bff80d3c48
@@ -0,0 +1,53 @@
# Copyright (c) OpenMMLab. All rights reserved.
# This is a BETA new format config file, and the usage may change in the future.
from mmengine.dataset import DefaultSampler, default_collate

from mmpretrain.datasets import (BEiTMaskGenerator, ColorJitter, ImageNet,
                                 LoadImageFromFile, PackInputs, RandomFlip,
                                 RandomResizedCropAndInterpolationWithTwoPic)
from mmpretrain.models import TwoNormDataPreprocessor

dataset_type = ImageNet
data_root = 'data/imagenet/'

data_preprocessor = dict(
    type=TwoNormDataPreprocessor,
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    second_mean=[127.5, 127.5, 127.5],
    second_std=[127.5, 127.5, 127.5],
    to_rgb=True)

train_pipeline = [
    dict(type=LoadImageFromFile),
    dict(
        type=ColorJitter, brightness=0.4, contrast=0.4, saturation=0.4,
        hue=0.),
    dict(type=RandomFlip, prob=0.5, direction='horizontal'),
    dict(
        type=RandomResizedCropAndInterpolationWithTwoPic,
        size=224,
        second_size=224,
        interpolation='bicubic',
        second_interpolation='bicubic',
        scale=(0.2, 1.0)),
    dict(
        type=BEiTMaskGenerator,
        input_size=(14, 14),
        num_masking_patches=75,
        max_num_patches=75,
        min_num_patches=16),
    dict(type=PackInputs)
]

train_dataloader = dict(
    batch_size=256,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type=DefaultSampler, shuffle=True),
    collate_fn=dict(type=default_collate),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        split='train',
        pipeline=train_pipeline))
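For BEiT v2 pretraining, `TwoNormDataPreprocessor` normalizes the same batch twice: once with the ImageNet statistics for the student backbone and once with the 127.5 statistics for the frozen VQ-KD target generator. A minimal sketch of that dual normalization, independent of mmpretrain (the tensor names are illustrative):

```python
import torch

# Illustrative only: mimics the two normalizations configured above.
mean = torch.tensor([123.675, 116.28, 103.53]).view(3, 1, 1)
std = torch.tensor([58.395, 57.12, 57.375]).view(3, 1, 1)
second_mean = torch.tensor([127.5, 127.5, 127.5]).view(3, 1, 1)
second_std = torch.tensor([127.5, 127.5, 127.5]).view(3, 1, 1)

img = torch.rand(3, 224, 224) * 255  # a fake RGB image in [0, 255]
backbone_input = (img - mean) / std                 # view fed to the backbone
target_input = (img - second_mean) / second_std     # view fed to the tokenizer
```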
@@ -0,0 +1,90 @@
# Copyright (c) OpenMMLab. All rights reserved.
# This is a BETA new format config file, and the usage may change in the future.
from mmengine.dataset import DefaultSampler, default_collate

from mmpretrain.datasets import (CenterCrop, ImageNet, LoadImageFromFile,
                                 PackInputs, RandAugment, RandomErasing,
                                 RandomFlip, RandomResizedCrop, Resize,
                                 ResizeEdge)
from mmpretrain.evaluation import Accuracy

# dataset settings
dataset_type = ImageNet
data_preprocessor = dict(
    num_classes=1000,
    # RGB format normalization parameters
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    # convert image from BGR to RGB
    to_rgb=True,
)

bgr_mean = data_preprocessor['mean'][::-1]
bgr_std = data_preprocessor['std'][::-1]

train_pipeline = [
    dict(type=LoadImageFromFile),
    dict(
        type=RandomResizedCrop,
        scale=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type=RandomFlip, prob=0.5, direction='horizontal'),
    dict(
        type=RandAugment,
        policies='timm_increasing',
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(
            pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')),
    dict(
        type=RandomErasing,
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        fill_color=bgr_mean,
        fill_std=bgr_std),
    dict(type=PackInputs),
]

test_pipeline = [
    dict(type=LoadImageFromFile),
    dict(
        type=ResizeEdge,
        scale=256,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type=CenterCrop, crop_size=224),
    dict(type=PackInputs),
]

train_dataloader = dict(
    batch_size=64,
    num_workers=5,
    dataset=dict(
        type=dataset_type,
        data_root='data/imagenet',
        split='train',
        pipeline=train_pipeline),
    sampler=dict(type=DefaultSampler, shuffle=True),
)

val_dataloader = dict(
    batch_size=64,
    num_workers=5,
    dataset=dict(
        type=dataset_type,
        data_root='data/imagenet',
        split='val',
        pipeline=test_pipeline),
    sampler=dict(type=DefaultSampler, shuffle=False),
)
val_evaluator = dict(type=Accuracy, topk=(1, 5))

# If you want standard test, please manually configure the test dataset
test_dataloader = val_dataloader
test_evaluator = val_evaluator
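The `[::-1]` slices above reverse the RGB normalization constants into BGR order, since images are still BGR when the augmentations run, and `RandAugment` pads with the rounded per-channel mean. A quick check of what those expressions evaluate to:

```python
mean = [123.675, 116.28, 103.53]  # RGB order
bgr_mean = mean[::-1]             # [103.53, 116.28, 123.675], BGR order
pad_val = [round(x) for x in bgr_mean]
print(pad_val)  # [104, 116, 124]
```

This matches the hard-coded `pad_val=[104, 116, 124]` that the downstream benchmark configs use.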
@@ -0,0 +1,141 @@
# Copyright (c) OpenMMLab. All rights reserved.
# This is a BETA new format config file, and the usage may change in the future.
from mmengine.config import read_base

with read_base():
    from ..._base_.datasets.imagenet_bs64_swin_224 import *
    from ..._base_.schedules.imagenet_bs1024_adamw_swin import *
    from ..._base_.default_runtime import *

from mmengine.hooks import CheckpointHook
from mmengine.model import PretrainedInit, TruncNormalInit
from mmengine.optim import CosineAnnealingLR, LinearLR
from torch.optim import AdamW

from mmpretrain.datasets import (CenterCrop, LoadImageFromFile, PackInputs,
                                 RandAugment, RandomErasing, RandomFlip,
                                 RandomResizedCrop, ResizeEdge)
from mmpretrain.engine.optimizers import \
    LearningRateDecayOptimWrapperConstructor
from mmpretrain.models import (BEiTViT, ImageClassifier, LabelSmoothLoss,
                               LinearClsHead)
from mmpretrain.models.utils.batch_augments import CutMix, Mixup

data_preprocessor = dict(
    num_classes=1000,
    mean=[127.5, 127.5, 127.5],
    std=[127.5, 127.5, 127.5],
    to_rgb=True,
)

# model settings
model = dict(
    type=ImageClassifier,
    backbone=dict(
        type=BEiTViT,
        arch='base',
        img_size=224,
        patch_size=16,
        drop_path_rate=0.1,
        out_type='avg_featmap',
        use_abs_pos_emb=False,
        use_rel_pos_bias=True,
        use_shared_rel_pos_bias=False,
        init_cfg=dict(type=PretrainedInit, checkpoint='', prefix='backbone.')),
    neck=None,
    head=dict(
        type=LinearClsHead,
        num_classes=1000,
        in_channels=768,
        loss=dict(type=LabelSmoothLoss, label_smooth_val=0.1, mode='original'),
        init_cfg=[dict(type=TruncNormalInit, layer='Linear', std=0.02)]),
    train_cfg=dict(
        augments=[dict(type=Mixup, alpha=0.8),
                  dict(type=CutMix, alpha=1.0)]))

train_pipeline = [
    dict(type=LoadImageFromFile),
    dict(
        type=RandomResizedCrop,
        scale=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type=RandomFlip, prob=0.5, direction='horizontal'),
    dict(
        type=RandAugment,
        policies='timm_increasing',
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
    dict(
        type=RandomErasing,
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        fill_color=[103.53, 116.28, 123.675],
        fill_std=[57.375, 57.12, 58.395]),
    dict(type=PackInputs)
]
test_pipeline = [
    dict(type=LoadImageFromFile),
    dict(
        type=ResizeEdge,
        scale=256,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type=CenterCrop, crop_size=224),
    dict(type=PackInputs)
]

train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader

# optimizer wrapper
optim_wrapper = dict(
    optimizer=dict(type=AdamW, lr=4e-3, weight_decay=0.05, betas=(0.9, 0.999)),
    constructor=LearningRateDecayOptimWrapperConstructor,
    paramwise_cfg=dict(
        _delete_=True,
        layer_decay_rate=0.65,
        custom_keys={
            # the following configurations are designed for BEiT
            '.ln': dict(decay_mult=0.0),
            '.bias': dict(decay_mult=0.0),
            'q_bias': dict(decay_mult=0.0),
            'v_bias': dict(decay_mult=0.0),
            '.cls_token': dict(decay_mult=0.0),
            '.pos_embed': dict(decay_mult=0.0),
            '.gamma': dict(decay_mult=0.0),
        }))

# learning rate scheduler
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-4,
        by_epoch=True,
        begin=0,
        end=20,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        by_epoch=True,
        begin=20,
        end=100,
        eta_min=1e-6,
        convert_to_iter_based=True)
]

# runtime settings
default_hooks = dict(
    # save checkpoint per epoch.
    checkpoint=dict(type=CheckpointHook, interval=1, max_keep_ckpts=2))

train_cfg = dict(by_epoch=True, max_epochs=100)

randomness = dict(seed=0)
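`LearningRateDecayOptimWrapperConstructor` scales the learning rate per layer: parameters near the output keep most of the base lr, while earlier layers are shrunk by `layer_decay_rate ** (num_layers - depth)`. A rough standalone sketch of the resulting scales (the depth indexing is a simplification; the real constructor derives each parameter's depth from its name, and the layer count here is an assumption for a base-size ViT):

```python
# Hypothetical illustration of layer-wise lr decay.
base_lr = 4e-3
layer_decay_rate = 0.65
num_layers = 14  # 12 transformer layers plus embedding and head, an assumption

for depth in range(num_layers + 1):
    scale = layer_decay_rate ** (num_layers - depth)
    print(f'depth {depth:2d}: lr = {base_lr * scale:.2e}')
```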
@@ -0,0 +1,50 @@
# Copyright (c) OpenMMLab. All rights reserved.
# This is a BETA new format config file, and the usage may change in the future.
from mmengine.config import read_base

with read_base():
    from ..._base_.datasets.imagenet_bs64_swin_224 import *
    from ..._base_.schedules.imagenet_bs1024_adamw_swin import *
    from ..._base_.default_runtime import *

from mmengine.model import ConstantInit, TruncNormalInit

from mmpretrain.models import (BEiTViT, ImageClassifier, LabelSmoothLoss,
                               LinearClsHead)
from mmpretrain.models.utils.batch_augments import CutMix, Mixup

data_preprocessor = dict(
    num_classes=1000,
    # RGB format normalization parameters
    mean=[127.5, 127.5, 127.5],
    std=[127.5, 127.5, 127.5],
    # convert image from BGR to RGB
    to_rgb=True,
)

model = dict(
    type=ImageClassifier,
    backbone=dict(
        type=BEiTViT,
        arch='base',
        img_size=224,
        patch_size=16,
        out_type='avg_featmap',
        use_abs_pos_emb=False,
        use_rel_pos_bias=True,
        use_shared_rel_pos_bias=False,
    ),
    neck=None,
    head=dict(
        type=LinearClsHead,
        num_classes=1000,
        in_channels=768,
        loss=dict(type=LabelSmoothLoss, label_smooth_val=0.1, mode='original'),
    ),
    init_cfg=[
        dict(type=TruncNormalInit, layer='Linear', std=.02),
        dict(type=ConstantInit, layer='LayerNorm', val=1., bias=0.),
    ],
    train_cfg=dict(
        augments=[dict(type=Mixup, alpha=0.8),
                  dict(type=CutMix, alpha=1.0)]))
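Any of these new-format configs can be loaded and run through mmengine directly, without the registry-string syntax. A minimal sketch (the config path and work dir are hypothetical):

```python
from mmengine.config import Config
from mmengine.runner import Runner

# Hypothetical path to one of the config files added in this PR.
cfg = Config.fromfile('configs/beitv2/benchmarks/beit-base-p16_8xb64_in1k.py')
cfg.work_dir = 'work_dirs/beitv2_benchmark'

runner = Runner.from_cfg(cfg)
runner.train()
```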
@@ -0,0 +1,132 @@
# Copyright (c) OpenMMLab. All rights reserved.
# This is a BETA new format config file, and the usage may change in the future.
from mmengine.config import read_base

with read_base():
    from .._base_.datasets.imagenet_bs256_beitv2 import *
    from .._base_.default_runtime import *

from mmengine.hooks import CheckpointHook
from mmengine.model import ConstantInit, PretrainedInit, TruncNormalInit
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from mmengine.runner import EpochBasedTrainLoop
from torch.optim import AdamW

from mmpretrain.models import (VQKD, BEiT, BEiTPretrainViT, BEiTV2Head,
                               BEiTV2Neck, CrossEntropyLoss)

# model settings
vqkd_encoder = dict(
    arch='base',
    img_size=224,
    patch_size=16,
    in_channels=3,
    out_indices=-1,
    drop_rate=0.,
    drop_path_rate=0.,
    norm_cfg=dict(type='LN', eps=1e-6),
    final_norm=True,
    out_type='featmap',
    with_cls_token=True,
    frozen_stages=-1,
    use_abs_pos_emb=True,
    use_rel_pos_bias=False,
    use_shared_rel_pos_bias=False,
    layer_scale_init_value=0.,
    interpolate_mode='bicubic',
    patch_cfg=dict(),
    layer_cfgs=dict(),
    init_cfg=None)

layer_scale_init_value = 0.1
drop_path_rate = 0.1  # 0. for 300 epochs and 0.1 for 1600 epochs.

model = dict(
    type=BEiT,
    backbone=dict(
        type=BEiTPretrainViT,
        arch='base',
        patch_size=16,
        out_indices=[-4, -1],
        drop_path_rate=drop_path_rate,
        final_norm=False,
        out_type='raw',
        layer_scale_init_value=layer_scale_init_value,
        init_cfg=[
            dict(type=TruncNormalInit, std=0.02, layer='Linear'),
            dict(type=TruncNormalInit, std=0.02, layer='Conv2d'),
            dict(type=ConstantInit, layer='LayerNorm', val=1.0, bias=0.0)
        ]),
    neck=dict(
        type=BEiTV2Neck,
        num_layers=2,
        early_layers=9,
        backbone_arch='base',
        drop_path_rate=drop_path_rate,
        layer_scale_init_value=layer_scale_init_value,
    ),
    head=dict(
        type=BEiTV2Head,
        embed_dims=768,
        num_embed=8192,
        loss=dict(type=CrossEntropyLoss)),
    target_generator=dict(
        type=VQKD,
        encoder_config=vqkd_encoder,
        init_cfg=dict(
            type=PretrainedInit,
            checkpoint=  # noqa
            'https://download.openmmlab.com/mmselfsup/1.x/target_generator_ckpt/vqkd_encoder.pth'  # noqa
        )))

# optimizer wrapper
optim_wrapper = dict(
    type=AmpOptimWrapper,
    loss_scale='dynamic',
    # betas: (0.9, 0.98) for 300 epochs and (0.9, 0.999) for 1600 epochs.
    optimizer=dict(
        type=AdamW, lr=1.5e-3, betas=(0.9, 0.999), weight_decay=0.05),
    clip_grad=dict(max_norm=3.0),
    paramwise_cfg=dict(
        custom_keys={
            # the following configurations are designed for BEiT
            '.ln': dict(decay_mult=0.0),
            '.bias': dict(decay_mult=0.0),
            'q_bias': dict(decay_mult=0.0),
            'v_bias': dict(decay_mult=0.0),
            '.cls_token': dict(decay_mult=0.0),
            '.pos_embed': dict(decay_mult=0.0),
            '.gamma': dict(decay_mult=0.0),
        }))

# learning rate scheduler
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-4,
        by_epoch=True,
        begin=0,
        end=10,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=1e-5,
        by_epoch=True,
        begin=10,
        end=1600,
        convert_to_iter_based=True)
]

# runtime settings
train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=1600)
default_hooks = dict(
    # only keeps the latest 3 checkpoints
    checkpoint=dict(type=CheckpointHook, interval=1, max_keep_ckpts=3))

randomness = dict(seed=0, diff_rank_seed=True)

find_unused_parameters = True

# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
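The schedule above is a 10-epoch linear warmup from `1e-4 * lr` followed by cosine annealing to `eta_min` at epoch 1600, with both phases converted to per-iteration steps. A standalone sketch of the resulting epoch-level curve:

```python
import math

base_lr, start_factor, eta_min = 1.5e-3, 1e-4, 1e-5
warmup_end, total = 10, 1600

def lr_at(epoch: float) -> float:
    # Epoch-level approximation of LinearLR followed by CosineAnnealingLR.
    if epoch < warmup_end:
        factor = start_factor + (1 - start_factor) * epoch / warmup_end
        return base_lr * factor
    t = (epoch - warmup_end) / (total - warmup_end)
    return eta_min + (base_lr - eta_min) * 0.5 * (1 + math.cos(math.pi * t))

for e in (0, 5, 10, 800, 1600):
    print(e, f'{lr_at(e):.2e}')
```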
@@ -0,0 +1,132 @@
# Copyright (c) OpenMMLab. All rights reserved.
# This is a BETA new format config file, and the usage may change in the future.
from mmengine.config import read_base

with read_base():
    from .._base_.datasets.imagenet_bs256_beitv2 import *
    from .._base_.default_runtime import *

from mmengine.hooks import CheckpointHook
from mmengine.model import ConstantInit, PretrainedInit, TruncNormalInit
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from mmengine.runner import EpochBasedTrainLoop
from torch.optim import AdamW

from mmpretrain.models import (VQKD, BEiT, BEiTPretrainViT, BEiTV2Head,
                               BEiTV2Neck, CrossEntropyLoss)

# model settings
vqkd_encoder = dict(
    arch='base',
    img_size=224,
    patch_size=16,
    in_channels=3,
    out_indices=-1,
    drop_rate=0.,
    drop_path_rate=0.,
    norm_cfg=dict(type='LN', eps=1e-6),
    final_norm=True,
    out_type='featmap',
    with_cls_token=True,
    frozen_stages=-1,
    use_abs_pos_emb=True,
    use_rel_pos_bias=False,
    use_shared_rel_pos_bias=False,
    layer_scale_init_value=0.,
    interpolate_mode='bicubic',
    patch_cfg=dict(),
    layer_cfgs=dict(),
    init_cfg=None)

layer_scale_init_value = 0.1
drop_path_rate = 0.  # 0. for 300 epochs and 0.1 for 1600 epochs.

model = dict(
    type=BEiT,
    backbone=dict(
        type=BEiTPretrainViT,
        arch='base',
        patch_size=16,
        out_indices=[-4, -1],
        drop_path_rate=drop_path_rate,
        final_norm=False,
        out_type='raw',
        layer_scale_init_value=layer_scale_init_value,
        init_cfg=[
            dict(type=TruncNormalInit, std=0.02, layer='Linear'),
            dict(type=TruncNormalInit, std=0.02, layer='Conv2d'),
            dict(type=ConstantInit, layer='LayerNorm', val=1.0, bias=0.0)
        ]),
    neck=dict(
        type=BEiTV2Neck,
        num_layers=2,
        early_layers=9,
        backbone_arch='base',
        drop_path_rate=drop_path_rate,
        layer_scale_init_value=layer_scale_init_value,
    ),
    head=dict(
        type=BEiTV2Head,
        embed_dims=768,
        num_embed=8192,
        loss=dict(type=CrossEntropyLoss)),
    target_generator=dict(
        type=VQKD,
        encoder_config=vqkd_encoder,
        init_cfg=dict(
            type=PretrainedInit,
            checkpoint=  # noqa
            'https://download.openmmlab.com/mmselfsup/1.x/target_generator_ckpt/vqkd_encoder.pth'  # noqa
        )))

# optimizer wrapper
optim_wrapper = dict(
    type=AmpOptimWrapper,
    loss_scale='dynamic',
    # betas: (0.9, 0.98) for 300 epochs and (0.9, 0.999) for 1600 epochs.
    optimizer=dict(
        type=AdamW, lr=1.5e-3, betas=(0.9, 0.98), weight_decay=0.05),
    clip_grad=dict(max_norm=3.0),
    paramwise_cfg=dict(
        custom_keys={
            # the following configurations are designed for BEiT
            '.ln': dict(decay_mult=0.0),
            '.bias': dict(decay_mult=0.0),
            'q_bias': dict(decay_mult=0.0),
            'v_bias': dict(decay_mult=0.0),
            '.cls_token': dict(decay_mult=0.0),
            '.pos_embed': dict(decay_mult=0.0),
            '.gamma': dict(decay_mult=0.0),
        }))

# learning rate scheduler
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-4,
        by_epoch=True,
        begin=0,
        end=10,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=1e-5,
        by_epoch=True,
        begin=10,
        end=300,
        convert_to_iter_based=True)
]

# runtime settings
train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=300)
default_hooks = dict(
    # only keeps the latest 3 checkpoints
    checkpoint=dict(type=CheckpointHook, interval=1, max_keep_ckpts=3))

randomness = dict(seed=0, diff_rank_seed=True)

find_unused_parameters = True

# NOTE: `auto_scale_lr` is for automatically scaling LR
# based on the actual training batch size.
auto_scale_lr = dict(base_batch_size=2048)
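When `auto_scale_lr` is enabled (via the `--auto-scale-lr` flag of the training script), mmengine rescales the optimizer lr linearly by the ratio of the actual total batch size to `base_batch_size`. The rule, in a few lines:

```python
# Linear scaling rule applied by auto_scale_lr.
base_lr, base_batch_size = 1.5e-3, 2048
actual_batch_size = 8 * 256  # 8 GPUs x 256 per-GPU batch, as in the dataset config
scaled_lr = base_lr * actual_batch_size / base_batch_size  # == 1.5e-3 here
```

With the dataset config's 8 x 256 setup the ratio is 1, so the lr is unchanged; fewer GPUs would shrink it proportionally.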
@@ -0,0 +1,136 @@
# Copyright (c) OpenMMLab. All rights reserved.
# This is a BETA new format config file, and the usage may change in the future.
from mmengine.config import read_base

with read_base():
    from ..._base_.datasets.imagenet_bs64_swin_224 import *
    from ..._base_.schedules.imagenet_bs1024_adamw_swin import *
    from ..._base_.default_runtime import *

from mmengine.hooks import CheckpointHook
from mmengine.model import PretrainedInit, TruncNormalInit
from mmengine.optim import CosineAnnealingLR, LinearLR
from torch.optim import AdamW

from mmpretrain.datasets import (CenterCrop, LoadImageFromFile, PackInputs,
                                 RandAugment, RandomErasing, RandomFlip,
                                 RandomResizedCrop, ResizeEdge)
from mmpretrain.engine.optimizers import \
    LearningRateDecayOptimWrapperConstructor
from mmpretrain.models import (BEiTViT, ImageClassifier, LabelSmoothLoss,
                               LinearClsHead)
from mmpretrain.models.utils.batch_augments import CutMix, Mixup

# model settings
model = dict(
    type=ImageClassifier,
    backbone=dict(
        type=BEiTViT,
        arch='base',
        img_size=224,
        patch_size=16,
        # 0.2 for 1600 epochs pretrained models and 0.1 for 300 epochs.
        drop_path_rate=0.1,
        out_type='avg_featmap',
        use_abs_pos_emb=False,
        use_rel_pos_bias=True,
        use_shared_rel_pos_bias=False,
        init_cfg=dict(type=PretrainedInit, checkpoint='', prefix='backbone.')),
    neck=None,
    head=dict(
        type=LinearClsHead,
        num_classes=1000,
        in_channels=768,
        loss=dict(type=LabelSmoothLoss, label_smooth_val=0.1, mode='original'),
        init_cfg=[dict(type=TruncNormalInit, layer='Linear', std=0.02)]),
    train_cfg=dict(
        augments=[dict(type=Mixup, alpha=0.8),
                  dict(type=CutMix, alpha=1.0)]))

train_pipeline = [
    dict(type=LoadImageFromFile),
    dict(
        type=RandomResizedCrop,
        scale=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type=RandomFlip, prob=0.5, direction='horizontal'),
    dict(
        type=RandAugment,
        policies='timm_increasing',
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(pad_val=[104, 116, 124], interpolation='bicubic')),
    dict(
        type=RandomErasing,
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        fill_color=[103.53, 116.28, 123.675],
        fill_std=[57.375, 57.12, 58.395]),
    dict(type=PackInputs)
]
test_pipeline = [
    dict(type=LoadImageFromFile),
    dict(
        type=ResizeEdge,
        scale=256,
        edge='short',
        backend='pillow',
        interpolation='bicubic'),
    dict(type=CenterCrop, crop_size=224),
    dict(type=PackInputs)
]

train_dataloader = dict(batch_size=128, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(batch_size=128, dataset=dict(pipeline=test_pipeline))
test_dataloader = val_dataloader

# optimizer wrapper
optim_wrapper = dict(
    optimizer=dict(type=AdamW, lr=5e-4, weight_decay=0.05, betas=(0.9, 0.999)),
    constructor=LearningRateDecayOptimWrapperConstructor,
    paramwise_cfg=dict(
        _delete_=True,
        # 0.6 for 1600 epochs pretrained models and 0.65 for 300 epochs
        layer_decay_rate=0.65,
        custom_keys={
            # the following configurations are designed for BEiT
            '.ln': dict(decay_mult=0.0),
            '.bias': dict(decay_mult=0.0),
            'q_bias': dict(decay_mult=0.0),
            'v_bias': dict(decay_mult=0.0),
            '.cls_token': dict(decay_mult=0.0),
            '.pos_embed': dict(decay_mult=0.0),
            '.gamma': dict(decay_mult=0.0),
        }))

# learning rate scheduler
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-4,
        by_epoch=True,
        begin=0,
        end=20,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        by_epoch=True,
        begin=20,
        end=100,
        eta_min=1e-6,
        convert_to_iter_based=True)
]

# runtime settings
default_hooks = dict(
    # save checkpoint per epoch.
    checkpoint=dict(type=CheckpointHook, interval=1, max_keep_ckpts=2))

train_cfg = dict(by_epoch=True, max_epochs=100)

randomness = dict(seed=0)
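The backbone's `init_cfg` above ships with an empty `checkpoint=''`; before finetuning it has to point at a BEiT v2 pretraining checkpoint. One way to fill it in without editing the file, sketched with hypothetical paths:

```python
from mmengine.config import Config

# Hypothetical config and checkpoint paths.
cfg = Config.fromfile(
    'configs/beitv2/benchmarks/beit-base-p16_8xb128-coslr-100e_in1k.py')
cfg.model.backbone.init_cfg.checkpoint = \
    'work_dirs/beitv2_pretrain/epoch_1600.pth'
```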
@@ -0,0 +1,41 @@
# Copyright (c) OpenMMLab. All rights reserved.
# This is a BETA new format config file, and the usage may change in the future.
from mmengine.config import read_base

with read_base():
    from ..._base_.datasets.imagenet_bs64_swin_224 import *
    from ..._base_.schedules.imagenet_bs1024_adamw_swin import *
    from ..._base_.default_runtime import *

from mmengine.model import ConstantInit, TruncNormalInit

from mmpretrain.models import (BEiTViT, ImageClassifier, LabelSmoothLoss,
                               LinearClsHead)
from mmpretrain.models.utils.batch_augments import CutMix, Mixup

model = dict(
    type=ImageClassifier,
    backbone=dict(
        type=BEiTViT,
        arch='base',
        img_size=224,
        patch_size=16,
        out_type='avg_featmap',
        use_abs_pos_emb=False,
        use_rel_pos_bias=True,
        use_shared_rel_pos_bias=False,
    ),
    neck=None,
    head=dict(
        type=LinearClsHead,
        num_classes=1000,
        in_channels=768,
        loss=dict(type=LabelSmoothLoss, label_smooth_val=0.1, mode='original'),
    ),
    init_cfg=[
        dict(type=TruncNormalInit, layer='Linear', std=.02),
        dict(type=ConstantInit, layer='LayerNorm', val=1., bias=0.),
    ],
    train_cfg=dict(
        augments=[dict(type=Mixup, alpha=0.8),
                  dict(type=CutMix, alpha=1.0)]))