commit 91b85bb4a5

15  README.md
15  README.md

@@ -58,11 +58,15 @@ The master branch works with **PyTorch 1.5+**.
## What's new

**MMClassification 1.0 is in development!**

MMClassification 1.0 has been released! It is still unstable and at the release-candidate stage. If you want to try it, go
to [the 1.x branch](https://github.com/open-mmlab/mmclassification/tree/1.x) and discuss it with us in
[the discussion](https://github.com/open-mmlab/mmclassification/discussions).

We are focusing our development effort on MMClassification 1.0. You are welcome to try it and give us your suggestions! Check out [the `1.x` branch](https://github.com/open-mmlab/mmclassification/tree/1.x) and read [the documentation site](https://mmclassification.readthedocs.io/en/1.x/).

v0.24.0 was released on 30/9/2022.
Highlights of the new version:

MMClassification 1.0 is still at the release-candidate stage, which will last until the end of 2022, and we will keep maintaining the 0.x versions at least until the end of 2023.

- Support **HorNet**, **EfficientFormer**, **SwinTransformer V2** and **MViT** backbones.
- Support the Stanford Cars dataset.

v0.23.0 was released on 1/5/2022.
Highlights of the new version:
@@ -78,7 +82,7 @@ Please refer to [changelog.md](docs/en/changelog.md) for more details and other

Below are quick steps for installation:

```shell
conda create -n open-mmlab python=3.8 pytorch=1.10 cudatoolkit=11.3 torchvision -c pytorch -y
conda create -n open-mmlab python=3.8 pytorch=1.10 cudatoolkit=11.3 torchvision==0.11.0 -c pytorch -y
conda activate open-mmlab
pip3 install openmim
mim install mmcv-full
|
||||
|
@ -140,6 +144,9 @@ Results and models are available in the [model zoo](https://mmclassification.rea
|
|||
- [x] [ConvMixer](https://github.com/open-mmlab/mmclassification/tree/master/configs/convmixer)
|
||||
- [x] [CSPNet](https://github.com/open-mmlab/mmclassification/tree/master/configs/cspnet)
|
||||
- [x] [PoolFormer](https://github.com/open-mmlab/mmclassification/tree/master/configs/poolformer)
|
||||
- [x] [MViT](https://github.com/open-mmlab/mmclassification/tree/master/configs/mvit)
|
||||
- [x] [EfficientFormer](https://github.com/open-mmlab/mmclassification/tree/master/configs/efficientformer)
|
||||
- [x] [HorNet](https://github.com/open-mmlab/mmclassification/tree/master/configs/hornet)
|
||||
|
||||
</details>
|
||||
|
||||
|
|
|
@@ -57,11 +57,12 @@ MMClassification is an open-source image classification toolbox based on PyTorch, and is part of the [O

## Changelog

**MMClassification 1.0 is in development!**

MMClassification 1.0 has been released! It is still in public beta. If you would like to try it, please switch to [the 1.x branch](https://github.com/open-mmlab/mmclassification/tree/1.x) and join the development discussion in [the discussions](https://github.com/open-mmlab/mmclassification/discussions)!

We are currently focusing our development effort on MMClassification 1.0. Everyone is welcome to try it and share suggestions! Please check out [the `1.x` branch](https://github.com/open-mmlab/mmclassification/tree/1.x) and read [the documentation site](https://mmclassification.readthedocs.io/zh_CN/1.x/) to get started.

v0.24.0 was released on 2022/9/30.

The new version is still in public beta, which will last until the end of 2022. In addition, we will keep maintaining the 0.x versions at least until the end of 2023.

- Support **HorNet**, **EfficientFormer**, **SwinTransformer V2**, **MViT** and other backbones.
- Support the Stanford Cars dataset.

v0.23.0 was released on 2022/5/1.

@@ -78,7 +79,7 @@ MMClassification is an open-source image classification toolbox based on PyTorch, and is part of the [O

Below are quick steps for installation:
|
||||
|
||||
```shell
|
||||
conda create -n open-mmlab python=3.8 pytorch=1.10 cudatoolkit=11.3 torchvision -c pytorch -y
|
||||
conda create -n open-mmlab python=3.8 pytorch=1.10 cudatoolkit=11.3 torchvision==0.11.0 -c pytorch -y
|
||||
conda activate open-mmlab
|
||||
pip3 install openmim
|
||||
mim install mmcv-full
|
||||
|
@ -140,6 +141,9 @@ pip3 install -e .
|
|||
- [x] [ConvMixer](https://github.com/open-mmlab/mmclassification/tree/master/configs/convmixer)
|
||||
- [x] [CSPNet](https://github.com/open-mmlab/mmclassification/tree/master/configs/cspnet)
|
||||
- [x] [PoolFormer](https://github.com/open-mmlab/mmclassification/tree/master/configs/poolformer)
|
||||
- [x] [MViT](https://github.com/open-mmlab/mmclassification/tree/master/configs/mvit)
|
||||
- [x] [EfficientFormer](https://github.com/open-mmlab/mmclassification/tree/master/configs/efficientformer)
|
||||
- [x] [HorNet](https://github.com/open-mmlab/mmclassification/tree/master/configs/hornet)
|
||||
|
||||
</details>
|
||||
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
_base_ = ['./pipelines/rand_aug.py']
|
||||
|
||||
# dataset settings
|
||||
dataset_type = 'ImageNet'
|
||||
img_norm_cfg = dict(
|
||||
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
||||
|
||||
train_pipeline = [
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(
|
||||
type='RandomResizedCrop',
|
||||
size=256,
|
||||
backend='pillow',
|
||||
interpolation='bicubic'),
|
||||
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
|
||||
dict(
|
||||
type='RandAugment',
|
||||
policies={{_base_.rand_increasing_policies}},
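        # `{{_base_.rand_increasing_policies}}` references the policy list defined in the
        # inherited base config './pipelines/rand_aug.py' (see `_base_` at the top of this file).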
|
||||
num_policies=2,
|
||||
total_level=10,
|
||||
magnitude_level=9,
|
||||
magnitude_std=0.5,
|
||||
hparams=dict(
|
||||
pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]],
|
||||
interpolation='bicubic')),
|
||||
dict(
|
||||
type='RandomErasing',
|
||||
erase_prob=0.25,
|
||||
mode='rand',
|
||||
min_area_ratio=0.02,
|
||||
max_area_ratio=1 / 3,
|
||||
fill_color=img_norm_cfg['mean'][::-1],
|
||||
fill_std=img_norm_cfg['std'][::-1]),
|
||||
dict(type='Normalize', **img_norm_cfg),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='ToTensor', keys=['gt_label']),
|
||||
dict(type='Collect', keys=['img', 'gt_label'])
|
||||
]
|
||||
|
||||
test_pipeline = [
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(
|
||||
type='Resize',
|
||||
size=(292, -1), # ( 256 / 224 * 256 )
|
||||
backend='pillow',
|
||||
interpolation='bicubic'),
|
||||
dict(type='CenterCrop', crop_size=256),
|
||||
dict(type='Normalize', **img_norm_cfg),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='Collect', keys=['img'])
|
||||
]
|
||||
data = dict(
|
||||
samples_per_gpu=64,
|
||||
workers_per_gpu=8,
|
||||
train=dict(
|
||||
type=dataset_type,
|
||||
data_prefix='data/imagenet/train',
|
||||
pipeline=train_pipeline),
|
||||
val=dict(
|
||||
type=dataset_type,
|
||||
data_prefix='data/imagenet/val',
|
||||
ann_file='data/imagenet/meta/val.txt',
|
||||
pipeline=test_pipeline),
|
||||
test=dict(
|
||||
# replace `data/val` with `data/test` for standard test
|
||||
type=dataset_type,
|
||||
data_prefix='data/imagenet/val',
|
||||
ann_file='data/imagenet/meta/val.txt',
|
||||
pipeline=test_pipeline))
|
||||
|
||||
evaluation = dict(interval=10, metric='accuracy')
|
|
@ -0,0 +1,46 @@
|
|||
# dataset settings
|
||||
dataset_type = 'StanfordCars'
|
||||
img_norm_cfg = dict(
|
||||
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
||||
train_pipeline = [
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(type='Resize', size=512),
|
||||
dict(type='RandomCrop', size=448),
|
||||
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
|
||||
dict(type='Normalize', **img_norm_cfg),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='ToTensor', keys=['gt_label']),
|
||||
dict(type='Collect', keys=['img', 'gt_label'])
|
||||
]
|
||||
test_pipeline = [
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(type='Resize', size=512),
|
||||
dict(type='CenterCrop', crop_size=448),
|
||||
dict(type='Normalize', **img_norm_cfg),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='Collect', keys=['img'])
|
||||
]
|
||||
|
||||
data_root = 'data/stanfordcars'
|
||||
data = dict(
|
||||
samples_per_gpu=8,
|
||||
workers_per_gpu=2,
|
||||
train=dict(
|
||||
type=dataset_type,
|
||||
data_prefix=data_root,
|
||||
test_mode=False,
|
||||
pipeline=train_pipeline),
|
||||
val=dict(
|
||||
type=dataset_type,
|
||||
data_prefix=data_root,
|
||||
test_mode=True,
|
||||
pipeline=test_pipeline),
|
||||
test=dict(
|
||||
type=dataset_type,
|
||||
data_prefix=data_root,
|
||||
test_mode=True,
|
||||
pipeline=test_pipeline))
|
||||
|
||||
evaluation = dict(
|
||||
interval=1, metric='accuracy',
|
||||
save_best='auto') # save the checkpoint with highest accuracy
|
|
@ -0,0 +1,21 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='HorNet', arch='base-gf', drop_path_rate=0.5),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=1024,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-6)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,21 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='HorNet', arch='base', drop_path_rate=0.5),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=1024,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-6)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,21 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='HorNet', arch='large-gf', drop_path_rate=0.2),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=1536,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-6)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,17 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='HorNet', arch='large-gf384', drop_path_rate=0.4),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=1536,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-6)
|
||||
])
|
|
@ -0,0 +1,21 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='HorNet', arch='large', drop_path_rate=0.2),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=1536,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-6)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,21 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='HorNet', arch='small-gf', drop_path_rate=0.4),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=768,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-6)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,21 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='HorNet', arch='small', drop_path_rate=0.4),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=768,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-6)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,21 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='HorNet', arch='tiny-gf', drop_path_rate=0.2),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=512,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-6)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,21 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='HorNet', arch='tiny', drop_path_rate=0.2),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=512,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-6)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,19 @@
|
|||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='MViT', arch='base', drop_path_rate=0.3),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
in_channels=768,
|
||||
num_classes=1000,
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,23 @@
|
|||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='MViT',
|
||||
arch='large',
|
||||
drop_path_rate=0.5,
|
||||
dim_mul_in_attention=False),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
in_channels=1152,
|
||||
num_classes=1000,
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,19 @@
|
|||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='MViT', arch='small', drop_path_rate=0.1),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
in_channels=768,
|
||||
num_classes=1000,
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,19 @@
|
|||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='MViT', arch='tiny', drop_path_rate=0.1),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
in_channels=768,
|
||||
num_classes=1000,
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,25 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='SwinTransformerV2',
|
||||
arch='base',
|
||||
img_size=256,
|
||||
drop_path_rate=0.5),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=1024,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,17 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='SwinTransformerV2',
|
||||
arch='base',
|
||||
img_size=384,
|
||||
drop_path_rate=0.2),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=1024,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False))
|
|
@ -0,0 +1,16 @@
|
|||
# model settings
|
||||
# Only for evaluation
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='SwinTransformerV2',
|
||||
arch='large',
|
||||
img_size=256,
|
||||
drop_path_rate=0.2),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=1536,
|
||||
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
|
||||
topk=(1, 5)))
|
|
@ -0,0 +1,16 @@
|
|||
# model settings
|
||||
# Only for evaluation
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='SwinTransformerV2',
|
||||
arch='large',
|
||||
img_size=384,
|
||||
drop_path_rate=0.2),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=1536,
|
||||
loss=dict(type='CrossEntropyLoss', loss_weight=1.0),
|
||||
topk=(1, 5)))
|
|
@ -0,0 +1,25 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='SwinTransformerV2',
|
||||
arch='small',
|
||||
img_size=256,
|
||||
drop_path_rate=0.3),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=768,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,25 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='SwinTransformerV2',
|
||||
arch='tiny',
|
||||
img_size=256,
|
||||
drop_path_rate=0.2),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=768,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,21 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='b0', drop_path_rate=0.1),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=256,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,21 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='b1', drop_path_rate=0.1),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=512,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
|
@ -0,0 +1,13 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='b2', drop_path_rate=0.1),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=512,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False))
|
|
@ -0,0 +1,13 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='b3', drop_path_rate=0.2),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=512,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False))
|
|
@ -0,0 +1,13 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='b4', drop_path_rate=0.2),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=512,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False))
|
|
@ -0,0 +1,13 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='b5', drop_path_rate=0.2),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=768,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False))
|
|
@ -0,0 +1,13 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='b6', drop_path_rate=0.3),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=768,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False))
|
|
@ -1,13 +1 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='base', drop_path_rate=0.1),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=512,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False))
|
||||
_base_ = ['./van-b2.py']
|
||||
|
|
|
@ -1,13 +1 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='large', drop_path_rate=0.2),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=512,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False))
|
||||
_base_ = ['./van-b3.py']
|
||||
|
|
|
@ -1,21 +1 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='small', drop_path_rate=0.1),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=512,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
||||
_base_ = ['./van-b1.py']
|
||||
|
|
|
@ -1,21 +1 @@
|
|||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(type='VAN', arch='tiny', drop_path_rate=0.1),
|
||||
neck=dict(type='GlobalAveragePooling'),
|
||||
head=dict(
|
||||
type='LinearClsHead',
|
||||
num_classes=1000,
|
||||
in_channels=256,
|
||||
init_cfg=None, # suppress the default init_cfg of LinearClsHead.
|
||||
loss=dict(
|
||||
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
|
||||
cal_acc=False),
|
||||
init_cfg=[
|
||||
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
|
||||
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
|
||||
],
|
||||
train_cfg=dict(augments=[
|
||||
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
|
||||
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
|
||||
]))
|
||||
_base_ = ['./van-b0.py']
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
# optimizer
|
||||
optimizer = dict(
|
||||
type='SGD', lr=0.003, momentum=0.9, weight_decay=0.0005, nesterov=True)
|
||||
optimizer_config = dict(grad_clip=None)
|
||||
# learning policy
|
||||
lr_config = dict(policy='step', step=[40, 70, 90])
|
||||
runner = dict(type='EpochBasedRunner', max_epochs=100)
|
|
@ -0,0 +1,36 @@
|
|||
# CSRA
|
||||
|
||||
> [Residual Attention: A Simple but Effective Method for Multi-Label Recognition](https://arxiv.org/abs/2108.02456)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
## Abstract
|
||||
|
||||
Multi-label image recognition is a challenging computer vision task of practical use. Progresses in this area, however, are often characterized by complicated methods, heavy computations, and lack of intuitive explanations. To effectively capture different spatial regions occupied by objects from different categories, we propose an embarrassingly simple module, named class-specific residual attention (CSRA). CSRA generates class-specific features for every category by proposing a simple spatial attention score, and then combines it with the class-agnostic average pooling feature. CSRA achieves state-of-the-art results on multilabel recognition, and at the same time is much simpler than them. Furthermore, with only 4 lines of code, CSRA also leads to consistent improvement across many diverse pretrained models and datasets without any extra training. CSRA is both easy to implement and light in computations, which also enjoys intuitive explanations and visualizations.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/84259897/176982245-3ffcff56-a4ea-4474-9967-bc2b612bbaa3.png" width="80%"/>
|
||||
</div>
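For intuition, the following is a minimal single-head PyTorch sketch of the CSRA logit described above, with the attention temperature taken to infinity so that the class-specific attention reduces to a spatial max. It is only an illustration of the formulation, not the `CSRAClsHead` used by the configs in this release (which also supports multiple heads and finite temperatures); the class name, `lam` value and feature shapes are assumptions for the example.

```python
import torch
import torch.nn as nn


class SimpleCSRA(nn.Module):
    """Single-head CSRA sketch: class-agnostic average logit + lam * class-specific max logit."""

    def __init__(self, in_channels: int, num_classes: int, lam: float = 0.1):
        super().__init__()
        # A 1x1 convolution applies the shared classifier weights at every spatial location.
        self.classifier = nn.Conv2d(in_channels, num_classes, kernel_size=1, bias=False)
        self.lam = lam

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, C_in, H, W) backbone feature map, taken before global average pooling.
        score = self.classifier(x).flatten(2)   # (B, num_classes, H*W) per-location class scores
        base_logit = score.mean(dim=2)          # class-agnostic average pooling branch
        att_logit = score.max(dim=2).values     # residual attention branch (temperature -> inf)
        return base_logit + self.lam * att_logit


logits = SimpleCSRA(in_channels=2048, num_classes=20, lam=0.1)(torch.randn(2, 2048, 14, 14))
print(logits.shape)  # torch.Size([2, 20])
```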
|
||||
|
||||
## Results and models
|
||||
|
||||
### VOC2007
|
||||
|
||||
| Model | Pretrain | Params(M) | Flops(G) | mAP | OF1 (%) | CF1 (%) | Config | Download |
|
||||
| :------------: | :------------------------------------------------: | :-------: | :------: | :---: | :-----: | :-----: | :-----------------------------------------------: | :-------------------------------------------------: |
|
||||
| Resnet101-CSRA | [ImageNet-1k](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth) | 23.55 | 4.12 | 94.98 | 90.80 | 89.16 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/csra/resnet101-csra_1xb16_voc07-448px.py) | [model](https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.log.json) |
|
||||
|
||||
## Citation
|
||||
|
||||
```bibtex
|
||||
@misc{https://doi.org/10.48550/arxiv.2108.02456,
|
||||
doi = {10.48550/ARXIV.2108.02456},
|
||||
url = {https://arxiv.org/abs/2108.02456},
|
||||
author = {Zhu, Ke and Wu, Jianxin},
|
||||
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
|
||||
title = {Residual Attention: A Simple but Effective Method for Multi-Label Recognition},
|
||||
publisher = {arXiv},
|
||||
year = {2021},
|
||||
copyright = {arXiv.org perpetual, non-exclusive license}
|
||||
}
|
||||
```
|
|
@ -0,0 +1,29 @@
|
|||
Collections:
|
||||
- Name: CSRA
|
||||
Metadata:
|
||||
Training Data: PASCAL VOC 2007
|
||||
Architecture:
|
||||
- Class-specific Residual Attention
|
||||
Paper:
|
||||
URL: https://arxiv.org/abs/2108.02456
|
||||
Title: 'Residual Attention: A Simple but Effective Method for Multi-Label Recognition'
|
||||
README: configs/csra/README.md
|
||||
Code:
|
||||
Version: v0.24.0
|
||||
URL: https://github.com/open-mmlab/mmclassification/blob/v0.24.0/mmcls/models/heads/multi_label_csra_head.py
|
||||
|
||||
Models:
|
||||
- Name: resnet101-csra_1xb16_voc07-448px
|
||||
Metadata:
|
||||
FLOPs: 4120000000
|
||||
Parameters: 23550000
|
||||
In Collections: CSRA
|
||||
Results:
|
||||
- Dataset: PASCAL VOC 2007
|
||||
Metrics:
|
||||
mAP: 94.98
|
||||
OF1: 90.80
|
||||
CF1: 89.16
|
||||
Task: Multi-Label Classification
|
||||
Weights: https://download.openmmlab.com/mmclassification/v0/csra/resnet101-csra_1xb16_voc07-448px_20220722-29efb40a.pth
|
||||
Config: configs/csra/resnet101-csra_1xb16_voc07-448px.py
|
|
@ -0,0 +1,75 @@
|
|||
_base_ = ['../_base_/datasets/voc_bs16.py', '../_base_/default_runtime.py']
|
||||
|
||||
# Pre-trained Checkpoint Path
checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth' # noqa
# If you want to use the pre-trained ResNet101-CutMix weight from the
# original repo (https://github.com/Kevinz-code/CSRA), the script
# 'tools/convert_models/torchvision_to_mmcls.py' can help you convert the
# weight into mmcls format. With that weight, the mAP would reach 95.5.
# checkpoint = 'PATH/TO/PRE-TRAINED_WEIGHT'
|
||||
|
||||
# model settings
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='ResNet',
|
||||
depth=101,
|
||||
num_stages=4,
|
||||
out_indices=(3, ),
|
||||
style='pytorch',
|
||||
init_cfg=dict(
|
||||
type='Pretrained', checkpoint=checkpoint, prefix='backbone')),
|
||||
neck=None,
|
||||
head=dict(
|
||||
type='CSRAClsHead',
|
||||
num_classes=20,
|
||||
in_channels=2048,
|
||||
num_heads=1,
|
||||
lam=0.1,
|
||||
loss=dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)))
|
||||
|
||||
# dataset setting
|
||||
img_norm_cfg = dict(mean=[0, 0, 0], std=[255, 255, 255], to_rgb=True)
|
||||
train_pipeline = [
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(type='RandomResizedCrop', size=448, scale=(0.7, 1.0)),
|
||||
dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
|
||||
dict(type='Normalize', **img_norm_cfg),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='ToTensor', keys=['gt_label']),
|
||||
dict(type='Collect', keys=['img', 'gt_label'])
|
||||
]
|
||||
test_pipeline = [
|
||||
dict(type='LoadImageFromFile'),
|
||||
dict(type='Resize', size=448),
|
||||
dict(type='Normalize', **img_norm_cfg),
|
||||
dict(type='ImageToTensor', keys=['img']),
|
||||
dict(type='Collect', keys=['img'])
|
||||
]
|
||||
data = dict(
|
||||
# map the difficult examples as negative ones(0)
|
||||
train=dict(pipeline=train_pipeline, difficult_as_postive=False),
|
||||
val=dict(pipeline=test_pipeline),
|
||||
test=dict(pipeline=test_pipeline))
|
||||
|
||||
# optimizer
|
||||
# the lr of classifier.head is 10 * base_lr, which helps convergence.
|
||||
optimizer = dict(
|
||||
type='SGD',
|
||||
lr=0.0002,
|
||||
momentum=0.9,
|
||||
weight_decay=0.0001,
|
||||
paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10)}))
|
||||
|
||||
optimizer_config = dict(grad_clip=None)
|
||||
|
||||
# learning policy
|
||||
lr_config = dict(
|
||||
policy='step',
|
||||
step=6,
|
||||
gamma=0.1,
|
||||
warmup='linear',
|
||||
warmup_iters=1,
|
||||
warmup_ratio=1e-7,
|
||||
warmup_by_epoch=True)
|
||||
runner = dict(type='EpochBasedRunner', max_epochs=20)
|
|
@ -0,0 +1,47 @@
|
|||
# EfficientFormer
|
||||
|
||||
> [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
## Abstract
|
||||
|
||||
Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks. However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still unsatisfactory. This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance? To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs. Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm. Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer. Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices. Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on iPhone 12 (compiled with CoreML), which runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1), and our largest model, EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can reach extremely low latency on mobile devices while maintaining high performance.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/18586273/180713426-9d3d77e3-3584-42d8-9098-625b4170d796.png" width="100%"/>
|
||||
</div>
|
||||
|
||||
## Results and models
|
||||
|
||||
### ImageNet-1k
|
||||
|
||||
| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
|
||||
| :------------------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------------------------------: | :------------------------------------------------------------------------: |
|
||||
| EfficientFormer-l1\* | 12.19 | 1.30 | 80.46 | 94.99 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientformer/efficientformer-l1_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l1_3rdparty_in1k_20220803-d66e61df.pth) |
|
||||
| EfficientFormer-l3\* | 31.41 | 3.93 | 82.45 | 96.18 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientformer/efficientformer-l3_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l3_3rdparty_in1k_20220803-dde1c8c5.pth) |
|
||||
| EfficientFormer-l7\* | 82.23 | 10.16 | 83.40 | 96.60 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientformer/efficientformer-l7_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l7_3rdparty_in1k_20220803-41a552bb.pth) |
|
||||
|
||||
*Models with * are converted from the [official repo](https://github.com/snap-research/EfficientFormer). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*
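As a quick sanity check of a converted checkpoint, the model can be loaded and run through MMClassification's high-level Python API. The sketch below assumes the 0.x `mmcls.apis` interface and the `demo/demo.JPEG` image bundled with the repository, and reuses the config path and checkpoint URL from the table above.

```python
from mmcls.apis import inference_model, init_model

config = 'configs/efficientformer/efficientformer-l1_8xb128_in1k.py'
checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l1_3rdparty_in1k_20220803-d66e61df.pth'

# Build the classifier and download the converted weights from the URL above.
model = init_model(config, checkpoint, device='cpu')

# Run single-image inference on the demo image shipped with the repository.
result = inference_model(model, 'demo/demo.JPEG')
print(result['pred_class'], result['pred_score'])
```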
|
||||
|
||||
## Citation
|
||||
|
||||
```bibtex
|
||||
@misc{https://doi.org/10.48550/arxiv.2206.01191,
|
||||
doi = {10.48550/ARXIV.2206.01191},
|
||||
|
||||
url = {https://arxiv.org/abs/2206.01191},
|
||||
|
||||
author = {Li, Yanyu and Yuan, Geng and Wen, Yang and Hu, Eric and Evangelidis, Georgios and Tulyakov, Sergey and Wang, Yanzhi and Ren, Jian},
|
||||
|
||||
keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
|
||||
|
||||
title = {EfficientFormer: Vision Transformers at MobileNet Speed},
|
||||
|
||||
publisher = {arXiv},
|
||||
|
||||
year = {2022},
|
||||
|
||||
copyright = {Creative Commons Attribution 4.0 International}
|
||||
}
|
||||
```
|
|
@ -0,0 +1,24 @@
|
|||
_base_ = [
|
||||
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
|
||||
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||
'../_base_/default_runtime.py',
|
||||
]
|
||||
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='EfficientFormer',
|
||||
arch='l1',
|
||||
drop_path_rate=0,
|
||||
init_cfg=[
|
||||
dict(
|
||||
type='TruncNormal',
|
||||
layer=['Conv2d', 'Linear'],
|
||||
std=.02,
|
||||
bias=0.),
|
||||
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-5)
|
||||
]),
|
||||
neck=dict(type='GlobalAveragePooling', dim=1),
|
||||
head=dict(
|
||||
type='EfficientFormerClsHead', in_channels=448, num_classes=1000))
|
|
@ -0,0 +1,24 @@
|
|||
_base_ = [
|
||||
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
|
||||
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||
'../_base_/default_runtime.py',
|
||||
]
|
||||
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='EfficientFormer',
|
||||
arch='l3',
|
||||
drop_path_rate=0,
|
||||
init_cfg=[
|
||||
dict(
|
||||
type='TruncNormal',
|
||||
layer=['Conv2d', 'Linear'],
|
||||
std=.02,
|
||||
bias=0.),
|
||||
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-5)
|
||||
]),
|
||||
neck=dict(type='GlobalAveragePooling', dim=1),
|
||||
head=dict(
|
||||
type='EfficientFormerClsHead', in_channels=512, num_classes=1000))
|
|
@ -0,0 +1,24 @@
|
|||
_base_ = [
|
||||
'../_base_/datasets/imagenet_bs128_poolformer_small_224.py',
|
||||
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||
'../_base_/default_runtime.py',
|
||||
]
|
||||
|
||||
model = dict(
|
||||
type='ImageClassifier',
|
||||
backbone=dict(
|
||||
type='EfficientFormer',
|
||||
arch='l7',
|
||||
drop_path_rate=0,
|
||||
init_cfg=[
|
||||
dict(
|
||||
type='TruncNormal',
|
||||
layer=['Conv2d', 'Linear'],
|
||||
std=.02,
|
||||
bias=0.),
|
||||
dict(type='Constant', layer=['GroupNorm'], val=1., bias=0.),
|
||||
dict(type='Constant', layer=['LayerScale'], val=1e-5)
|
||||
]),
|
||||
neck=dict(type='GlobalAveragePooling', dim=1),
|
||||
head=dict(
|
||||
type='EfficientFormerClsHead', in_channels=768, num_classes=1000))
|
|
@ -0,0 +1,67 @@
|
|||
Collections:
|
||||
- Name: EfficientFormer
|
||||
Metadata:
|
||||
Training Data: ImageNet-1k
|
||||
Architecture:
|
||||
- Pooling
|
||||
- 1x1 Convolution
|
||||
- LayerScale
|
||||
- MetaFormer
|
||||
Paper:
|
||||
URL: https://arxiv.org/pdf/2206.01191.pdf
|
||||
Title: "EfficientFormer: Vision Transformers at MobileNet Speed"
|
||||
README: configs/efficientformer/README.md
|
||||
Code:
|
||||
Version: v0.24.0
|
||||
URL: https://github.com/open-mmlab/mmclassification/blob/v0.24.0/mmcls/models/backbones/efficientformer.py
|
||||
|
||||
Models:
|
||||
- Name: efficientformer-l1_3rdparty_8xb128_in1k
|
||||
Metadata:
|
||||
FLOPs: 1304601088 # 1.3G
|
||||
Parameters: 12278696 # 12M
|
||||
In Collections: EfficientFormer
|
||||
Results:
|
||||
- Dataset: ImageNet-1k
|
||||
Metrics:
|
||||
Top 1 Accuracy: 80.46
|
||||
Top 5 Accuracy: 94.99
|
||||
Task: Image Classification
|
||||
Weights: https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l1_3rdparty_in1k_20220803-d66e61df.pth
|
||||
Config: configs/efficientformer/efficientformer-l1_8xb128_in1k.py
|
||||
Converted From:
|
||||
Weights: https://drive.google.com/file/d/11SbX-3cfqTOc247xKYubrAjBiUmr818y/view?usp=sharing
|
||||
Code: https://github.com/snap-research/EfficientFormer
|
||||
- Name: efficientformer-l3_3rdparty_8xb128_in1k
|
||||
Metadata:
|
||||
Training Data: ImageNet-1k
|
||||
FLOPs: 3737045760 # 3.7G
|
||||
Parameters: 31406000 # 31M
|
||||
In Collections: EfficientFormer
|
||||
Results:
|
||||
- Dataset: ImageNet-1k
|
||||
Metrics:
|
||||
Top 1 Accuracy: 82.45
|
||||
Top 5 Accuracy: 96.18
|
||||
Task: Image Classification
|
||||
Weights: https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l3_3rdparty_in1k_20220803-dde1c8c5.pth
|
||||
Config: configs/efficientformer/efficientformer-l3_8xb128_in1k.py
|
||||
Converted From:
|
||||
Weights: https://drive.google.com/file/d/1OyyjKKxDyMj-BcfInp4GlDdwLu3hc30m/view?usp=sharing
|
||||
Code: https://github.com/snap-research/EfficientFormer
|
||||
- Name: efficientformer-l7_3rdparty_8xb128_in1k
|
||||
Metadata:
|
||||
FLOPs: 10163951616 # 10.2G
|
||||
Parameters: 82229328 # 82M
|
||||
In Collections: EfficientFormer
|
||||
Results:
|
||||
- Dataset: ImageNet-1k
|
||||
Metrics:
|
||||
Top 1 Accuracy: 83.40
|
||||
Top 5 Accuracy: 96.60
|
||||
Task: Image Classification
|
||||
Weights: https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l7_3rdparty_in1k_20220803-41a552bb.pth
|
||||
Config: configs/efficientformer/efficientformer-l7_8xb128_in1k.py
|
||||
Converted From:
|
||||
Weights: https://drive.google.com/file/d/1cVw-pctJwgvGafeouynqWWCwgkcoFMM5/view?usp=sharing
|
||||
Code: https://github.com/snap-research/EfficientFormer
|
|
@ -0,0 +1,51 @@
|
|||
# HorNet
|
||||
|
||||
> [HorNet: Efficient High-Order Spatial Interactions with Recursive Gated Convolutions](https://arxiv.org/pdf/2207.14284v2.pdf)
|
||||
|
||||
<!-- [ALGORITHM] -->
|
||||
|
||||
## Abstract
|
||||
|
||||
Recent progress in vision Transformers exhibits great success in various tasks driven by the new spatial modeling mechanism based on dot-product self-attention. In this paper, we show that the key ingredients behind the vision Transformers, namely input-adaptive, long-range and high-order spatial interactions, can also be efficiently implemented with a convolution-based framework. We present the Recursive Gated Convolution (gnConv) that performs high-order spatial interactions with gated convolutions and recursive designs. The new operation is highly flexible and customizable, which is compatible with various variants of convolution and extends the two-order interactions in self-attention to arbitrary orders without introducing significant extra computation. gnConv can serve as a plug-and-play module to improve various vision Transformers and convolution-based models. Based on the operation, we construct a new family of generic vision backbones named HorNet. Extensive experiments on ImageNet classification, COCO object detection and ADE20K semantic segmentation show HorNet outperform Swin Transformers and ConvNeXt by a significant margin with similar overall architecture and training configurations. HorNet also shows favorable scalability to more training data and a larger model size. Apart from the effectiveness in visual encoders, we also show gnConv can be applied to task-specific decoders and consistently improve dense prediction performance with less computation. Our results demonstrate that gnConv can be a new basic module for visual modeling that effectively combines the merits of both vision Transformers and CNNs. Code is available at https://github.com/raoyongming/HorNet.
|
||||
|
||||
<div align=center>
|
||||
<img src="https://user-images.githubusercontent.com/24734142/188356236-b8e3db94-eaa6-48e9-b323-15e5ba7f2991.png" width="80%"/>
|
||||
</div>
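To make the recursive gating concrete, the following is a minimal PyTorch sketch of a gnConv-style block following the description above: the input is projected and split into a gate plus a stack of progressively wider features, a shared depthwise convolution supplies the spatial mixing, and pointwise projections raise the interaction order step by step. This is an illustration only, not MMClassification's `HorNet` implementation; it omits the global-filter variant and the scaling factors of the official code, and the class and argument names are ours.

```python
import torch
import torch.nn as nn


class GnConv(nn.Module):
    """Minimal sketch of recursive gated convolution (gnConv)."""

    def __init__(self, dim: int, order: int = 3):
        super().__init__()
        # Channel widths per interaction order, smallest first, e.g. [dim/4, dim/2, dim].
        self.dims = [dim // 2 ** i for i in range(order)][::-1]
        self.proj_in = nn.Conv2d(dim, 2 * dim, kernel_size=1)
        # One shared 7x7 depthwise conv over the stacked features provides the spatial mixing.
        self.dwconv = nn.Conv2d(
            sum(self.dims), sum(self.dims), kernel_size=7, padding=3, groups=sum(self.dims))
        # Pointwise projections that widen the gated signal between consecutive orders.
        self.pws = nn.ModuleList(
            nn.Conv2d(self.dims[i], self.dims[i + 1], kernel_size=1)
            for i in range(order - 1))
        self.proj_out = nn.Conv2d(dim, dim, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Split the expanded projection into the first-order gate and the stacked features.
        gate, feats = torch.split(self.proj_in(x), [self.dims[0], sum(self.dims)], dim=1)
        feats = list(torch.split(self.dwconv(feats), self.dims, dim=1))
        out = gate * feats[0]                  # 1st-order spatial interaction
        for pw, feat in zip(self.pws, feats[1:]):
            out = pw(out) * feat               # recursively raise the interaction order
        return self.proj_out(out)


out = GnConv(dim=64, order=3)(torch.randn(1, 64, 56, 56))
print(out.shape)  # torch.Size([1, 64, 56, 56])
```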
|
||||
|
||||
## Results and models
|
||||
|
||||
### ImageNet-1k
|
||||
|
||||
| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
|
||||
| :-----------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :--------------------------------------------------------------: | :----------------------------------------------------------------: |
|
||||
| HorNet-T\* | From scratch | 224x224 | 22.41 | 3.98 | 82.84 | 96.24 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hornet/hornet-tiny_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hornet/hornet-tiny_3rdparty_in1k_20220915-0e8eedff.pth) |
|
||||
| HorNet-T-GF\* | From scratch | 224x224 | 22.99 | 3.9 | 82.98 | 96.38 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hornet/hornet-tiny-gf_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hornet/hornet-tiny-gf_3rdparty_in1k_20220915-4c35a66b.pth) |
|
||||
| HorNet-S\* | From scratch | 224x224 | 49.53 | 8.83 | 83.79 | 96.75 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hornet/hornet-small_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hornet/hornet-small_3rdparty_in1k_20220915-5935f60f.pth) |
|
||||
| HorNet-S-GF\* | From scratch | 224x224 | 50.4 | 8.71 | 83.98 | 96.77 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hornet/hornet-small-gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hornet/hornet-small-gf_3rdparty_in1k_20220915-649ca492.pth) |
|
||||
| HorNet-B\* | From scratch | 224x224 | 87.26 | 15.59 | 84.24 | 96.94 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hornet/hornet-base_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hornet/hornet-base_3rdparty_in1k_20220915-a06176bb.pth) |
|
||||
| HorNet-B-GF\* | From scratch | 224x224 | 88.42 | 15.42 | 84.32 | 96.95 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hornet/hornet-base-gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hornet/hornet-base-gf_3rdparty_in1k_20220915-82c06fa7.pth) |
|
||||
|
||||
\*Models with * are converted from [the official repo](https://github.com/raoyongming/HorNet). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.
|
||||
|
||||
### Pre-trained Models
|
||||
|
||||
The models pre-trained on ImageNet-21k are used to fine-tune on downstream tasks.
|
||||
|
||||
| Model | Pretrain | resolution | Params(M) | Flops(G) | Download |
|
||||
| :--------------: | :----------: | :--------: | :-------: | :------: | :------------------------------------------------------------------------------------------------------------------------: |
|
||||
| HorNet-L\* | ImageNet-21k | 224x224 | 194.54 | 34.83 | [model](https://download.openmmlab.com/mmclassification/v0/hornet/hornet-large_3rdparty_in21k_20220909-9ccef421.pth) |
|
||||
| HorNet-L-GF\* | ImageNet-21k | 224x224 | 196.29 | 34.58 | [model](https://download.openmmlab.com/mmclassification/v0/hornet/hornet-large-gf_3rdparty_in21k_20220909-3aea3b61.pth) |
|
||||
| HorNet-L-GF384\* | ImageNet-21k | 384x384 | 201.23 | 101.63 | [model](https://download.openmmlab.com/mmclassification/v0/hornet/hornet-large-gf384_3rdparty_in21k_20220909-80894290.pth) |
|
||||
|
||||
\*Models with * are converted from [the official repo](https://github.com/raoyongming/HorNet).
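As a hedged illustration of how one of these pre-trained backbone checkpoints is typically wired into a fine-tuning config (the same `Pretrained` init_cfg pattern used by the CSRA config in this release), a downstream config might override the backbone as below. The rest of the fine-tuning config is assumed, and `prefix='backbone'` assumes the checkpoint stores a full classifier state dict; drop it if the file only contains backbone weights.

```python
model = dict(
    backbone=dict(
        type='HorNet',
        arch='large-gf',
        init_cfg=dict(
            type='Pretrained',
            checkpoint='https://download.openmmlab.com/mmclassification/v0/hornet/hornet-large-gf_3rdparty_in21k_20220909-3aea3b61.pth',
            prefix='backbone')))
```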
|
||||
|
||||
## Citation
|
||||
|
||||
```
|
||||
@article{rao2022hornet,
|
||||
title={HorNet: Efficient High-Order Spatial Interactions with Recursive Gated Convolutions},
|
||||
author={Rao, Yongming and Zhao, Wenliang and Tang, Yansong and Zhou, Jie and Lim, Ser-Nam and Lu, Jiwen},
|
||||
journal={arXiv preprint arXiv:2207.14284},
|
||||
year={2022}
|
||||
}
|
||||
```
|
|
@ -0,0 +1,13 @@
|
|||
_base_ = [
|
||||
'../_base_/models/hornet/hornet-base-gf.py',
|
||||
'../_base_/datasets/imagenet_bs64_swin_224.py',
|
||||
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||
'../_base_/default_runtime.py',
|
||||
]
|
||||
|
||||
data = dict(samples_per_gpu=64)
|
||||
|
||||
optimizer = dict(lr=4e-3)
|
||||
optimizer_config = dict(grad_clip=dict(max_norm=1.0), _delete_=True)
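# `_delete_=True` replaces the optimizer_config inherited from the base schedule instead of
# merging with it, so only the gradient clipping defined here takes effect.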
|
||||
|
||||
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
|
|
@ -0,0 +1,13 @@
|
|||
_base_ = [
|
||||
'../_base_/models/hornet/hornet-base.py',
|
||||
'../_base_/datasets/imagenet_bs64_swin_224.py',
|
||||
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||
'../_base_/default_runtime.py',
|
||||
]
|
||||
|
||||
data = dict(samples_per_gpu=64)
|
||||
|
||||
optimizer = dict(lr=4e-3)
|
||||
optimizer_config = dict(grad_clip=dict(max_norm=5.0), _delete_=True)
|
||||
|
||||
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
|
|
@ -0,0 +1,13 @@
|
|||
_base_ = [
|
||||
'../_base_/models/hornet/hornet-small-gf.py',
|
||||
'../_base_/datasets/imagenet_bs64_swin_224.py',
|
||||
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||
'../_base_/default_runtime.py',
|
||||
]
|
||||
|
||||
data = dict(samples_per_gpu=64)
|
||||
|
||||
optimizer = dict(lr=4e-3)
|
||||
optimizer_config = dict(grad_clip=dict(max_norm=1.0), _delete_=True)
|
||||
|
||||
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
|
|
@ -0,0 +1,13 @@
|
|||
_base_ = [
|
||||
'../_base_/models/hornet/hornet-small.py',
|
||||
'../_base_/datasets/imagenet_bs64_swin_224.py',
|
||||
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||
'../_base_/default_runtime.py',
|
||||
]
|
||||
|
||||
data = dict(samples_per_gpu=64)
|
||||
|
||||
optimizer = dict(lr=4e-3)
|
||||
optimizer_config = dict(grad_clip=dict(max_norm=5.0), _delete_=True)
|
||||
|
||||
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
|
|
@ -0,0 +1,13 @@
|
|||
_base_ = [
|
||||
'../_base_/models/hornet/hornet-tiny-gf.py',
|
||||
'../_base_/datasets/imagenet_bs64_swin_224.py',
|
||||
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||
'../_base_/default_runtime.py',
|
||||
]
|
||||
|
||||
data = dict(samples_per_gpu=128)
|
||||
|
||||
optimizer = dict(lr=4e-3)
|
||||
optimizer_config = dict(grad_clip=dict(max_norm=1.0), _delete_=True)
|
||||
|
||||
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
|
|
@ -0,0 +1,13 @@
|
|||
_base_ = [
|
||||
'../_base_/models/hornet/hornet-tiny.py',
|
||||
'../_base_/datasets/imagenet_bs64_swin_224.py',
|
||||
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
|
||||
'../_base_/default_runtime.py',
|
||||
]
|
||||
|
||||
data = dict(samples_per_gpu=128)
|
||||
|
||||
optimizer = dict(lr=4e-3)
|
||||
optimizer_config = dict(grad_clip=dict(max_norm=100.0), _delete_=True)
|
||||
|
||||
custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')]
|
|
@ -0,0 +1,97 @@
Collections:
  - Name: HorNet
    Metadata:
      Training Data: ImageNet-1k
      Training Techniques:
        - AdamW
        - Weight Decay
      Architecture:
        - HorNet
        - gnConv
    Paper:
      URL: https://arxiv.org/pdf/2207.14284v2.pdf
      Title: "HorNet: Efficient High-Order Spatial Interactions with Recursive Gated Convolutions"
    README: configs/hornet/README.md
    Code:
      Version: v0.24.0
      URL: https://github.com/open-mmlab/mmclassification/blob/v0.24.0/mmcls/models/backbones/hornet.py

Models:
  - Name: hornet-tiny_3rdparty_in1k
    Metadata:
      FLOPs: 3980000000 # 3.98G
      Parameters: 22410000 # 22.41M
    In Collection: HorNet
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 82.84
          Top 5 Accuracy: 96.24
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/hornet/hornet-tiny_3rdparty_in1k_20220915-0e8eedff.pth
    Config: configs/hornet/hornet-tiny_8xb128_in1k.py
  - Name: hornet-tiny-gf_3rdparty_in1k
    Metadata:
      FLOPs: 3900000000 # 3.9G
      Parameters: 22990000 # 22.99M
    In Collection: HorNet
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 82.98
          Top 5 Accuracy: 96.38
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/hornet/hornet-tiny-gf_3rdparty_in1k_20220915-4c35a66b.pth
    Config: configs/hornet/hornet-tiny-gf_8xb128_in1k.py
  - Name: hornet-small_3rdparty_in1k
    Metadata:
      FLOPs: 8830000000 # 8.83G
      Parameters: 49530000 # 49.53M
    In Collection: HorNet
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 83.79
          Top 5 Accuracy: 96.75
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/hornet/hornet-small_3rdparty_in1k_20220915-5935f60f.pth
    Config: configs/hornet/hornet-small_8xb64_in1k.py
  - Name: hornet-small-gf_3rdparty_in1k
    Metadata:
      FLOPs: 8710000000 # 8.71G
      Parameters: 50400000 # 50.4M
    In Collection: HorNet
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 83.98
          Top 5 Accuracy: 96.77
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/hornet/hornet-small-gf_3rdparty_in1k_20220915-649ca492.pth
    Config: configs/hornet/hornet-small-gf_8xb64_in1k.py
  - Name: hornet-base_3rdparty_in1k
    Metadata:
      FLOPs: 15590000000 # 15.59G
      Parameters: 87260000 # 87.26M
    In Collection: HorNet
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 84.24
          Top 5 Accuracy: 96.94
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/hornet/hornet-base_3rdparty_in1k_20220915-a06176bb.pth
    Config: configs/hornet/hornet-base_8xb64_in1k.py
  - Name: hornet-base-gf_3rdparty_in1k
    Metadata:
      FLOPs: 15420000000 # 15.42G
      Parameters: 88420000 # 88.42M
    In Collection: HorNet
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 84.32
          Top 5 Accuracy: 96.95
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/hornet/hornet-base-gf_3rdparty_in1k_20220915-82c06fa7.pth
    Config: configs/hornet/hornet-base-gf_8xb64_in1k.py
@ -1,5 +1,5 @@
_base_ = [
    '../_base_/models/mobilenet-v3-small_8xb16_cifar.py',
    '../_base_/models/mobilenet-v3-small_cifar.py',
    '../_base_/datasets/cifar10_bs16.py',
    '../_base_/schedules/cifar10_bs128.py', '../_base_/default_runtime.py'
]
@ -0,0 +1,44 @@
# MViT V2

> [MViTv2: Improved Multiscale Vision Transformers for Classification and Detection](http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf)

<!-- [ALGORITHM] -->

## Abstract

In this paper, we study Multiscale Vision Transformers (MViTv2) as a unified architecture for image and video
classification, as well as object detection. We present an improved version of MViT that incorporates
decomposed relative positional embeddings and residual pooling connections. We instantiate this architecture
in five sizes and evaluate it for ImageNet classification, COCO detection and Kinetics video recognition where
it outperforms prior work. We further compare MViTv2s' pooling attention to window attention mechanisms where
it outperforms the latter in accuracy/compute. Without bells-and-whistles, MViTv2 has state-of-the-art
performance in 3 domains: 88.8% accuracy on ImageNet classification, 58.7 boxAP on COCO object detection as
well as 86.1% on Kinetics-400 video classification.

<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/180376227-755243fa-158e-4068-940a-416036519665.png" width="50%"/>
</div>

## Results and models

### ImageNet-1k

| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------------: | :----------: | :-------: | :------: | :-------: | :-------: | :------------------------------------------------------------------: | :---------------------------------------------------------------------: |
| MViTv2-tiny\* | From scratch | 24.17 | 4.70 | 82.33 | 96.15 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-tiny_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth) |
| MViTv2-small\* | From scratch | 34.87 | 7.00 | 83.63 | 96.51 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-small_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-small_3rdparty_in1k_20220722-986bd741.pth) |
| MViTv2-base\* | From scratch | 51.47 | 10.20 | 84.34 | 96.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-base_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-base_3rdparty_in1k_20220722-9c4f0a17.pth) |
| MViTv2-large\* | From scratch | 217.99 | 42.10 | 85.25 | 97.14 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-large_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-large_3rdparty_in1k_20220722-2b57b983.pth) |

*Models with * are converted from the [official repo](https://github.com/facebookresearch/mvit). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*

## Citation

```bibtex
@inproceedings{li2021improved,
  title={MViTv2: Improved multiscale vision transformers for classification and detection},
  author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph},
  booktitle={CVPR},
  year={2022}
}
```
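For reference, the checkpoints listed in the table above can be used for inference through the `mmcls` Python API. A minimal sketch, assuming `mmcls` v0.24+ is installed, the config and checkpoint below have been downloaded locally, and the image path is only a placeholder:

```python
from mmcls.apis import inference_model, init_model

config = 'configs/mvit/mvitv2-tiny_8xb256_in1k.py'
checkpoint = 'mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth'  # from the table above

# Build the classifier and load the converted weights, then classify one image.
model = init_model(config, checkpoint, device='cpu')
result = inference_model(model, 'demo.jpg')  # path to any test image
print(result['pred_class'], result['pred_score'])
```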
@ -0,0 +1,95 @@
Collections:
  - Name: MViT V2
    Metadata:
      Architecture:
        - Attention Dropout
        - Convolution
        - Dense Connections
        - GELU
        - Layer Normalization
        - Scaled Dot-Product Attention
        - Attention Pooling
    Paper:
      URL: http://openaccess.thecvf.com//content/CVPR2022/papers/Li_MViTv2_Improved_Multiscale_Vision_Transformers_for_Classification_and_Detection_CVPR_2022_paper.pdf
      Title: 'MViTv2: Improved Multiscale Vision Transformers for Classification and Detection'
    README: configs/mvit/README.md
    Code:
      URL: https://github.com/open-mmlab/mmclassification/blob/v0.24.0/mmcls/models/backbones/mvit.py
      Version: v0.24.0

Models:
  - Name: mvitv2-tiny_3rdparty_in1k
    In Collection: MViT V2
    Metadata:
      FLOPs: 4700000000
      Parameters: 24173320
      Training Data:
        - ImageNet-1k
    Results:
      - Dataset: ImageNet-1k
        Task: Image Classification
        Metrics:
          Top 1 Accuracy: 82.33
          Top 5 Accuracy: 96.15
    Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth
    Converted From:
      Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_T_in1k.pyth
      Code: https://github.com/facebookresearch/mvit
    Config: configs/mvit/mvitv2-tiny_8xb256_in1k.py

  - Name: mvitv2-small_3rdparty_in1k
    In Collection: MViT V2
    Metadata:
      FLOPs: 7000000000
      Parameters: 34870216
      Training Data:
        - ImageNet-1k
    Results:
      - Dataset: ImageNet-1k
        Task: Image Classification
        Metrics:
          Top 1 Accuracy: 83.63
          Top 5 Accuracy: 96.51
    Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-small_3rdparty_in1k_20220722-986bd741.pth
    Converted From:
      Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_S_in1k.pyth
      Code: https://github.com/facebookresearch/mvit
    Config: configs/mvit/mvitv2-small_8xb256_in1k.py

  - Name: mvitv2-base_3rdparty_in1k
    In Collection: MViT V2
    Metadata:
      FLOPs: 10200000000
      Parameters: 51472744
      Training Data:
        - ImageNet-1k
    Results:
      - Dataset: ImageNet-1k
        Task: Image Classification
        Metrics:
          Top 1 Accuracy: 84.34
          Top 5 Accuracy: 96.86
    Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-base_3rdparty_in1k_20220722-9c4f0a17.pth
    Converted From:
      Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_B_in1k.pyth
      Code: https://github.com/facebookresearch/mvit
    Config: configs/mvit/mvitv2-base_8xb256_in1k.py

  - Name: mvitv2-large_3rdparty_in1k
    In Collection: MViT V2
    Metadata:
      FLOPs: 42100000000
      Parameters: 217992952
      Training Data:
        - ImageNet-1k
    Results:
      - Dataset: ImageNet-1k
        Task: Image Classification
        Metrics:
          Top 1 Accuracy: 85.25
          Top 5 Accuracy: 97.14
    Weights: https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-large_3rdparty_in1k_20220722-2b57b983.pth
    Converted From:
      Weights: https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_L_in1k.pyth
      Code: https://github.com/facebookresearch/mvit
    Config: configs/mvit/mvitv2-large_8xb256_in1k.py
@ -0,0 +1,29 @@
_base_ = [
    '../_base_/models/mvit/mvitv2-base.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

# dataset settings
data = dict(samples_per_gpu=256)

# schedule settings
paramwise_cfg = dict(
    norm_decay_mult=0.0,
    bias_decay_mult=0.0,
    custom_keys={
        '.pos_embed': dict(decay_mult=0.0),
        '.rel_pos_h': dict(decay_mult=0.0),
        '.rel_pos_w': dict(decay_mult=0.0)
    })

optimizer = dict(lr=0.00025, paramwise_cfg=paramwise_cfg)
optimizer_config = dict(grad_clip=dict(max_norm=1.0))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=70,
    warmup_by_epoch=True)
@ -0,0 +1,29 @@
_base_ = [
    '../_base_/models/mvit/mvitv2-large.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs2048_AdamW.py',
    '../_base_/default_runtime.py'
]

# dataset settings
data = dict(samples_per_gpu=256)

# schedule settings
paramwise_cfg = dict(
    norm_decay_mult=0.0,
    bias_decay_mult=0.0,
    custom_keys={
        '.pos_embed': dict(decay_mult=0.0),
        '.rel_pos_h': dict(decay_mult=0.0),
        '.rel_pos_w': dict(decay_mult=0.0)
    })

optimizer = dict(lr=0.00025, paramwise_cfg=paramwise_cfg)
optimizer_config = dict(grad_clip=dict(max_norm=1.0))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=70,
    warmup_by_epoch=True)
@ -0,0 +1,29 @@
_base_ = [
    '../_base_/models/mvit/mvitv2-small.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs2048_AdamW.py',
    '../_base_/default_runtime.py'
]

# dataset settings
data = dict(samples_per_gpu=256)

# schedule settings
paramwise_cfg = dict(
    norm_decay_mult=0.0,
    bias_decay_mult=0.0,
    custom_keys={
        '.pos_embed': dict(decay_mult=0.0),
        '.rel_pos_h': dict(decay_mult=0.0),
        '.rel_pos_w': dict(decay_mult=0.0)
    })

optimizer = dict(lr=0.00025, paramwise_cfg=paramwise_cfg)
optimizer_config = dict(grad_clip=dict(max_norm=1.0))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=70,
    warmup_by_epoch=True)
@ -0,0 +1,29 @@
_base_ = [
    '../_base_/models/mvit/mvitv2-tiny.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs2048_AdamW.py',
    '../_base_/default_runtime.py'
]

# dataset settings
data = dict(samples_per_gpu=256)

# schedule settings
paramwise_cfg = dict(
    norm_decay_mult=0.0,
    bias_decay_mult=0.0,
    custom_keys={
        '.pos_embed': dict(decay_mult=0.0),
        '.rel_pos_h': dict(decay_mult=0.0),
        '.rel_pos_w': dict(decay_mult=0.0)
    })

optimizer = dict(lr=0.00025, paramwise_cfg=paramwise_cfg)
optimizer_config = dict(grad_clip=dict(max_norm=1.0))

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=70,
    warmup_by_epoch=True)
@ -72,6 +72,12 @@ The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don'
| :-------: | :--------------------------------------------------: | :--------: | :-------: | :------: | :-------: | :------------------------------------------------: | :---------------------------------------------------: |
| ResNet-50 | [ImageNet-21k-mill](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_3rdparty-mill_in21k_20220331-faac000b.pth) | 448x448 | 23.92 | 16.48 | 88.45 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb8_cub.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb8_cub_20220307-57840e60.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb8_cub_20220307-57840e60.log.json) |

### Stanford-Cars

| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Config | Download |
| :-------: | :--------------------------------------------------: | :--------: | :-------: | :------: | :-------: | :------------------------------------------------: | :---------------------------------------------------: |
| ResNet-50 | [ImageNet-21k-mill](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_3rdparty-mill_in21k_20220331-faac000b.pth) | 448x448 | 23.92 | 16.48 | 92.82 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb8_cars.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb8_cars_20220812-9d85901a.pth) \| [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb8_cars_20220812-9d85901a.log.json) |

## Citation

```
@ -350,3 +350,16 @@ Models:
        Pretrain: https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_3rdparty-mill_in21k_20220331-faac000b.pth
    Weights: https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb8_cub_20220307-57840e60.pth
    Config: configs/resnet/resnet50_8xb8_cub.py
  - Name: resnet50_8xb8_cars
    Metadata:
      FLOPs: 16480000000
      Parameters: 23920000
    In Collection: ResNet
    Results:
      - Dataset: StanfordCars
        Metrics:
          Top 1 Accuracy: 92.82
        Task: Image Classification
        Pretrain: https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_3rdparty-mill_in21k_20220331-faac000b.pth
    Weights: https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb8_cars_20220812-9d85901a.pth
    Config: configs/resnet/resnet50_8xb8_cars.py
@ -0,0 +1,19 @@
_base_ = [
    '../_base_/models/resnet50.py',
    '../_base_/datasets/stanford_cars_bs8_448.py',
    '../_base_/schedules/stanford_cars_bs8.py', '../_base_/default_runtime.py'
]

# use pre-train weight converted from https://github.com/Alibaba-MIIL/ImageNet21K # noqa
checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_3rdparty-mill_in21k_20220331-faac000b.pth'  # noqa

model = dict(
    type='ImageClassifier',
    backbone=dict(
        init_cfg=dict(
            type='Pretrained', checkpoint=checkpoint, prefix='backbone')),
    head=dict(num_classes=196, ))

log_config = dict(interval=50)
checkpoint_config = dict(
    interval=1, max_keep_ckpts=3)  # save last three checkpoints
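The Stanford Cars config above initializes the backbone from the ImageNet-21k (MIIL) checkpoint through `init_cfg` and only swaps in a 196-class head. A minimal sketch of building the classifier from this config, assuming a local mmclassification checkout and `mmcls` v0.24+:

```python
from mmcv import Config
from mmcls.models import build_classifier

cfg = Config.fromfile('configs/resnet/resnet50_8xb8_cars.py')
model = build_classifier(cfg.model)
model.init_weights()  # loads the pre-trained backbone referenced by init_cfg
print(model.head.num_classes)  # 196 Stanford Cars classes
```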
@ -0,0 +1,58 @@
# Swin Transformer V2

> [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883.pdf)

<!-- [ALGORITHM] -->

## Abstract

Large-scale NLP models have been shown to significantly improve the performance on language tasks with no signs of saturation. They also demonstrate amazing few-shot capabilities like that of human beings. This paper aims to explore large-scale models in computer vision. We tackle three major issues in training and application of large vision models, including training instability, resolution gaps between pre-training and fine-tuning, and hunger on labelled data. Three main techniques are proposed: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) A log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) A self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images. Through these techniques, this paper successfully trained a 3 billion-parameter Swin Transformer V2 model, which is the largest dense vision model to date, and makes it capable of training with images of up to 1,536×1,536 resolution. It set new performance records on 4 representative vision tasks, including ImageNet-V2 image classification, COCO object detection, ADE20K semantic segmentation, and Kinetics-400 video action classification. Also note our training is much more efficient than that in Google's billion-level visual models, which consumes 40 times less labelled data and 40 times less training time.

<div align=center>
<img src="https://user-images.githubusercontent.com/42952108/180748696-ee7ed23d-7fee-4ccf-9eb5-f117db228a42.png" width="100%"/>
</div>

## Results and models

### ImageNet-21k

The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don't have evaluation results.

| Model | resolution | Params(M) | Flops(G) | Download |
| :------: | :--------: | :-------: | :------: | :--------------------------------------------------------------------------------------------------------------------------------------: |
| Swin-B\* | 192x192 | 87.92 | 8.51 | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/pretrain/swinv2-base-w12_3rdparty_in21k-192px_20220803-f7dc9763.pth) |
| Swin-L\* | 192x192 | 196.74 | 19.04 | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/pretrain/swinv2-large-w12_3rdparty_in21k-192px_20220803-d9073fee.pth) |

### ImageNet-1k

| Model | Pretrain | resolution | window | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------: | :----------: | :--------: | :----: | :-------: | :------: | :-------: | :-------: | :-------------------------------------------------------------: | :----------------------------------------------------------------: |
| Swin-T\* | From scratch | 256x256 | 8x8 | 28.35 | 4.35 | 81.76 | 95.87 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer_v2/swinv2-tiny-w8_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-tiny-w8_3rdparty_in1k-256px_20220803-e318968f.pth) |
| Swin-T\* | From scratch | 256x256 | 16x16 | 28.35 | 4.4 | 82.81 | 96.23 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer_v2/swinv2-tiny-w16_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-tiny-w16_3rdparty_in1k-256px_20220803-9651cdd7.pth) |
| Swin-S\* | From scratch | 256x256 | 8x8 | 49.73 | 8.45 | 83.74 | 96.6 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer_v2/swinv2-small-w8_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-small-w8_3rdparty_in1k-256px_20220803-b01a4332.pth) |
| Swin-S\* | From scratch | 256x256 | 16x16 | 49.73 | 8.57 | 84.13 | 96.83 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer_v2/swinv2-small-w16_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-small-w16_3rdparty_in1k-256px_20220803-b707d206.pth) |
| Swin-B\* | From scratch | 256x256 | 8x8 | 87.92 | 14.99 | 84.2 | 96.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer_v2/swinv2-base-w8_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w8_3rdparty_in1k-256px_20220803-8ff28f2b.pth) |
| Swin-B\* | From scratch | 256x256 | 16x16 | 87.92 | 15.14 | 84.6 | 97.05 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer_v2/swinv2-base-w16_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w16_3rdparty_in1k-256px_20220803-5a1886b7.pth) |
| Swin-B\* | ImageNet-21k | 256x256 | 16x16 | 87.92 | 15.14 | 86.17 | 97.88 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer_v2/swinv2-base-w16_in21k-pre_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w16_in21k-pre_3rdparty_in1k-256px_20220803-8d7aa8ad.pth) |
| Swin-B\* | ImageNet-21k | 384x384 | 24x24 | 87.92 | 34.07 | 87.14 | 98.23 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer_v2/swinv2-base-w24_in21k-pre_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w24_in21k-pre_3rdparty_in1k-384px_20220803-44eb70f8.pth) |
| Swin-L\* | ImageNet-21k | 256x256 | 16x16 | 196.75 | 33.86 | 86.93 | 98.06 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer_v2/swinv2-large-w16_in21k-pre_16xb64_in1k-256px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-large-w16_in21k-pre_3rdparty_in1k-256px_20220803-c40cbed7.pth) |
| Swin-L\* | ImageNet-21k | 384x384 | 24x24 | 196.75 | 76.2 | 87.59 | 98.27 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer_v2/swinv2-large-w24_in21k-pre_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-large-w24_in21k-pre_3rdparty_in1k-384px_20220803-3b36c165.pth) |

*Models with * are converted from the [official repo](https://github.com/microsoft/Swin-Transformer#main-results-on-imagenet-with-pretrained-models). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.*

*The ImageNet-21k pre-trained models with input resolutions of 256x256 and 384x384 are both fine-tuned from the same pre-trained model, which uses a smaller input resolution of 192x192.*

## Citation

```
@article{https://doi.org/10.48550/arxiv.2111.09883,
  doi = {10.48550/ARXIV.2111.09883},
  url = {https://arxiv.org/abs/2111.09883},
  author = {Liu, Ze and Hu, Han and Lin, Yutong and Yao, Zhuliang and Xie, Zhenda and Wei, Yixuan and Ning, Jia and Cao, Yue and Zhang, Zheng and Dong, Li and Wei, Furu and Guo, Baining},
  keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {Swin Transformer V2: Scaling Up Capacity and Resolution},
  publisher = {arXiv},
  year = {2021},
  copyright = {Creative Commons Attribution 4.0 International}
}
```
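As the note above says, the 256px and 384px ImageNet-21k fine-tuning configs both start from the 192px pre-trained checkpoints; the fine-tuning configs therefore enlarge `window_size` while recording the 12x12 windows used during pre-training. A minimal sketch of checking this, assuming a local mmclassification checkout so the `_base_` files resolve:

```python
from mmcv import Config

cfg = Config.fromfile(
    'configs/swin_transformer_v2/swinv2-base-w24_in21k-pre_16xb64_in1k-384px.py')
backbone = cfg.model.backbone
# Per-stage window sizes used at 384px fine-tuning time vs. the window sizes
# recorded from the 192px ImageNet-21k pre-training checkpoint above.
print(backbone.img_size, backbone.window_size, backbone.pretrained_window_sizes)
```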
@ -0,0 +1,204 @@
Collections:
  - Name: Swin-Transformer-V2
    Metadata:
      Training Data: ImageNet-1k
      Training Techniques:
        - AdamW
        - Weight Decay
      Training Resources: 16x V100 GPUs
      Epochs: 300
      Batch Size: 1024
      Architecture:
        - Shift Window Multihead Self Attention
    Paper:
      URL: https://arxiv.org/abs/2111.09883.pdf
      Title: "Swin Transformer V2: Scaling Up Capacity and Resolution"
    README: configs/swin_transformer_v2/README.md

Models:
  - Name: swinv2-tiny-w8_3rdparty_in1k-256px
    Metadata:
      FLOPs: 4350000000
      Parameters: 28350000
    In Collection: Swin-Transformer-V2
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 81.76
          Top 5 Accuracy: 95.87
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-tiny-w8_3rdparty_in1k-256px_20220803-e318968f.pth
    Config: configs/swin_transformer_v2/swinv2-tiny-w8_16xb64_in1k-256px.py
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window8_256.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-tiny-w16_3rdparty_in1k-256px
    Metadata:
      FLOPs: 4400000000
      Parameters: 28350000
    In Collection: Swin-Transformer-V2
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 82.81
          Top 5 Accuracy: 96.23
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-tiny-w16_3rdparty_in1k-256px_20220803-9651cdd7.pth
    Config: configs/swin_transformer_v2/swinv2-tiny-w16_16xb64_in1k-256px.py
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window16_256.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-small-w8_3rdparty_in1k-256px
    Metadata:
      FLOPs: 8450000000
      Parameters: 49730000
    In Collection: Swin-Transformer-V2
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 83.74
          Top 5 Accuracy: 96.6
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-small-w8_3rdparty_in1k-256px_20220803-b01a4332.pth
    Config: configs/swin_transformer_v2/swinv2-small-w8_16xb64_in1k-256px.py
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window8_256.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-small-w16_3rdparty_in1k-256px
    Metadata:
      FLOPs: 8570000000
      Parameters: 49730000
    In Collection: Swin-Transformer-V2
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 84.13
          Top 5 Accuracy: 96.83
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-small-w16_3rdparty_in1k-256px_20220803-b707d206.pth
    Config: configs/swin_transformer_v2/swinv2-small-w16_16xb64_in1k-256px.py
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window16_256.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-base-w8_3rdparty_in1k-256px
    Metadata:
      FLOPs: 14990000000
      Parameters: 87920000
    In Collection: Swin-Transformer-V2
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 84.2
          Top 5 Accuracy: 96.86
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w8_3rdparty_in1k-256px_20220803-8ff28f2b.pth
    Config: configs/swin_transformer_v2/swinv2-base-w8_16xb64_in1k-256px.py
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window8_256.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-base-w16_3rdparty_in1k-256px
    Metadata:
      FLOPs: 15140000000
      Parameters: 87920000
    In Collection: Swin-Transformer-V2
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 84.6
          Top 5 Accuracy: 97.05
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w16_3rdparty_in1k-256px_20220803-5a1886b7.pth
    Config: configs/swin_transformer_v2/swinv2-base-w16_16xb64_in1k-256px.py
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window16_256.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-base-w16_in21k-pre_3rdparty_in1k-256px
    Metadata:
      Training Data: ImageNet-21k
      FLOPs: 15140000000
      Parameters: 87920000
    In Collection: Swin-Transformer-V2
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 86.17
          Top 5 Accuracy: 97.88
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w16_in21k-pre_3rdparty_in1k-256px_20220803-8d7aa8ad.pth
    Config: configs/swin_transformer_v2/swinv2-base-w16_in21k-pre_16xb64_in1k-256px.py
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to16_192to256_22kto1k_ft.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-base-w24_in21k-pre_3rdparty_in1k-384px
    Metadata:
      Training Data: ImageNet-21k
      FLOPs: 34070000000
      Parameters: 87920000
    In Collection: Swin-Transformer-V2
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 87.14
          Top 5 Accuracy: 98.23
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-base-w24_in21k-pre_3rdparty_in1k-384px_20220803-44eb70f8.pth
    Config: configs/swin_transformer_v2/swinv2-base-w24_in21k-pre_16xb64_in1k-384px.py
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to24_192to384_22kto1k_ft.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-large-w16_in21k-pre_3rdparty_in1k-256px
    Metadata:
      Training Data: ImageNet-21k
      FLOPs: 33860000000
      Parameters: 196750000
    In Collection: Swin-Transformer-V2
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 86.93
          Top 5 Accuracy: 98.06
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-large-w16_in21k-pre_3rdparty_in1k-256px_20220803-c40cbed7.pth
    Config: configs/swin_transformer_v2/swinv2-large-w16_in21k-pre_16xb64_in1k-256px.py
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to16_192to256_22kto1k_ft.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-large-w24_in21k-pre_3rdparty_in1k-384px
    Metadata:
      Training Data: ImageNet-21k
      FLOPs: 76200000000
      Parameters: 196750000
    In Collection: Swin-Transformer-V2
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 87.59
          Top 5 Accuracy: 98.27
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/swinv2-large-w24_in21k-pre_3rdparty_in1k-384px_20220803-3b36c165.pth
    Config: configs/swin_transformer_v2/swinv2-large-w24_in21k-pre_16xb64_in1k-384px.py
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to24_192to384_22kto1k_ft.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-base-w12_3rdparty_in21k-192px
    Metadata:
      Training Data: ImageNet-21k
      FLOPs: 8510000000
      Parameters: 87920000
    In Collection: Swin-Transformer-V2
    Results: null
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/pretrain/swinv2-base-w12_3rdparty_in21k-192px_20220803-f7dc9763.pth
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12_192_22k.pth
      Code: https://github.com/microsoft/Swin-Transformer
  - Name: swinv2-large-w12_3rdparty_in21k-192px
    Metadata:
      Training Data: ImageNet-21k
      FLOPs: 19040000000
      Parameters: 196740000
    In Collection: Swin-Transformer-V2
    Results: null
    Weights: https://download.openmmlab.com/mmclassification/v0/swin-v2/pretrain/swinv2-large-w12_3rdparty_in21k-192px_20220803-d9073fee.pth
    Converted From:
      Weights: https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12_192_22k.pth
      Code: https://github.com/microsoft/Swin-Transformer
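The metafile above is plain YAML, so it can also be read programmatically, for example to map each model name to its config and checkpoint. A small sketch, assuming PyYAML is installed and that the file is saved as `configs/swin_transformer_v2/metafile.yml` (the exact filename is an assumption):

```python
import yaml

with open('configs/swin_transformer_v2/metafile.yml') as f:
    meta = yaml.safe_load(f)

for model in meta['Models']:
    # The ImageNet-21k pre-training entries have no Config and Results: null, hence .get().
    print(model['Name'], model.get('Config'), model['Weights'])
```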
@ -0,0 +1,8 @@
_base_ = [
    '../_base_/models/swin_transformer_v2/base_256.py',
    '../_base_/datasets/imagenet_bs64_swin_256.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

model = dict(backbone=dict(window_size=[16, 16, 16, 8]))
@ -0,0 +1,13 @@
_base_ = [
    '../_base_/models/swin_transformer_v2/base_256.py',
    '../_base_/datasets/imagenet_bs64_swin_256.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

model = dict(
    type='ImageClassifier',
    backbone=dict(
        window_size=[16, 16, 16, 8],
        drop_path_rate=0.2,
        pretrained_window_sizes=[12, 12, 12, 6]))
@ -0,0 +1,14 @@
_base_ = [
    '../_base_/models/swin_transformer_v2/base_384.py',
    '../_base_/datasets/imagenet_bs64_swin_384.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

model = dict(
    type='ImageClassifier',
    backbone=dict(
        img_size=384,
        window_size=[24, 24, 24, 12],
        drop_path_rate=0.2,
        pretrained_window_sizes=[12, 12, 12, 6]))
@ -0,0 +1,6 @@
_base_ = [
    '../_base_/models/swin_transformer_v2/base_256.py',
    '../_base_/datasets/imagenet_bs64_swin_256.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]
@ -0,0 +1,13 @@
# Only for evaluation
_base_ = [
    '../_base_/models/swin_transformer_v2/large_256.py',
    '../_base_/datasets/imagenet_bs64_swin_256.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

model = dict(
    type='ImageClassifier',
    backbone=dict(
        window_size=[16, 16, 16, 8], pretrained_window_sizes=[12, 12, 12, 6]),
)
@ -0,0 +1,15 @@
# Only for evaluation
_base_ = [
    '../_base_/models/swin_transformer_v2/large_384.py',
    '../_base_/datasets/imagenet_bs64_swin_384.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

model = dict(
    type='ImageClassifier',
    backbone=dict(
        img_size=384,
        window_size=[24, 24, 24, 12],
        pretrained_window_sizes=[12, 12, 12, 6]),
)
@ -0,0 +1,8 @@
_base_ = [
    '../_base_/models/swin_transformer_v2/small_256.py',
    '../_base_/datasets/imagenet_bs64_swin_256.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

model = dict(backbone=dict(window_size=[16, 16, 16, 8]))
@ -0,0 +1,6 @@
_base_ = [
    '../_base_/models/swin_transformer_v2/small_256.py',
    '../_base_/datasets/imagenet_bs64_swin_256.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]
@ -0,0 +1,8 @@
_base_ = [
    '../_base_/models/swin_transformer_v2/tiny_256.py',
    '../_base_/datasets/imagenet_bs64_swin_256.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

model = dict(backbone=dict(window_size=[16, 16, 16, 8]))
@ -0,0 +1,6 @@
_base_ = [
    '../_base_/models/swin_transformer_v2/tiny_256.py',
    '../_base_/datasets/imagenet_bs64_swin_256.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]
@ -16,15 +16,28 @@ While originally designed for natural language processing (NLP) tasks, the self-

### ImageNet-1k

| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :-----: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------: | :-------------------------------------------------------------------: |
| VAN-T\* | From scratch | 224x224 | 4.11 | 0.88 | 75.41 | 93.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-tiny_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-tiny_8xb128_in1k_20220501-385941af.pth) |
| VAN-S\* | From scratch | 224x224 | 13.86 | 2.52 | 81.01 | 95.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-small_8xb128_in1k_20220501-17bc91aa.pth) |
| VAN-B\* | From scratch | 224x224 | 26.58 | 5.03 | 82.80 | 96.21 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-base_8xb128_in1k_20220501-6a4cc31b.pth) |
| VAN-L\* | From scratch | 224x224 | 44.77 | 8.99 | 83.86 | 96.73 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-large_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-large_8xb128_in1k_20220501-f212ba21.pth) |
| Model | Pretrain | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :------: | :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------------------------: | :-------------------------------------------------------------------: |
| VAN-B0\* | From scratch | 224x224 | 4.11 | 0.88 | 75.41 | 93.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-b0_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-tiny_8xb128_in1k_20220501-385941af.pth) |
| VAN-B1\* | From scratch | 224x224 | 13.86 | 2.52 | 81.01 | 95.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-b1_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-small_8xb128_in1k_20220501-17bc91aa.pth) |
| VAN-B2\* | From scratch | 224x224 | 26.58 | 5.03 | 82.80 | 96.21 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-b2_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-base_8xb128_in1k_20220501-6a4cc31b.pth) |
| VAN-B3\* | From scratch | 224x224 | 44.77 | 8.99 | 83.86 | 96.73 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-b3_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-large_8xb128_in1k_20220501-f212ba21.pth) |
| VAN-B4\* | From scratch | 224x224 | 60.28 | 12.22 | 84.13 | 96.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-b4_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-b4_3rdparty_in1k_20220909-f4665b92.pth) |

\*Models with * are converted from [the official repo](https://github.com/Visual-Attention-Network/VAN-Classification). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.

### Pre-trained Models

The pre-trained models on ImageNet-21k are used to fine-tune on the downstream tasks.

| Model | Pretrain | resolution | Params(M) | Flops(G) | Download |
| :------: | :----------: | :--------: | :-------: | :------: | :---------------------------------------------------------------------------------------------------------: |
| VAN-B4\* | ImageNet-21k | 224x224 | 60.28 | 12.22 | [model](https://download.openmmlab.com/mmclassification/v0/van/van-b4_3rdparty_in21k_20220909-db926b18.pth) |
| VAN-B5\* | ImageNet-21k | 224x224 | 89.97 | 17.21 | [model](https://download.openmmlab.com/mmclassification/v0/van/van-b5_3rdparty_in21k_20220909-18e904e3.pth) |
| VAN-B6\* | ImageNet-21k | 224x224 | 283.9 | 55.28 | [model](https://download.openmmlab.com/mmclassification/v0/van/van-b6_3rdparty_in21k_20220909-96c2cb3a.pth) |

\*Models with * are converted from [the official repo](https://github.com/Visual-Attention-Network/VAN-Classification).

## Citation

```
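The ImageNet-21k checkpoints in the pre-trained table above are intended for downstream fine-tuning rather than direct evaluation. A hedged sketch of pointing a VAN config's backbone at one of them, following the `init_cfg` pattern used by the Stanford Cars config earlier in this diff; the 196-class head is a hypothetical downstream setting:

```python
from mmcv import Config

cfg = Config.fromfile('configs/van/van-b4_8xb128_in1k.py')
cfg.model.backbone.init_cfg = dict(
    type='Pretrained',
    checkpoint='https://download.openmmlab.com/mmclassification/v0/van/'
    'van-b4_3rdparty_in21k_20220909-db926b18.pth',
    # prefix assumes the checkpoint stores weights under 'backbone.',
    # as in the ResNet Stanford Cars example above.
    prefix='backbone')
cfg.model.head.num_classes = 196  # hypothetical downstream dataset
```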
@ -7,6 +7,7 @@ Collections:
        - Weight Decay
      Architecture:
        - Visual Attention Network
        - LKA
    Paper:
      URL: https://arxiv.org/pdf/2202.09741v2.pdf
      Title: "Visual Attention Network"
@ -16,10 +17,10 @@ Collections:
      Version: v0.23.0

Models:
  - Name: van-tiny_8xb128_in1k
  - Name: van-b0_3rdparty_in1k
    Metadata:
      FLOPs: 4110000 # 4.11M
      Parameters: 880000000 # 0.88G
      FLOPs: 880000000 # 0.88G
      Parameters: 4110000 # 4.11M
    In Collection: Visual-Attention-Network
    Results:
      - Dataset: ImageNet-1k
@ -28,11 +29,11 @@ Models:
          Top 5 Accuracy: 93.02
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/van/van-tiny_8xb128_in1k_20220501-385941af.pth
    Config: configs/van/van-tiny_8xb128_in1k.py
  - Name: van-small_8xb128_in1k
    Config: configs/van/van-b0_8xb128_in1k.py
  - Name: van-b1_3rdparty_in1k
    Metadata:
      FLOPs: 13860000 # 13.86M
      Parameters: 2520000000 # 2.52G
      FLOPs: 2520000000 # 2.52G
      Parameters: 13860000 # 13.86M
    In Collection: Visual-Attention-Network
    Results:
      - Dataset: ImageNet-1k
@ -41,11 +42,11 @@ Models:
          Top 5 Accuracy: 95.63
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/van/van-small_8xb128_in1k_20220501-17bc91aa.pth
    Config: configs/van/van-small_8xb128_in1k.py
  - Name: van-base_8xb128_in1k
    Config: configs/van/van-b1_8xb128_in1k.py
  - Name: van-b2_3rdparty_in1k
    Metadata:
      FLOPs: 26580000 # 26.58M
      Parameters: 5030000000 # 5.03G
      FLOPs: 5030000000 # 5.03G
      Parameters: 26580000 # 26.58M
    In Collection: Visual-Attention-Network
    Results:
      - Dataset: ImageNet-1k
@ -54,11 +55,11 @@ Models:
          Top 5 Accuracy: 96.21
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/van/van-base_8xb128_in1k_20220501-6a4cc31b.pth
    Config: configs/van/van-base_8xb128_in1k.py
  - Name: van-large_8xb128_in1k
    Config: configs/van/van-b2_8xb128_in1k.py
  - Name: van-b3_3rdparty_in1k
    Metadata:
      FLOPs: 44770000 # 44.77M
      Parameters: 8990000000 # 8.99G
      FLOPs: 8990000000 # 8.99G
      Parameters: 44770000 # 44.77M
    In Collection: Visual-Attention-Network
    Results:
      - Dataset: ImageNet-1k
@ -67,4 +68,17 @@ Models:
          Top 5 Accuracy: 96.73
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/van/van-large_8xb128_in1k_20220501-f212ba21.pth
    Config: configs/van/van-large_8xb128_in1k.py
    Config: configs/van/van-b3_8xb128_in1k.py
  - Name: van-b4_3rdparty_in1k
    Metadata:
      FLOPs: 12220000000 # 12.22G
      Parameters: 60280000 # 60.28M
    In Collection: Visual-Attention-Network
    Results:
      - Dataset: ImageNet-1k
        Metrics:
          Top 1 Accuracy: 84.13
          Top 5 Accuracy: 96.86
        Task: Image Classification
    Weights: https://download.openmmlab.com/mmclassification/v0/van/van-b4_3rdparty_in1k_20220909-f4665b92.pth
    Config: configs/van/van-b4_8xb128_in1k.py
@ -0,0 +1,61 @@
_base_ = [
    '../_base_/models/van/van_b0.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

# Note that the mean and variance used here are different from other configs
img_norm_cfg = dict(
    mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        size=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
    dict(
        type='RandAugment',
        policies={{_base_.rand_increasing_policies}},
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(
            pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]],
            interpolation='bicubic')),
    dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
    dict(
        type='RandomErasing',
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        fill_color=img_norm_cfg['mean'][::-1],
        fill_std=img_norm_cfg['std'][::-1]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='ToTensor', keys=['gt_label']),
    dict(type='Collect', keys=['img', 'gt_label'])
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='Resize',
        size=(248, -1),
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='Collect', keys=['img'])
]

data = dict(
    samples_per_gpu=128,
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
@ -0,0 +1,61 @@
_base_ = [
    '../_base_/models/van/van_b1.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

# Note that the mean and variance used here are different from other configs
img_norm_cfg = dict(
    mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        size=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
    dict(
        type='RandAugment',
        policies={{_base_.rand_increasing_policies}},
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(
            pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]],
            interpolation='bicubic')),
    dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
    dict(
        type='RandomErasing',
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        fill_color=img_norm_cfg['mean'][::-1],
        fill_std=img_norm_cfg['std'][::-1]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='ToTensor', keys=['gt_label']),
    dict(type='Collect', keys=['img', 'gt_label'])
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='Resize',
        size=(248, -1),
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='Collect', keys=['img'])
]

data = dict(
    samples_per_gpu=128,
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
@ -0,0 +1,61 @@
_base_ = [
    '../_base_/models/van/van_b2.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

# Note that the mean and variance used here are different from other configs
img_norm_cfg = dict(
    mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        size=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
    dict(
        type='RandAugment',
        policies={{_base_.rand_increasing_policies}},
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(
            pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]],
            interpolation='bicubic')),
    dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
    dict(
        type='RandomErasing',
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        fill_color=img_norm_cfg['mean'][::-1],
        fill_std=img_norm_cfg['std'][::-1]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='ToTensor', keys=['gt_label']),
    dict(type='Collect', keys=['img', 'gt_label'])
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='Resize',
        size=(248, -1),
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='Collect', keys=['img'])
]

data = dict(
    samples_per_gpu=128,
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
@ -0,0 +1,61 @@
_base_ = [
    '../_base_/models/van/van_b3.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

# Note that the mean and variance used here are different from other configs
img_norm_cfg = dict(
    mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        size=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
    dict(
        type='RandAugment',
        policies={{_base_.rand_increasing_policies}},
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(
            pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]],
            interpolation='bicubic')),
    dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
    dict(
        type='RandomErasing',
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        fill_color=img_norm_cfg['mean'][::-1],
        fill_std=img_norm_cfg['std'][::-1]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='ToTensor', keys=['gt_label']),
    dict(type='Collect', keys=['img', 'gt_label'])
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='Resize',
        size=(248, -1),
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='Collect', keys=['img'])
]

data = dict(
    samples_per_gpu=128,
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
@ -0,0 +1,61 @@
_base_ = [
    '../_base_/models/van/van_b4.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

# Note that the mean and variance used here are different from other configs
img_norm_cfg = dict(
    mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        size=224,
        backend='pillow',
        interpolation='bicubic'),
    dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'),
    dict(
        type='RandAugment',
        policies={{_base_.rand_increasing_policies}},
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(
            pad_val=[round(x) for x in img_norm_cfg['mean'][::-1]],
            interpolation='bicubic')),
    dict(type='ColorJitter', brightness=0.4, contrast=0.4, saturation=0.4),
    dict(
        type='RandomErasing',
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        fill_color=img_norm_cfg['mean'][::-1],
        fill_std=img_norm_cfg['std'][::-1]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='ToTensor', keys=['gt_label']),
    dict(type='Collect', keys=['img', 'gt_label'])
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='Resize',
        size=(248, -1),
        backend='pillow',
        interpolation='bicubic'),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='ImageToTensor', keys=['img']),
    dict(type='Collect', keys=['img'])
]

data = dict(
    samples_per_gpu=128,
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
@@ -1,61 +1,6 @@
_base_ = [
    '../_base_/models/van/van_base.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]
_base_ = ['./van-b2_8xb128_in1k.py']

[The remaining removed lines duplicate the img_norm_cfg, train_pipeline, test_pipeline and data settings shown in the config above.]

_deprecation_ = dict(
    expected='van-b2_8xb128_in1k.py',
    reference='https://github.com/open-mmlab/mmclassification/pull/1017',
)
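The hunk above reduces the old config to a one-line `_base_` redirect plus a `_deprecation_` dict, so scripts that still point at the old file name keep working. A minimal sketch of loading it with `mmcv.Config`, assuming the old path is `configs/van/van-base_8xb128_in1k.py` (the file name is inferred, the diff header does not show it):

```python
from mmcv import Config

# Loading the deprecated file still works because it now only inherits the
# renamed config (the path below is assumed for illustration).
cfg = Config.fromfile('configs/van/van-base_8xb128_in1k.py')

# The resolved settings match van-b2_8xb128_in1k.py, e.g. the batch size.
print(cfg.data.samples_per_gpu)  # 128
```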
@@ -1,61 +1,6 @@
_base_ = [
    '../_base_/models/van/van_large.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]
_base_ = ['./van-b3_8xb128_in1k.py']

[The remaining removed lines duplicate the img_norm_cfg, train_pipeline, test_pipeline and data settings shown in the config above.]

_deprecation_ = dict(
    expected='van-b3_8xb128_in1k.py',
    reference='https://github.com/open-mmlab/mmclassification/pull/1017',
)
@@ -1,61 +1,6 @@
_base_ = [
    '../_base_/models/van/van_small.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]
_base_ = ['./van-b1_8xb128_in1k.py']

[The remaining removed lines duplicate the img_norm_cfg, train_pipeline, test_pipeline and data settings shown in the config above.]

_deprecation_ = dict(
    expected='van-b1_8xb128_in1k.py',
    reference='https://github.com/open-mmlab/mmclassification/pull/1017',
)
@@ -1,61 +1,6 @@
_base_ = [
    '../_base_/models/van/van_tiny.py',
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]
_base_ = ['./van-b0_8xb128_in1k.py']

[The remaining removed lines duplicate the img_norm_cfg, train_pipeline, test_pipeline and data settings shown in the config above.]

_deprecation_ = dict(
    expected='van-b0_8xb128_in1k.py',
    reference='https://github.com/open-mmlab/mmclassification/pull/1017',
)
@@ -3,8 +3,8 @@ ARG CUDA="10.2"
ARG CUDNN="7"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

ARG MMCV="1.4.2"
ARG MMCLS="0.23.2"
ARG MMCV="1.6.2"
ARG MMCLS="0.24.0"

ENV PYTHONUNBUFFERED TRUE
@@ -39,6 +39,11 @@ VOC

.. autoclass:: VOC

StanfordCars
------------

.. autoclass:: StanfordCars

Base classes
------------
@@ -87,6 +87,8 @@ Backbones
   VAN
   VGG
   VisionTransformer
   EfficientFormer
   HorNet

.. _necks:
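The two backbones added to this API list can be built through the usual MMClassification registry helpers. A minimal sketch, assuming `HorNet` exposes a `'tiny'` arch preset (the arch name is an assumption, not stated in this diff); `EfficientFormer` follows the same pattern with its own arch names.

```python
import torch
from mmcls.models import build_backbone

# 'tiny' is an assumed arch preset; adjust to the presets actually shipped.
hornet = build_backbone(dict(type='HorNet', arch='tiny'))

x = torch.randn(1, 3, 224, 224)
feats = hornet(x)  # tuple of feature maps from the selected output stages
print([f.shape for f in feats])
```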
@@ -1,5 +1,49 @@
# Changelog

## v0.24.0(30/9/2022)

### Highlights

- Support HorNet, EfficientFormer, SwinTransformer V2 and MViT backbones.
- Support Stanford Cars dataset.

### New Features

- Support HorNet Backbone. ([#1013](https://github.com/open-mmlab/mmclassification/pull/1013))
- Support EfficientFormer. ([#954](https://github.com/open-mmlab/mmclassification/pull/954))
- Support Stanford Cars dataset. ([#893](https://github.com/open-mmlab/mmclassification/pull/893))
- Support CSRA head. ([#881](https://github.com/open-mmlab/mmclassification/pull/881))
- Support Swin Transformer V2. ([#799](https://github.com/open-mmlab/mmclassification/pull/799))
- Support MViT and add checkpoints. ([#924](https://github.com/open-mmlab/mmclassification/pull/924))

### Improvements

- \[Improve\] Replace the progress-bar loop in api/test. ([#878](https://github.com/open-mmlab/mmclassification/pull/878))
- \[Enhance\] RepVGG for YOLOX-PAI. ([#1025](https://github.com/open-mmlab/mmclassification/pull/1025))
- \[Enhancement\] Update VAN. ([#1017](https://github.com/open-mmlab/mmclassification/pull/1017))
- \[Refactor\] Re-write `get_sinusoid_encoding` from third-party implementation. ([#965](https://github.com/open-mmlab/mmclassification/pull/965))
- \[Improve\] Upgrade onnxsim to v0.4.0. ([#915](https://github.com/open-mmlab/mmclassification/pull/915))
- \[Improve\] Fixed typo in `RepVGG`. ([#985](https://github.com/open-mmlab/mmclassification/pull/985))
- \[Improve\] Use `train_step` instead of `forward` in PreciseBNHook. ([#964](https://github.com/open-mmlab/mmclassification/pull/964))
- \[Improve\] Use `forward_dummy` to calculate FLOPS. ([#953](https://github.com/open-mmlab/mmclassification/pull/953))

### Bug Fixes

- Fix warning with `torch.meshgrid`. ([#860](https://github.com/open-mmlab/mmclassification/pull/860))
- Add matplotlib minimum version requirements. ([#909](https://github.com/open-mmlab/mmclassification/pull/909))
- Val loader should not drop the last batch by default. ([#857](https://github.com/open-mmlab/mmclassification/pull/857))
- Fix config.device bug in tutorial. ([#1059](https://github.com/open-mmlab/mmclassification/pull/1059))
- Fix attention clamp max params. ([#1034](https://github.com/open-mmlab/mmclassification/pull/1034))
- Fix device mismatch in Swin-v2. ([#976](https://github.com/open-mmlab/mmclassification/pull/976))
- Fix the output position of Swin-Transformer. ([#947](https://github.com/open-mmlab/mmclassification/pull/947))

### Docs Update

- Fix typo in config.md. ([#827](https://github.com/open-mmlab/mmclassification/pull/827))
- Add version for torchvision to avoid error. ([#903](https://github.com/open-mmlab/mmclassification/pull/903))
- Fixed typo for `--out-dir` option of analyze_results.py. ([#898](https://github.com/open-mmlab/mmclassification/pull/898))
- Refine the docstring of RegNet. ([#935](https://github.com/open-mmlab/mmclassification/pull/935))

## v0.23.2(28/7/2022)

### New Features
@@ -18,7 +18,8 @@ and make sure you fill in all required information in the template.
| MMClassification version | MMCV version |
| :----------------------: | :--------------------: |
| dev | mmcv>=1.6.0, \<1.7.0 |
| 0.23.2 (master) | mmcv>=1.4.2, \<1.7.0 |
| 0.24.0 (master) | mmcv>=1.4.2, \<1.7.0 |
| 0.23.2 | mmcv>=1.4.2, \<1.7.0 |
| 0.22.1 | mmcv>=1.4.2, \<1.6.0 |
| 0.21.0 | mmcv>=1.4.2, \<=1.5.0 |
| 0.20.1 | mmcv>=1.4.2, \<=1.5.0 |
@@ -141,6 +141,13 @@ The ResNet family models below are trained by standard data augmentations, i.e.,
| VAN-S\* | 13.86 | 2.52 | 81.01 | 95.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-small_8xb128_in1k_20220501-17bc91aa.pth) |
| VAN-B\* | 26.58 | 5.03 | 82.80 | 96.21 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-base_8xb128_in1k_20220501-6a4cc31b.pth) |
| VAN-L\* | 44.77 | 8.99 | 83.86 | 96.73 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/van/van-large_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/van/van-large_8xb128_in1k_20220501-f212ba21.pth) |
| MViTv2-tiny\* | 24.17 | 4.70 | 82.33 | 96.15 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-tiny_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-tiny_3rdparty_in1k_20220722-db7beeef.pth) |
| MViTv2-small\* | 34.87 | 7.00 | 83.63 | 96.51 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-small_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-small_3rdparty_in1k_20220722-986bd741.pth) |
| MViTv2-base\* | 51.47 | 10.20 | 84.34 | 96.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-base_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-base_3rdparty_in1k_20220722-9c4f0a17.pth) |
| MViTv2-large\* | 217.99 | 42.10 | 85.25 | 97.14 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mvit/mvitv2-large_8xb256_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mvit/mvitv2-large_3rdparty_in1k_20220722-2b57b983.pth) |
| EfficientFormer-l1\* | 12.19 | 1.30 | 80.46 | 94.99 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientformer/efficientformer-l1_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l1_3rdparty_in1k_20220803-d66e61df.pth) |
| EfficientFormer-l3\* | 31.41 | 3.93 | 82.45 | 96.18 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientformer/efficientformer-l3_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l3_3rdparty_in1k_20220803-dde1c8c5.pth) |
| EfficientFormer-l7\* | 82.23 | 10.16 | 83.40 | 96.60 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientformer/efficientformer-l7_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientformer/efficientformer-l7_3rdparty_in1k_20220803-41a552bb.pth) |

*Models with * are converted from other repos, the others are trained by us.*
@@ -157,7 +157,7 @@ python tools/analysis_tools/analyze_results.py \

- `config` : The path of the model config file.
- `result`: Output result file in json/pickle format from `tools/test.py`.
- `--out_dir`: Directory to store output files.
- `--out-dir`: Directory to store output files.
- `--topk`: The number of images in successful or failed prediction with the highest `topk` scores to save. If not specified, it will be set to 20.
- `--cfg-options`: If specified, the key-value pair config will be merged into the config file, for more details please refer to [Tutorial 1: Learn about Configs](../tutorials/config.md)
@@ -171,7 +171,7 @@ In `tools/test.py`, we support using `--out-items` option to select which kind o
python tools/analysis_tools/analyze_results.py \
    configs/resnet/resnet50_b32x8_imagenet.py \
    result.pkl \
    --out_dir results \
    --out-dir results \
    --topk 50
```
@@ -324,7 +324,7 @@ data = dict(

Sometimes, you need to set `_delete_=True` to ignore some of the fields in the base config file. You can refer to [mmcv](https://mmcv.readthedocs.io/en/latest/understand_mmcv/config.html#inherit-from-base-config-with-ignored-fields) for more instructions.

The following is an example. If you wangt to use cosine schedule in the above ResNet50 case, just using inheritance and directly modify it will report `get unexcepected keyword'step'` error, because the `'step'` field of the basic config in `lr_config` domain information is reserved, and you need to add `_delete_ =True` to ignore the content of `lr_config` related fields in the basic configuration file:
The following is an example. If you want to use the cosine schedule in the above ResNet50 case, simply inheriting and modifying the config will report an `unexpected keyword 'step'` error, because the `'step'` field of the base config's `lr_config` is kept; you need to add `_delete_=True` to ignore the `lr_config` related fields in the base config file:

```python
_base_ = '../../configs/resnet/resnet50_8xb32_in1k.py'
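# A sketch of the override described above; the documentation's own example is
# truncated in this diff. The cosine fields below are the standard
# CosineAnnealingLrUpdaterHook options from mmcv and are assumptions here.
lr_config = dict(
    _delete_=True,  # ignore the inherited step-based lr_config fields
    policy='CosineAnnealing',
    min_lr=0,
    warmup='linear',
    warmup_iters=5,
    warmup_ratio=0.1,
    warmup_by_epoch=True)
```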
@@ -16,7 +16,8 @@
| MMClassification version | MMCV version |
| :----------------------: | :--------------------: |
| dev | mmcv>=1.6.0, \<1.7.0 |
| 0.23.2 (master) | mmcv>=1.4.2, \<1.6.0 |
| 0.24.0 (master) | mmcv>=1.4.2, \<1.7.0 |
| 0.23.2 | mmcv>=1.4.2, \<1.7.0 |
| 0.22.1 | mmcv>=1.4.2, \<1.6.0 |
| 0.21.0 | mmcv>=1.4.2, \<=1.5.0 |
| 0.20.1 | mmcv>=1.4.2, \<=1.5.0 |
@@ -157,7 +157,7 @@ python tools/analysis_tools/analyze_results.py \

- `config`: The path of the config file.
- `result`: The output result file of `tools/test.py`.
- `--out_dir`: The directory in which to save the analysis results.
- `--out-dir`: The directory in which to save the analysis results.
- `--topk`: The number of successfully / unsuccessfully predicted images to save. Defaults to `20` if not specified.
- `--cfg-options`: Extra config options to be merged into the config file; refer to [Tutorial 1: Learn about Configs](https://mmclassification.readthedocs.io/zh_CN/latest/tutorials/config.html).
@@ -171,7 +171,7 @@ python tools/analysis_tools/analyze_results.py \
python tools/analysis_tools/analyze_results.py \
    configs/resnet/resnet50_xxxx.py \
    result.pkl \
    --out_dir results \
    --out-dir results \
    --topk 50
```
@@ -79,8 +79,8 @@ def single_gpu_test(model,
                    **show_kwargs)

        batch_size = data['img'].size(0)
        for _ in range(batch_size):
            prog_bar.update()
        prog_bar.update(batch_size)
    return results
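The hunk above replaces a per-sample update loop with one batched update; `mmcv.ProgressBar.update` accepts the number of finished tasks. A small self-contained sketch of the same pattern, with arbitrary illustration values for the task count and batch sizes:

```python
import mmcv

num_samples = 8
prog_bar = mmcv.ProgressBar(num_samples)

for batch_size in (3, 3, 2):
    # ... process one batch of `batch_size` samples here ...
    prog_bar.update(batch_size)  # advance the bar by the whole batch at once
```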
@@ -209,6 +209,7 @@ def train_model(model,
            **loader_cfg,
            'shuffle': False,  # Not shuffle by default
            'sampler_cfg': None,  # Not use sampler by default
            'drop_last': False,  # Not drop last by default
            **cfg.data.get('val_dataloader', {}),
        }
        val_dataloader = build_dataloader(val_dataset, **val_loader_cfg)
@@ -107,7 +107,7 @@ def update_bn_stats(model: nn.Module,
    prog_bar = mmcv.ProgressBar(num_iter)

    for data in itertools.islice(loader, num_iter):
        model(**data)
        model.train_step(data)
        for i, bn in enumerate(bn_layers):
            running_means[i] += bn.running_mean / num_iter
            running_vars[i] += bn.running_var / num_iter
@@ -12,6 +12,7 @@ from .imagenet21k import ImageNet21k
from .mnist import MNIST, FashionMNIST
from .multi_label import MultiLabelDataset
from .samplers import DistributedSampler, RepeatAugSampler
from .stanford_cars import StanfordCars
from .voc import VOC

__all__ = [
@@ -19,5 +20,6 @@ __all__ = [
    'VOC', 'MultiLabelDataset', 'build_dataloader', 'build_dataset',
    'DistributedSampler', 'ConcatDataset', 'RepeatDataset',
    'ClassBalancedDataset', 'DATASETS', 'PIPELINES', 'ImageNet21k', 'SAMPLERS',
    'build_sampler', 'RepeatAugSampler', 'KFoldDataset', 'CUB', 'CustomDataset'
    'build_sampler', 'RepeatAugSampler', 'KFoldDataset', 'CUB',
    'CustomDataset', 'StanfordCars'
]
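With `StanfordCars` now exported from `mmcls.datasets`, it can be referenced from a dataset config like any other registered dataset. A minimal sketch, assuming the usual `data_prefix`/`pipeline`/`test_mode` fields of MMClassification datasets; the local path is illustrative.

```python
from mmcls.datasets import StanfordCars  # now part of the public API

# Hypothetical dataset config; the field names follow the common mmcls
# pattern and the data path is an assumption.
train_dataset = dict(
    type='StanfordCars',
    data_prefix='data/stanford_cars',
    pipeline=[],
    test_mode=False)
```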