[Feature] Support ConvNeXt-V2 backbone. (#1294)

* [Feature] Support ConvNeXt-V2.

* Use registry of mmcls instead of mmengine.

* Add README.

* Add unit tests and docs.
Ma Zerun 2023-01-06 16:13:41 +08:00 committed by GitHub
parent e880451a54
commit 060b0ed3b5
41 changed files with 1432 additions and 137 deletions


@ -96,6 +96,7 @@ def inference(config_file, checkpoint, work_dir, args, exp_name):
data = default_collate([data] * args.batch_size)
resolution = tuple(data['inputs'].shape[-2:])
model = Runner.from_cfg(cfg).model
load_checkpoint(model, checkpoint, map_location='cpu')
forward = model.val_step
else:
# For configs only for get model.


@ -1,27 +1,17 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='base',
out_indices=(3, ),
drop_path_rate=0.5,
gap_before_final_norm=True,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.),
]),
backbone=dict(type='ConvNeXt', arch='base', drop_path_rate=0.5),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0),


@ -1,27 +1,17 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='large',
out_indices=(3, ),
drop_path_rate=0.5,
gap_before_final_norm=True,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.),
]),
backbone=dict(type='ConvNeXt', arch='large', drop_path_rate=0.5),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1536,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0),


@ -1,27 +1,17 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='small',
out_indices=(3, ),
drop_path_rate=0.4,
gap_before_final_norm=True,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.),
]),
backbone=dict(type='ConvNeXt', arch='small', drop_path_rate=0.4),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0),


@ -1,27 +1,17 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='tiny',
out_indices=(3, ),
drop_path_rate=0.1,
gap_before_final_norm=True,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.),
]),
backbone=dict(type='ConvNeXt', arch='tiny', drop_path_rate=0.1),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0),


@ -1,27 +1,17 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='xlarge',
out_indices=(3, ),
drop_path_rate=0.5,
gap_before_final_norm=True,
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.),
]),
backbone=dict(type='ConvNeXt', arch='xlarge', drop_path_rate=0.5),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=2048,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0),


@ -0,0 +1,20 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='atto',
drop_path_rate=0.1,
layer_scale_init_value=0.,
use_grn=True,
),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=320,
loss=dict(type='LabelSmoothLoss', label_smooth_val=0.2),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
)


@ -0,0 +1,24 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='base',
drop_path_rate=0.1,
layer_scale_init_value=0.,
use_grn=True,
),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1024,
loss=dict(type='LabelSmoothLoss', label_smooth_val=0.1),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0),
]),
)


@ -0,0 +1,20 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='femto',
drop_path_rate=0.1,
layer_scale_init_value=0.,
use_grn=True,
),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=384,
loss=dict(type='LabelSmoothLoss', label_smooth_val=0.1),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
)


@ -0,0 +1,24 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='huge',
drop_path_rate=0.1,
layer_scale_init_value=0.,
use_grn=True,
),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=2816,
loss=dict(type='LabelSmoothLoss', label_smooth_val=0.1),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0),
]),
)


@ -0,0 +1,24 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='large',
drop_path_rate=0.1,
layer_scale_init_value=0.,
use_grn=True,
),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=1536,
loss=dict(type='LabelSmoothLoss', label_smooth_val=0.1),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0),
]),
)


@ -0,0 +1,20 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='nano',
drop_path_rate=0.1,
layer_scale_init_value=0.,
use_grn=True,
),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=640,
loss=dict(type='LabelSmoothLoss', label_smooth_val=0.2),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
)


@ -0,0 +1,20 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='pico',
drop_path_rate=0.1,
layer_scale_init_value=0.,
use_grn=True,
),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=512,
loss=dict(type='LabelSmoothLoss', label_smooth_val=0.1),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
)


@ -0,0 +1,24 @@
# Model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='ConvNeXt',
arch='tiny',
drop_path_rate=0.2,
layer_scale_init_value=0.,
use_grn=True,
),
head=dict(
type='LinearClsHead',
num_classes=1000,
in_channels=768,
loss=dict(type='LabelSmoothLoss', label_smooth_val=0.2),
init_cfg=None,
),
init_cfg=dict(
type='TruncNormal', layer=['Conv2d', 'Linear'], std=.02, bias=0.),
train_cfg=dict(augments=[
dict(type='Mixup', alpha=0.8),
dict(type='CutMix', alpha=1.0),
]),
)


@ -0,0 +1,68 @@
# ConvNeXt V2
> [Co-designing and Scaling ConvNets with Masked Autoencoders](http://arxiv.org/abs/2301.00808)
<!-- [ALGORITHM] -->
## Abstract
Driven by improved architectures and better representation learning frameworks, the field of visual recognition has enjoyed rapid modernization and performance boost in the early 2020s. For example, modern ConvNets, represented by ConvNeXt, have demonstrated strong performance in various scenarios. While these models were originally designed for supervised learning with ImageNet labels, they can also potentially benefit from self-supervised learning techniques such as masked autoencoders (MAE). However, we found that simply combining these two approaches leads to subpar performance. In this paper, we propose a fully convolutional masked autoencoder framework and a new Global Response Normalization (GRN) layer that can be added to the ConvNeXt architecture to enhance inter-channel feature competition. This co-design of self-supervised learning techniques and architectural improvement results in a new model family called ConvNeXt V2, which significantly improves the performance of pure ConvNets on various recognition benchmarks, including ImageNet classification, COCO detection, and ADE20K segmentation. We also provide pre-trained ConvNeXt V2 models of various sizes, ranging from an efficient 3.7M-parameter Atto model with 76.7% top-1 accuracy on ImageNet, to a 650M Huge model that achieves a state-of-the-art 88.9% accuracy using only public training data.
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/210496285-f235083f-218f-4153-8e21-c8a64481a2f5.png" width="50%"/>
</div>
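The core addition is the GRN layer. As a sketch of what it computes (matching the `GRN` module added in `mmcls/models/utils/norm.py` in this PR), for a channel-last input $X \in \mathbb{R}^{H \times W \times C}$:

```math
G(X)_c = \lVert X_{:,:,c} \rVert_2, \qquad
N(G(X))_c = \frac{G(X)_c}{\tfrac{1}{C}\sum_{c'=1}^{C} G(X)_{c'} + \epsilon}, \qquad
\mathrm{GRN}(X)_{:,:,c} = \gamma_c \, X_{:,:,c} \, N(G(X))_c + \beta_c + X_{:,:,c}
```

where $\gamma$ and $\beta$ are learnable per-channel parameters initialized to zero and $\epsilon$ defaults to `1e-6`.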
## Results and models
### Pre-trained Models
The pre-trained models are only used for fine-tuning; they cannot be trained with the configs here and have no evaluation results.
| Model | Params(M) | Flops(G) | Config | Download |
| :-------------------------------------: | :-------: | :------: | :------------------------------------------: | :--------------------------------------------------------------------------------------------------: |
| convnext-v2-atto_3rdparty-fcmae_in1k\* | 3.71 | 0.55 | [config](./convnext-v2-atto_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-atto_3rdparty-fcmae_in1k_20230104-07514db4.pth) |
| convnext-v2-femto_3rdparty-fcmae_in1k\* | 5.23 | 0.78 | [config](./convnext-v2-femto_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-femto_3rdparty-fcmae_in1k_20230104-adbe2082.pth) |
| convnext-v2-pico_3rdparty-fcmae_in1k\* | 9.07 | 1.37 | [config](./convnext-v2-pico_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-pico_3rdparty-fcmae_in1k_20230104-147b1b59.pth) |
| convnext-v2-nano_3rdparty-fcmae_in1k\* | 15.62 | 2.45 | [config](./convnext-v2-nano_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_3rdparty-fcmae_in1k_20230104-3dd1f29e.pth) |
| convnext-v2-tiny_3rdparty-fcmae_in1k\* | 28.64 | 4.47 | [config](./convnext-v2-tiny_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_3rdparty-fcmae_in1k_20230104-80513adc.pth) |
| convnext-v2-base_3rdparty-fcmae_in1k\* | 88.72 | 15.38 | [config](./convnext-v2-base_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_3rdparty-fcmae_in1k_20230104-8a798eaf.pth) |
| convnext-v2-large_3rdparty-fcmae_in1k\* | 197.96 | 34.40 | [config](./convnext-v2-large_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_3rdparty-fcmae_in1k_20230104-bf38df92.pth) |
| convnext-v2-huge_3rdparty-fcmae_in1k\* | 660.29 | 115.00 | [config](./convnext-v2-huge_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_3rdparty-fcmae_in1k_20230104-fe43ae6c.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt-V2).*
### ImageNet-1k
| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
| :---------------------------------------------: | :------------------: | :-------: | :------: | :-------: | :-------: | :----------------------------------------------: | :------------------------------------------------: |
| convnext-v2-atto_fcmae-pre_3rdparty_in1k\* | FCMAE | 3.71 | 0.55 | 76.64 | 93.04 | [config](./convnext-v2-atto_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-atto_fcmae-pre_3rdparty_in1k_20230104-23765f83.pth) |
| convnext-v2-femto_fcmae-pre_3rdparty_in1k\* | FCMAE | 5.23 | 0.78 | 78.48 | 93.98 | [config](./convnext-v2-femto_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-femto_fcmae-pre_3rdparty_in1k_20230104-92a75d75.pth) |
| convnext-v2-pico_fcmae-pre_3rdparty_in1k\* | FCMAE | 9.07 | 1.37 | 80.31 | 95.08 | [config](./convnext-v2-pico_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-pico_fcmae-pre_3rdparty_in1k_20230104-d20263ca.pth) |
| convnext-v2-nano_fcmae-pre_3rdparty_in1k\* | FCMAE | 15.62 | 2.45 | 81.86 | 95.75 | [config](./convnext-v2-nano_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_fcmae-pre_3rdparty_in1k_20230104-fe1aaaf2.pth) |
| convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k\* | FCMAE + ImageNet 21k | 15.62 | 2.45 | 82.04 | 96.16 | [config](./convnext-v2-nano_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k_20230104-91fa8ae2.pth) |
| convnext-v2-tiny_fcmae-pre_3rdparty_in1k\* | FCMAE | 28.64 | 4.47 | 82.94 | 96.29 | [config](./convnext-v2-tiny_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_fcmae-pre_3rdparty_in1k_20230104-471a86de.pth) |
| convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k\* | FCMAE + ImageNet 21k | 28.64 | 4.47 | 83.89 | 96.96 | [config](./convnext-v2-tiny_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k_20230104-8cc8b8f2.pth) |
| convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k-384px\* | FCMAE + ImageNet 21k | 15.62 | 7.21 | 83.36 | 96.75 | [config](./convnext-v2-nano_32xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-f951ae87.pth) |
| convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k-384px\* | FCMAE + ImageNet 21k | 28.64 | 13.14 | 85.09 | 97.63 | [config](./convnext-v2-tiny_32xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-d8579f84.pth) |
| convnext-v2-base_fcmae-pre_3rdparty_in1k\* | FCMAE | 88.72 | 15.38 | 84.87 | 97.08 | [config](./convnext-v2-base_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_fcmae-pre_3rdparty_in1k_20230104-00a70fa4.pth) |
| convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k\* | FCMAE + ImageNet 21k | 88.72 | 15.38 | 86.74 | 98.02 | [config](./convnext-v2-base_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k_20230104-c48d16a5.pth) |
| convnext-v2-large_fcmae-pre_3rdparty_in1k\* | FCMAE | 197.96 | 34.40 | 85.76 | 97.59 | [config](./convnext-v2-large_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_fcmae-pre_3rdparty_in1k_20230104-ef393013.pth) |
| convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k\* | FCMAE + ImageNet 21k | 197.96 | 34.40 | 87.26 | 98.24 | [config](./convnext-v2-large_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k_20230104-d9c4dc0c.pth) |
| convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k-384px\* | FCMAE + ImageNet 21k | 88.72 | 45.21 | 87.63 | 98.42 | [config](./convnext-v2-base_32xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-379425cc.pth) |
| convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k-384px\* | FCMAE + ImageNet 21k | 197.96 | 101.10 | 88.18 | 98.52 | [config](./convnext-v2-large_32xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-9139a1f3.pth) |
| convnext-v2-huge_fcmae-pre_3rdparty_in1k\* | FCMAE | 660.29 | 115.00 | 86.25 | 97.75 | [config](./convnext-v2-huge_32xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_fcmae-pre_3rdparty_in1k_20230104-f795e5b8.pth) |
| convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-384px\* | FCMAE + ImageNet 21k | 660.29 | 337.96 | 88.68 | 98.73 | [config](./convnext-v2-huge_32xb32_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-02a4eb35.pth) |
| convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-512px\* | FCMAE + ImageNet 21k | 660.29 | 600.81 | 88.86 | 98.74 | [config](./convnext-v2-huge_32xb32_in1k-512px.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-512px_20230104-ce32e63c.pth) |
*Models with * are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt-V2). The config files of these models are only for inference. We don't guarantee the training accuracy of these configs, and you are welcome to contribute your reproduction results.*
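To try one of the checkpoints above locally, a minimal inference sketch (assuming the `mmcls.apis` Python interface of the 1.x series; the paths are illustrative):

```python
from mmcls.apis import inference_model, init_model

# Illustrative paths: use any config/checkpoint pair from the tables above.
config = 'configs/convnext_v2/convnext-v2-atto_32xb32_in1k.py'
checkpoint = 'convnext-v2-atto_fcmae-pre_3rdparty_in1k_20230104-23765f83.pth'

model = init_model(config, checkpoint, device='cpu')
result = inference_model(model, 'demo/demo.JPEG')  # predicted label and score
print(result)
```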
## Citation
```bibtex
@article{Woo2023ConvNeXtV2,
title={ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders},
author={Sanghyun Woo and Shoubhik Debnath and Ronghang Hu and Xinlei Chen and Zhuang Liu and In So Kweon and Saining Xie},
year={2023},
journal={arXiv preprint arXiv:2301.00808},
}
```


@ -0,0 +1,24 @@
_base_ = [
'../_base_/models/convnext_v2/atto.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=8e-4, weight_decay=0.3),
clip_grad=None,
)
# learning policy
param_scheduler = [dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True)]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,35 @@
_base_ = [
'../_base_/models/convnext_v2/base.py',
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=2.5e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=20,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=20)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,35 @@
_base_ = [
'../_base_/models/convnext_v2/base.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=2.5e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=20,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=20)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,24 @@
_base_ = [
'../_base_/models/convnext_v2/femto.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=8e-4, weight_decay=0.3),
clip_grad=None,
)
# learning policy
param_scheduler = [dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True)]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,35 @@
_base_ = [
'../_base_/models/convnext_v2/huge.py',
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=2.5e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=20,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=20)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,54 @@
_base_ = [
'../_base_/models/convnext_v2/huge.py',
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='RandomResizedCrop',
scale=512,
backend='pillow',
interpolation='bicubic'),
dict(type='RandomFlip', prob=0.5, direction='horizontal'),
dict(type='PackClsInputs'),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='Resize', scale=512, backend='pillow', interpolation='bicubic'),
dict(type='PackClsInputs'),
]
train_dataloader = dict(batch_size=32, dataset=dict(pipeline=train_pipeline))
val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=2.5e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=20,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=20)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,35 @@
_base_ = [
'../_base_/models/convnext_v2/huge.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=2.5e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=20,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=20)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,35 @@
_base_ = [
'../_base_/models/convnext_v2/large.py',
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=2.5e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=20,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=20)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,35 @@
_base_ = [
'../_base_/models/convnext_v2/large.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=2.5e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=20,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=20)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=100, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,24 @@
_base_ = [
'../_base_/models/convnext_v2/nano.py',
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=8e-4, weight_decay=0.3),
clip_grad=None,
)
# learning policy
param_scheduler = [dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True)]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,24 @@
_base_ = [
'../_base_/models/convnext_v2/nano.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=8e-4, weight_decay=0.3),
clip_grad=None,
)
# learning policy
param_scheduler = [dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True)]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,24 @@
_base_ = [
'../_base_/models/convnext_v2/pico.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=8e-4, weight_decay=0.3),
clip_grad=None,
)
# learning policy
param_scheduler = [dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True)]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=600, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,35 @@
_base_ = [
'../_base_/models/convnext_v2/tiny.py',
'../_base_/datasets/imagenet_bs64_swin_384.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=3.2e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=40,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=40)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=300, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,35 @@
_base_ = [
'../_base_/models/convnext_v2/tiny.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_swin.py',
'../_base_/default_runtime.py',
]
# dataset setting
train_dataloader = dict(batch_size=32)
# schedule setting
optim_wrapper = dict(
optimizer=dict(lr=3.2e-3),
clip_grad=None,
)
# learning policy
param_scheduler = [
# warm up learning rate scheduler
dict(
type='LinearLR',
start_factor=1e-3,
by_epoch=True,
end=40,
# update by iter
convert_to_iter_based=True),
# main learning rate scheduler
dict(type='CosineAnnealingLR', eta_min=1e-5, by_epoch=True, begin=40)
]
# train, val, test setting
train_cfg = dict(by_epoch=True, max_epochs=300, val_interval=1)
# runtime setting
custom_hooks = [dict(type='EMAHook', momentum=1e-4, priority='ABOVE_NORMAL')]


@ -0,0 +1,433 @@
Collections:
- Name: ConvNeXt V2
Metadata:
Architecture:
- Global Response Normalization
Paper:
Title: Co-designing and Scaling ConvNets with Masked Autoencoders
URL: http://arxiv.org/abs/2301.00808
README: configs/convnext_v2/README.md
Models:
- Name: convnext-v2-atto_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 551718080
Parameters: 3708400
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-atto_3rdparty-fcmae_in1k_20230104-07514db4.pth
Config: configs/convnext_v2/convnext-v2-atto_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_atto_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-atto_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 551718080
Parameters: 3708400
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 76.64
Top 5 Accuracy: 93.04
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-atto_fcmae-pre_3rdparty_in1k_20230104-23765f83.pth
Config: configs/convnext_v2/convnext-v2-atto_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-femto_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 784892544
Parameters: 5233240
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-femto_3rdparty-fcmae_in1k_20230104-adbe2082.pth
Config: configs/convnext_v2/convnext-v2-femto_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_femto_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-femto_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 784892544
Parameters: 5233240
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 78.48
Top 5 Accuracy: 93.98
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-femto_fcmae-pre_3rdparty_in1k_20230104-92a75d75.pth
Config: configs/convnext_v2/convnext-v2-femto_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_femto_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-pico_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 1374072320
Parameters: 9066280
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-pico_3rdparty-fcmae_in1k_20230104-147b1b59.pth
Config: configs/convnext_v2/convnext-v2-pico_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_pico_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-pico_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 1374072320
Parameters: 9066280
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 80.31
Top 5 Accuracy: 95.08
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-pico_fcmae-pre_3rdparty_in1k_20230104-d20263ca.pth
Config: configs/convnext_v2/convnext-v2-pico_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_pico_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-nano_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 2454926720
Parameters: 15623800
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_3rdparty-fcmae_in1k_20230104-3dd1f29e.pth
Config: configs/convnext_v2/convnext-v2-nano_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_nano_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-nano_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 2454926720
Parameters: 15623800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.86
Top 5 Accuracy: 95.75
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_fcmae-pre_3rdparty_in1k_20230104-fe1aaaf2.pth
Config: configs/convnext_v2/convnext-v2-nano_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_nano_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 2454926720
Parameters: 15623800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.04
Top 5 Accuracy: 96.16
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k_20230104-91fa8ae2.pth
Config: configs/convnext_v2/convnext-v2-nano_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-tiny_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 4469631744
Parameters: 28635496
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_3rdparty-fcmae_in1k_20230104-80513adc.pth
Config: configs/convnext_v2/convnext-v2-tiny_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_tiny_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-tiny_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 4469631744
Parameters: 28635496
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 82.94
Top 5 Accuracy: 96.29
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_fcmae-pre_3rdparty_in1k_20230104-471a86de.pth
Config: configs/convnext_v2/convnext-v2-tiny_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_tiny_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 4469631744
Parameters: 28635496
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.89
Top 5 Accuracy: 96.96
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k_20230104-8cc8b8f2.pth
Config: configs/convnext_v2/convnext-v2-tiny_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k-384px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 7214472320
Parameters: 15623800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.36
Top 5 Accuracy: 96.75
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-nano_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-f951ae87.pth
Config: configs/convnext_v2/convnext-v2-nano_32xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_384_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k-384px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 13135236864
Parameters: 28635496
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.09
Top 5 Accuracy: 97.63
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-tiny_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-d8579f84.pth
Config: configs/convnext_v2/convnext-v2-tiny_32xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_384_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-base_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 15382561792
Parameters: 88717800
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_3rdparty-fcmae_in1k_20230104-8a798eaf.pth
Config: configs/convnext_v2/convnext-v2-base_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_base_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-base_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 15382561792
Parameters: 88717800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 84.87
Top 5 Accuracy: 97.08
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_fcmae-pre_3rdparty_in1k_20230104-00a70fa4.pth
Config: configs/convnext_v2/convnext-v2-base_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_base_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 15382561792
Parameters: 88717800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.74
Top 5 Accuracy: 98.02
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k_20230104-c48d16a5.pth
Config: configs/convnext_v2/convnext-v2-base_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-large_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 34403182080
Parameters: 197956840
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_3rdparty-fcmae_in1k_20230104-bf38df92.pth
Config: configs/convnext_v2/convnext-v2-large_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_large_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-large_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 34403182080
Parameters: 197956840
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 85.76
Top 5 Accuracy: 97.59
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_fcmae-pre_3rdparty_in1k_20230104-ef393013.pth
Config: configs/convnext_v2/convnext-v2-large_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_large_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 34403182080
Parameters: 197956840
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 87.26
Top 5 Accuracy: 98.24
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k_20230104-d9c4dc0c.pth
Config: configs/convnext_v2/convnext-v2-large_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k-384px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 45205885952
Parameters: 88717800
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 87.63
Top 5 Accuracy: 98.42
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-379425cc.pth
Config: configs/convnext_v2/convnext-v2-base_32xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_384_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k-384px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 101103214080
Parameters: 197956840
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 88.18
Top 5 Accuracy: 98.52
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-large_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-9139a1f3.pth
Config: configs/convnext_v2/convnext-v2-large_32xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_384_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-huge_3rdparty-fcmae_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 114998639360
Parameters: 660289640
In Collection: ConvNeXt V2
Results: null
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_3rdparty-fcmae_in1k_20230104-fe43ae6c.pth
Config: configs/convnext_v2/convnext-v2-huge_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_huge_1k_224_fcmae.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-huge_fcmae-pre_3rdparty_in1k
Metadata:
Training Data: ImageNet-1k
FLOPs: 114998639360
Parameters: 660289640
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 86.25
Top 5 Accuracy: 97.75
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_fcmae-pre_3rdparty_in1k_20230104-f795e5b8.pth
Config: configs/convnext_v2/convnext-v2-huge_32xb32_in1k.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_huge_1k_224_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-384px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 337955157760
Parameters: 660289640
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 88.68
Top 5 Accuracy: 98.73
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-384px_20230104-02a4eb35.pth
Config: configs/convnext_v2/convnext-v2-huge_32xb32_in1k-384px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_384_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2
- Name: convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-512px
Metadata:
Training Data:
- ImageNet-21k
- ImageNet-1k
FLOPs: 600809158400
Parameters: 660289640
In Collection: ConvNeXt V2
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 88.86
Top 5 Accuracy: 98.74
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-huge_fcmae-in21k-pre_3rdparty_in1k-512px_20230104-ce32e63c.pth
Config: configs/convnext_v2/convnext-v2-huge_32xb32_in1k-512px.py
Converted From:
Weights: https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_512_ema.pt
Code: https://github.com/facebookresearch/ConvNeXt-V2


@ -270,7 +270,7 @@ param_scheduler = [
You can create a folder named `mmcls/engine/optimizer` and implement the custom optimizer in a file under that directory, for example `mmcls/engine/optimizer/my_optimizer.py`:
```python
from mmengine.registry import OPTIMIZERS
from mmcls.registry import OPTIMIZERS
from torch.optim import Optimizer
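# A hedged sketch, not part of this diff: with OPTIMIZERS imported from
# mmcls.registry as above, register the custom optimizer so configs can refer
# to it by its type name, e.g. dict(type='MyOptimizer', lr=0.01).
@OPTIMIZERS.register_module()
class MyOptimizer(Optimizer):

    def __init__(self, params, lr=0.01):
        super().__init__(params, defaults=dict(lr=lr))

    def step(self, closure=None):
        # Placeholder update rule; a real optimizer modifies parameters here.
        pass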


@ -5,48 +5,15 @@ from typing import Sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer
from mmcv.cnn.bricks import DropPath
from mmengine.model import BaseModule, ModuleList, Sequential
from mmengine.registry import MODELS
from mmcls.registry import MODELS
from ..utils import GRN, build_norm_layer
from .base_backbone import BaseBackbone
@MODELS.register_module('LN2d')
class LayerNorm2d(nn.LayerNorm):
"""LayerNorm on channels for 2d images.
Args:
num_channels (int): The number of channels of the input tensor.
eps (float): a value added to the denominator for numerical stability.
Defaults to 1e-5.
elementwise_affine (bool): a boolean value that when set to ``True``,
this module has learnable per-element affine parameters initialized
to ones (for weights) and zeros (for biases). Defaults to True.
"""
def __init__(self, num_channels: int, **kwargs) -> None:
super().__init__(num_channels, **kwargs)
self.num_channels = self.normalized_shape[0]
def forward(self, x, data_format='channel_first'):
assert x.dim() == 4, 'LayerNorm2d only supports inputs with shape ' \
f'(N, C, H, W), but got tensor with shape {x.shape}'
if data_format == 'channel_last':
x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias,
self.eps)
elif data_format == 'channel_first':
x = x.permute(0, 2, 3, 1)
x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias,
self.eps)
# If the output is discontiguous, it may cause some unexpected
# problem in the downstream tasks
x = x.permute(0, 3, 1, 2).contiguous()
return x
class ConvNeXtBlock(BaseModule):
"""ConvNeXt Block.
@ -88,6 +55,7 @@ class ConvNeXtBlock(BaseModule):
linear_pw_conv=True,
drop_path_rate=0.,
layer_scale_init_value=1e-6,
use_grn=False,
with_cp=False):
super().__init__()
self.with_cp = with_cp
@ -96,7 +64,7 @@ class ConvNeXtBlock(BaseModule):
in_channels, in_channels, groups=in_channels, **dw_conv_cfg)
self.linear_pw_conv = linear_pw_conv
self.norm = build_norm_layer(norm_cfg, in_channels)[1]
self.norm = build_norm_layer(norm_cfg, in_channels)
mid_channels = int(mlp_ratio * in_channels)
if self.linear_pw_conv:
@ -106,9 +74,14 @@ class ConvNeXtBlock(BaseModule):
pw_conv = partial(nn.Conv2d, kernel_size=1)
self.pointwise_conv1 = pw_conv(in_channels, mid_channels)
self.act = build_activation_layer(act_cfg)
self.act = MODELS.build(act_cfg)
self.pointwise_conv2 = pw_conv(mid_channels, in_channels)
if use_grn:
self.grn = GRN(mid_channels)
else:
self.grn = None
self.gamma = nn.Parameter(
layer_scale_init_value * torch.ones((in_channels)),
requires_grad=True) if layer_scale_init_value > 0 else None
@ -124,14 +97,21 @@ class ConvNeXtBlock(BaseModule):
if self.linear_pw_conv:
x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
x = self.norm(x, data_format='channel_last')
x = self.norm(x, data_format='channel_last')
x = self.pointwise_conv1(x)
x = self.act(x)
if self.grn is not None:
x = self.grn(x, data_format='channel_last')
x = self.pointwise_conv2(x)
x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
else:
x = self.norm(x, data_format='channel_first')
x = self.pointwise_conv1(x)
x = self.act(x)
x = self.pointwise_conv1(x)
x = self.act(x)
x = self.pointwise_conv2(x)
if self.linear_pw_conv:
x = x.permute(0, 3, 1, 2) # permute back
if self.grn is not None:
x = self.grn(x, data_format='channel_first')
x = self.pointwise_conv2(x)
if self.gamma is not None:
x = x.mul(self.gamma.view(1, -1, 1, 1))
@ -148,16 +128,20 @@ class ConvNeXtBlock(BaseModule):
@MODELS.register_module()
class ConvNeXt(BaseBackbone):
"""ConvNeXt.
"""ConvNeXt v1&v2 backbone.
A PyTorch implementation of : `A ConvNet for the 2020s
<https://arxiv.org/pdf/2201.03545.pdf>`_
A PyTorch implementation of `A ConvNet for the 2020s
<https://arxiv.org/abs/2201.03545>`_ and
`ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders
<http://arxiv.org/abs/2301.00808>`_
Modified from the `official repo
<https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py>`_
and `timm
<https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/convnext.py>`_.
To use ConvNeXt v2, please set ``use_grn=True`` and ``layer_scale_init_value=0.``.
Args:
arch (str | dict): The model's architecture. If string, it should be
one of architecture in ``ConvNeXt.arch_settings``. And if dict, it
@ -176,6 +160,8 @@ class ConvNeXt(BaseBackbone):
convolution. Defaults to ``dict(type='GELU')``.
linear_pw_conv (bool): Whether to use linear layer to do pointwise
convolution. Defaults to True.
use_grn (bool): Whether to add Global Response Normalization in the
blocks. Defaults to False.
drop_path_rate (float): Stochastic depth rate. Defaults to 0.
layer_scale_init_value (float): Init value for Layer Scale.
Defaults to 1e-6.
@ -191,6 +177,22 @@ class ConvNeXt(BaseBackbone):
init_cfg (dict, optional): Initialization config dict
""" # noqa: E501
arch_settings = {
'atto': {
'depths': [2, 2, 6, 2],
'channels': [40, 80, 160, 320]
},
'femto': {
'depths': [2, 2, 6, 2],
'channels': [48, 96, 192, 384]
},
'pico': {
'depths': [2, 2, 6, 2],
'channels': [64, 128, 256, 512]
},
'nano': {
'depths': [2, 2, 8, 2],
'channels': [80, 160, 320, 640]
},
'tiny': {
'depths': [3, 3, 9, 3],
'channels': [96, 192, 384, 768]
@ -211,6 +213,10 @@ class ConvNeXt(BaseBackbone):
'depths': [3, 3, 27, 3],
'channels': [256, 512, 1024, 2048]
},
'huge': {
'depths': [3, 3, 27, 3],
'channels': [352, 704, 1408, 2816]
}
}
def __init__(self,
@ -220,13 +226,23 @@ class ConvNeXt(BaseBackbone):
norm_cfg=dict(type='LN2d', eps=1e-6),
act_cfg=dict(type='GELU'),
linear_pw_conv=True,
use_grn=False,
drop_path_rate=0.,
layer_scale_init_value=1e-6,
out_indices=-1,
frozen_stages=0,
gap_before_final_norm=True,
with_cp=False,
init_cfg=None):
init_cfg=[
dict(
type='TruncNormal',
layer=['Conv2d', 'Linear'],
std=.02,
bias=0.),
dict(
type='Constant', layer=['LayerNorm'], val=1.,
bias=0.),
]):
super().__init__(init_cfg=init_cfg)
if isinstance(arch, str):
@ -278,7 +294,7 @@ class ConvNeXt(BaseBackbone):
self.channels[0],
kernel_size=stem_patch_size,
stride=stem_patch_size),
build_norm_layer(norm_cfg, self.channels[0])[1],
build_norm_layer(norm_cfg, self.channels[0]),
)
self.downsample_layers.append(stem)
@ -292,7 +308,7 @@ class ConvNeXt(BaseBackbone):
if i >= 1:
downsample_layer = nn.Sequential(
build_norm_layer(norm_cfg, self.channels[i - 1])[1],
build_norm_layer(norm_cfg, self.channels[i - 1]),
nn.Conv2d(
self.channels[i - 1],
channels,
@ -309,6 +325,7 @@ class ConvNeXt(BaseBackbone):
act_cfg=act_cfg,
linear_pw_conv=linear_pw_conv,
layer_scale_init_value=layer_scale_init_value,
use_grn=use_grn,
with_cp=with_cp) for j in range(depth)
])
block_idx += depth
@ -316,7 +333,7 @@ class ConvNeXt(BaseBackbone):
self.stages.append(stage)
if i in self.out_indices:
norm_layer = build_norm_layer(norm_cfg, channels)[1]
norm_layer = build_norm_layer(norm_cfg, channels)
self.add_module(f'norm{i}', norm_layer)
self._freeze_stages()
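As the updated docstring notes, the V2 behaviour is enabled with `use_grn=True` and `layer_scale_init_value=0.`. A minimal sketch of exercising the backbone directly (shapes follow the new `atto` entry in `arch_settings`; with the defaults `out_indices=-1` and `gap_before_final_norm=True` the backbone returns a single pooled feature):

```python
import torch

from mmcls.models import ConvNeXt

# ConvNeXt V2 'atto': GRN on, layer scale off, as recommended in the docstring.
backbone = ConvNeXt(arch='atto', use_grn=True, layer_scale_init_value=0.)
backbone.init_weights()
backbone.eval()

with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))

print(feats[0].shape)  # torch.Size([1, 320]) -- last-stage channels of 'atto'
```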


@ -5,11 +5,12 @@ from typing import Sequence
import torch
import torch.nn as nn
from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer
from mmcv.cnn.bricks import DropPath
from mmengine.model import BaseModule, ModuleList, Sequential
from mmengine.registry import MODELS
from ..utils import ChannelMultiheadAttention, PositionEncodingFourier
from mmcls.registry import MODELS
from ..utils import (ChannelMultiheadAttention, PositionEncodingFourier,
build_norm_layer)
from .base_backbone import BaseBackbone
from .convnext import ConvNeXtBlock
@ -81,7 +82,7 @@ class SDTAEncoder(BaseModule):
self.pos_embed = PositionEncodingFourier(
embed_dims=in_channel) if use_pos_emb else None
self.norm_csa = build_norm_layer(norm_cfg, in_channel)[1]
self.norm_csa = build_norm_layer(norm_cfg, in_channel)
self.gamma_csa = nn.Parameter(
layer_scale_init_value * torch.ones(in_channel),
requires_grad=True) if layer_scale_init_value > 0 else None
@ -92,9 +93,9 @@ class SDTAEncoder(BaseModule):
attn_drop=attn_drop,
proj_drop=proj_drop)
self.norm = build_norm_layer(norm_cfg, in_channel)[1]
self.norm = build_norm_layer(norm_cfg, in_channel)
self.pointwise_conv1 = nn.Linear(in_channel, mlp_ratio * in_channel)
self.act = build_activation_layer(act_cfg)
self.act = MODELS.build(act_cfg)
self.pointwise_conv2 = nn.Linear(mlp_ratio * in_channel, in_channel)
self.gamma = nn.Parameter(
layer_scale_init_value * torch.ones(in_channel),
@ -298,7 +299,7 @@ class EdgeNeXt(BaseBackbone):
self.downsample_layers = ModuleList()
stem = nn.Sequential(
nn.Conv2d(in_channels, self.channels[0], kernel_size=4, stride=4),
build_norm_layer(norm_cfg, self.channels[0])[1],
build_norm_layer(norm_cfg, self.channels[0]),
)
self.downsample_layers.append(stem)
@ -310,7 +311,7 @@ class EdgeNeXt(BaseBackbone):
if i >= 1:
downsample_layer = nn.Sequential(
build_norm_layer(norm_cfg, self.channels[i - 1])[1],
build_norm_layer(norm_cfg, self.channels[i - 1]),
nn.Conv2d(
self.channels[i - 1],
channels,
@ -354,7 +355,7 @@ class EdgeNeXt(BaseBackbone):
if i in self.out_indices:
out_norm_cfg = dict(type='LN') if self.gap_before_final_norm \
else norm_cfg
norm_layer = build_norm_layer(out_norm_cfg, channels)[1]
norm_layer = build_norm_layer(out_norm_cfg, channels)
self.add_module(f'norm{i}', norm_layer)
def init_weights(self) -> None:


@ -5,9 +5,9 @@ from typing import Callable, Optional, Sequence
import torch
import torch.nn.functional as F
from mmcv.cnn import ConvModule, build_norm_layer
from mmengine.registry import MODELS
from torch import nn
from mmcls.registry import MODELS
from .base_backbone import BaseBackbone
from .mobilenet_v2 import InvertedResidual
from .vision_transformer import TransformerEncoderLayer


@ -6,9 +6,9 @@ import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer
from mmengine.model import BaseModule, ModuleList, Sequential
from mmengine.registry import MODELS
from torch.nn import functional as F
from mmcls.registry import MODELS
from ..utils import LeAttention
from .base_backbone import BaseBackbone


@ -11,6 +11,7 @@ from .helpers import is_tracing, to_2tuple, to_3tuple, to_4tuple, to_ntuple
from .inverted_residual import InvertedResidual
from .layer_scale import LayerScale
from .make_divisible import make_divisible
from .norm import GRN, LayerNorm2d, build_norm_layer
from .position_encoding import (ConditionalPositionEncoding,
PositionEncodingFourier)
from .se_layer import SELayer
@ -23,5 +24,6 @@ __all__ = [
'resize_pos_embed', 'resize_relative_position_bias_table',
'ClsDataPreprocessor', 'Mixup', 'CutMix', 'ResizeMix', 'BEiTAttention',
'LayerScale', 'WindowMSA', 'WindowMSAV2', 'ChannelMultiheadAttention',
'PositionEncodingFourier', 'LeAttention'
'PositionEncodingFourier', 'LeAttention', 'GRN', 'LayerNorm2d',
'build_norm_layer'
]


@ -0,0 +1,123 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcls.registry import MODELS
@MODELS.register_module()
class GRN(nn.Module):
"""Global Response Normalization Module.
Comes from `ConvNeXt V2: Co-designing and Scaling ConvNets with Masked
Autoencoders <http://arxiv.org/abs/2301.00808>`_
Args:
in_channels (int): The number of channels of the input tensor.
eps (float): a value added to the denominator for numerical stability.
Defaults to 1e-6.
"""
def __init__(self, in_channels, eps=1e-6):
super().__init__()
self.in_channels = in_channels
self.gamma = nn.Parameter(torch.zeros(in_channels))
self.beta = nn.Parameter(torch.zeros(in_channels))
self.eps = eps
def forward(self, x: torch.Tensor, data_format='channel_first'):
"""Forward method.
Args:
x (torch.Tensor): The input tensor.
data_format (str): The format of the input tensor. If
``"channel_first"``, the shape of the input tensor should be
(B, C, H, W). If ``"channel_last"``, the shape of the input
tensor should be (B, H, W, C). Defaults to "channel_first".
"""
if data_format == 'channel_last':
gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
nx = gx / (gx.mean(dim=-1, keepdim=True) + self.eps)
x = self.gamma * (x * nx) + self.beta + x
elif data_format == 'channel_first':
gx = torch.norm(x, p=2, dim=(2, 3), keepdim=True)
nx = gx / (gx.mean(dim=1, keepdim=True) + self.eps)
x = self.gamma.view(1, -1, 1, 1) * (x * nx) + self.beta.view(
1, -1, 1, 1) + x
return x
@MODELS.register_module('LN2d')
class LayerNorm2d(nn.LayerNorm):
"""LayerNorm on channels for 2d images.
Args:
num_channels (int): The number of channels of the input tensor.
eps (float): a value added to the denominator for numerical stability.
Defaults to 1e-5.
elementwise_affine (bool): a boolean value that when set to ``True``,
this module has learnable per-element affine parameters initialized
to ones (for weights) and zeros (for biases). Defaults to True.
"""
def __init__(self, num_channels: int, **kwargs) -> None:
super().__init__(num_channels, **kwargs)
self.num_channels = self.normalized_shape[0]
def forward(self, x, data_format='channel_first'):
"""Forward method.
Args:
x (torch.Tensor): The input tensor.
data_format (str): The format of the input tensor. If
``"channel_first"``, the shape of the input tensor should be
(B, C, H, W). If ``"channel_last"``, the shape of the input
tensor should be (B, H, W, C). Defaults to "channel_first".
"""
assert x.dim() == 4, 'LayerNorm2d only supports inputs with shape ' \
f'(N, C, H, W), but got tensor with shape {x.shape}'
if data_format == 'channel_last':
x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias,
self.eps)
elif data_format == 'channel_first':
x = x.permute(0, 2, 3, 1)
x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias,
self.eps)
# If the output is non-contiguous, it may cause unexpected
# problems in downstream tasks
x = x.permute(0, 3, 1, 2).contiguous()
return x
def build_norm_layer(cfg: dict, num_features: int) -> nn.Module:
"""Build normalization layer.
Args:
cfg (dict): The norm layer config, which should contain:
- type (str): Layer type.
- layer args: Args needed to instantiate a norm layer.
num_features (int): Number of input channels.
Returns:
nn.Module: The created norm layer.
"""
if not isinstance(cfg, dict):
raise TypeError('cfg must be a dict')
if 'type' not in cfg:
raise KeyError('the cfg dict must contain the key "type"')
cfg_ = cfg.copy()
layer_type = cfg_.pop('type')
norm_layer = MODELS.get(layer_type)
if norm_layer is None:
raise KeyError(f'Cannot find {layer_type} in registry under scope '
f'name {MODELS.scope}')
layer = norm_layer(num_features, **cfg_)
if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
layer._specify_ddp_gpu_num(1)
return layer
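A short usage sketch (an illustration, not part of the diff) of the helpers added here, imported via the `mmcls.models.utils` exports updated above:

```python
import torch

from mmcls.models.utils import GRN, LayerNorm2d, build_norm_layer

x = torch.randn(2, 64, 56, 56)  # (N, C, H, W)

# GRN rescales each channel by its spatial L2 norm, normalized across channels.
grn = GRN(in_channels=64)
assert grn(x, data_format='channel_first').shape == x.shape

# LayerNorm2d applies LayerNorm over the channel dim of (N, C, H, W) inputs.
ln2d = LayerNorm2d(num_channels=64)
assert ln2d(x).shape == x.shape

# build_norm_layer looks the type up in the mmcls MODELS registry, so the
# 'LN2d' alias registered above resolves to LayerNorm2d.
norm = build_norm_layer(dict(type='LN2d', eps=1e-6), num_features=64)
assert isinstance(norm, LayerNorm2d)
```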


@ -48,3 +48,4 @@ Import:
- configs/clip/metafile.yml
- configs/mixmim/metafile.yml
- configs/efficientnet_v2/metafile.yml
- configs/convnext_v2/metafile.yml


@ -9,9 +9,8 @@ from unittest.mock import MagicMock, call, patch
import numpy as np
from mmengine.logging import MMLogger
from mmengine.registry import TRANSFORMS
from mmcls.registry import DATASETS
from mmcls.registry import DATASETS, TRANSFORMS
from mmcls.utils import register_all_modules
register_all_modules()


@ -94,3 +94,13 @@ def test_convnext():
feat = model(imgs)
assert len(feat) == 1
assert feat[0].shape == torch.Size([1, 768])
# Test linear_pw_conv=False
model = ConvNeXt(arch='tiny', out_indices=-1, linear_pw_conv=False)
model.init_weights()
model.train()
imgs = torch.randn(1, 3, 224, 224)
feat = model(imgs)
assert len(feat) == 1
assert feat[0].shape == torch.Size([1, 768])


@ -0,0 +1,60 @@
# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase
import torch
import torch.nn.functional as F
from mmcls.models.utils import GRN, LayerNorm2d
class TestGRN(TestCase):
def test_init(self):
module = GRN(in_channels=32, eps=1e-3)
self.assertEqual(module.in_channels, 32)
self.assertEqual(module.eps, 1e-3)
self.assertTrue(module.gamma.requires_grad)
self.assertTrue(module.beta.requires_grad)
self.assertEqual(module.gamma.shape, (32, ))
self.assertEqual(module.beta.shape, (32, ))
def test_forward(self):
module = GRN(in_channels=32, eps=1e-3)
input_ = torch.rand(1, 28, 28, 32)
gx = torch.norm(input_, p=2, dim=(1, 2), keepdim=True)
nx = gx / (gx.mean(dim=3, keepdim=True) + 1e-3)
expected_out = module.gamma * input_ * nx + module.beta + input_
torch.testing.assert_allclose(
module(input_, data_format='channel_last'), expected_out)
input_ = input_.permute([0, 3, 1, 2])
expected_out = expected_out.permute([0, 3, 1, 2])
torch.testing.assert_allclose(
module(input_, data_format='channel_first'), expected_out)
class TestLayerNorm2d(TestCase):
def test_init(self):
module = LayerNorm2d(num_channels=32, eps=1e-3)
self.assertEqual(module.num_channels, 32)
self.assertEqual(module.eps, 1e-3)
self.assertTrue(module.weight.requires_grad)
self.assertTrue(module.bias.requires_grad)
self.assertEqual(module.weight.shape, (32, ))
self.assertEqual(module.bias.shape, (32, ))
def test_forward(self):
module = LayerNorm2d(num_channels=32, eps=1e-3)
input_ = torch.rand(1, 28, 28, 32)
expected_out = F.layer_norm(input_, module.normalized_shape,
module.weight, module.bias, 1e-3)
torch.testing.assert_allclose(
module(input_, data_format='channel_last'), expected_out)
input_ = input_.permute([0, 3, 1, 2])
expected_out = expected_out.permute([0, 3, 1, 2])
torch.testing.assert_allclose(
module(input_, data_format='channel_first'), expected_out)