[Feature] Implement the Conformer backbone. (#494)

* Implement the Conformer backbone

* Format code style

* Format code style

* Reuse the TransformerEncoderLayer from vision_transformer.py

* Modify variable name

* Delete unused params

* Remove warning info in Conformer head since it already exists in
Conformer.

* Rename some variables

* Add unit tests

* Use `getattr` instead of `get_submodule`.

* Remove some useless layers

* Refactor conformer and add configs

* Update configs and add metafile.

* Fix unit tests

* Update README

Co-authored-by: mzr1996 <mzr1996@163.com>
Zhiliang Peng 2021-12-07 14:00:17 +08:00 committed by GitHub
parent 0aa789f3c3
commit 18f6bb0b10
19 changed files with 1095 additions and 3 deletions

@@ -0,0 +1,22 @@
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='Conformer', arch='base', drop_path_rate=0.1, init_cfg=None),
neck=None,
head=dict(
type='ConformerHead',
num_classes=1000,
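# in_channels is [conv branch channels, transformer branch channels];
# for the 'base' arch: 64 * channel_ratio (6) * 4 = 1536 and embed_dims = 576.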
in_channels=[1536, 576],
init_cfg=None,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
cal_acc=False),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
],
train_cfg=dict(augments=[
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
]))

@@ -0,0 +1,22 @@
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='Conformer', arch='small', drop_path_rate=0.1, init_cfg=None),
neck=None,
head=dict(
type='ConformerHead',
num_classes=1000,
in_channels=[1024, 384],
init_cfg=None,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
cal_acc=False),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
],
train_cfg=dict(augments=[
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
]))

@@ -0,0 +1,26 @@
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='Conformer',
arch='small',
patch_size=32,
drop_path_rate=0.1,
init_cfg=None),
neck=None,
head=dict(
type='ConformerHead',
num_classes=1000,
in_channels=[1024, 384],
init_cfg=None,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
cal_acc=False),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
],
train_cfg=dict(augments=[
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
]))

@@ -0,0 +1,22 @@
# model settings
model = dict(
type='ImageClassifier',
backbone=dict(
type='Conformer', arch='tiny', drop_path_rate=0.1, init_cfg=None),
neck=None,
head=dict(
type='ConformerHead',
num_classes=1000,
in_channels=[256, 384],
init_cfg=None,
loss=dict(
type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
cal_acc=False),
init_cfg=[
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
],
train_cfg=dict(augments=[
dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5),
dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5)
]))

@@ -0,0 +1,29 @@
paramwise_cfg = dict(
norm_decay_mult=0.0,
bias_decay_mult=0.0,
custom_keys={
'.cls_token': dict(decay_mult=0.0),
})
# The batch size is 128 per GPU, with 8 GPUs (total batch size 1024).
# lr = 5e-4 * 128 * 8 / 512 = 0.001
optimizer = dict(
type='AdamW',
lr=5e-4 * 128 * 8 / 512,
weight_decay=0.05,
eps=1e-8,
betas=(0.9, 0.999),
paramwise_cfg=paramwise_cfg)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='CosineAnnealing',
by_epoch=False,
min_lr_ratio=1e-2,
warmup='linear',
warmup_ratio=1e-3,
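# 5 warmup epochs; ImageNet-1k has roughly 1252 iterations per epoch
# at a total batch size of 1024.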
warmup_iters=5 * 1252,
warmup_by_epoch=False)
runner = dict(type='EpochBasedRunner', max_epochs=300)

@@ -0,0 +1,39 @@
# Conformer: Local Features Coupling Global Representations for Visual Recognition
<!-- {Conformer} -->
<!-- [ALGORITHM] -->
## Abstract
<!-- [ABSTRACT] -->
Within Convolutional Neural Networks (CNNs), the convolution operations are good at extracting local features but have difficulty capturing global representations. Within visual transformers, the cascaded self-attention modules can capture long-distance feature dependencies but unfortunately deteriorate local feature details. In this paper, we propose a hybrid network structure, termed Conformer, to take advantage of both convolutional operations and self-attention mechanisms for enhanced representation learning. Conformer is rooted in the Feature Coupling Unit (FCU), which fuses local features and global representations at different resolutions in an interactive fashion. Conformer adopts a concurrent structure so that local features and global representations are retained to the maximum extent. Experiments show that Conformer, under comparable parameter complexity, outperforms the visual transformer (DeiT-B) by 2.3% on ImageNet. On MSCOCO, it outperforms ResNet-101 by 3.7% and 3.6% mAP for object detection and instance segmentation, respectively, demonstrating its great potential to serve as a general backbone network.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/26739999/144957687-926390ed-6119-4e4c-beaa-9bc0017fe953.png" width="90%"/>
</div>
## Citation
```latex
@article{peng2021conformer,
title={Conformer: Local Features Coupling Global Representations for Visual Recognition},
author={Zhiliang Peng and Wei Huang and Shanzhi Gu and Lingxi Xie and Yaowei Wang and Jianbin Jiao and Qixiang Ye},
journal={arXiv preprint arXiv:2105.03889},
year={2021},
}
```
## Results and models
Some pre-trained models are converted from [official repo](https://github.com/pengzhiliang/Conformer).
### ImageNet-1k
| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download |
|:---------------------:|:---------:|:--------:|:---------:|:---------:|:------:|:--------:|
| Conformer-tiny-p16\* | 23.52 | 4.90 | 81.31 | 95.60 | [config](configs/conformer/conformer-tiny-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth) |
| Conformer-small-p32 | 38.85 | 7.09 | 81.96 | 96.02 | [config](configs/conformer/conformer-small-p32_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth) |
| Conformer-small-p16\* | 37.67 | 10.31 | 83.32 | 96.46 | [config](configs/conformer/conformer-small-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth) |
| Conformer-base-p16\* | 83.29 | 22.89 | 83.82 | 96.59 | [config](configs/conformer/conformer-base-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth) |
*Models with \* are converted from other repos.*
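For a quick sanity check, the backbone added in this PR can be built through the model registry. The snippet below is a minimal sketch, assuming the standard `mmcls.models.build_backbone` helper; each output stage is a `[conv_feature, transformer_feature]` pair, as implemented in `conformer.py`.

```python
import torch

from mmcls.models import build_backbone

# Minimal sketch: build the 'tiny' Conformer backbone and run a dummy forward pass.
backbone = build_backbone(dict(type='Conformer', arch='tiny', drop_path_rate=0.1))
backbone.init_weights()
backbone.eval()

with torch.no_grad():
    # By default only the last stage is returned; each stage output is a
    # [conv_feature, transformer_feature] pair.
    conv_feat, trans_feat = backbone(torch.randn(1, 3, 224, 224))[-1]

print(conv_feat.shape)   # torch.Size([1, 256]): base_channels * channel_ratio * 4
print(trans_feat.shape)  # torch.Size([1, 384]): embed_dims of the 'tiny' arch
```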

@@ -0,0 +1,9 @@
_base_ = [
'../_base_/models/conformer/base-p16.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
'../_base_/default_runtime.py'
]
data = dict(samples_per_gpu=128)
evaluation = dict(interval=1, metric='accuracy')

@@ -0,0 +1,9 @@
_base_ = [
'../_base_/models/conformer/small-p16.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
'../_base_/default_runtime.py'
]
data = dict(samples_per_gpu=128)
evaluation = dict(interval=1, metric='accuracy')

@@ -0,0 +1,9 @@
_base_ = [
'../_base_/models/conformer/small-p32.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
'../_base_/default_runtime.py'
]
data = dict(samples_per_gpu=128)
evaluation = dict(interval=1, metric='accuracy')

@@ -0,0 +1,9 @@
_base_ = [
'../_base_/models/conformer/tiny-p16.py',
'../_base_/datasets/imagenet_bs64_swin_224.py',
'../_base_/schedules/imagenet_bs1024_adamw_conformer.py',
'../_base_/default_runtime.py'
]
data = dict(samples_per_gpu=128)
evaluation = dict(interval=1, metric='accuracy')

@@ -0,0 +1,78 @@
Collections:
- Name: Conformer
Metadata:
Training Data: ImageNet-1k
Architecture:
- Layer Normalization
- Scaled Dot-Product Attention
- Dropout
Paper:
URL: https://arxiv.org/abs/2105.03889
Title: "Conformer: Local Features Coupling Global Representations for Visual Recognition"
README: configs/conformer/README.md
# Code:
# URL: # todo
# Version: # todo
Models:
- Name: conformer-tiny-p16_3rdparty_8xb128_in1k
In Collection: Conformer
Config: configs/conformer/conformer-tiny-p16_8xb128_in1k.py
Metadata:
FLOPs: 4899611328
Parameters: 23524704
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.31
Top 5 Accuracy: 95.60
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth
Converted From:
Weights: https://drive.google.com/file/d/19SxGhKcWOR5oQSxNUWUM2MGYiaWMrF1z/view?usp=sharing
Code: https://github.com/pengzhiliang/Conformer/blob/main/models.py#L65
- Name: conformer-small-p16_3rdparty_8xb128_in1k
In Collection: Conformer
Config: configs/conformer/conformer-small-p16_8xb128_in1k.py
Metadata:
FLOPs: 10311309312
Parameters: 37673424
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.32
Top 5 Accuracy: 96.46
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth
Converted From:
Weights: https://drive.google.com/file/d/1mpOlbLaVxOfEwV4-ha78j_1Ebqzj2B83/view?usp=sharing
Code: https://github.com/pengzhiliang/Conformer/blob/main/models.py#L73
- Name: conformer-small-p32_8xb128_in1k
In Collection: Conformer
Config: configs/conformer/conformer-small-p32_8xb128_in1k.py
Metadata:
FLOPs: 7087281792
Parameters: 38853072
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 81.96
Top 5 Accuracy: 96.02
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth
- Name: conformer-base-p16_3rdparty_8xb128_in1k
In Collection: Conformer
Config: configs/conformer/conformer-base-p16_8xb128_in1k.py
Metadata:
FLOPs: 22892078080
Parameters: 83289136
Results:
- Dataset: ImageNet-1k
Metrics:
Top 1 Accuracy: 83.82
Top 5 Accuracy: 96.59
Task: Image Classification
Weights: https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth
Converted From:
Weights: https://drive.google.com/file/d/1oeQ9LSOGKEUaYGu7WTlUGl3KDsQIi0MA/view?usp=sharing
Code: https://github.com/pengzhiliang/Conformer/blob/main/models.py#L89

@@ -23,7 +23,7 @@ Transformers, which are popular for language modeling, have been explored for so
## Pretrain model
The pre-trained modles are converted from [official repo](https://github.com/yitu-opensource/T2T-ViT/tree/main#2-t2t-vit-models).
The pre-trained models are converted from [official repo](https://github.com/yitu-opensource/T2T-ViT/tree/main#2-t2t-vit-models).
### ImageNet-1k

@@ -63,6 +63,10 @@ The ResNet family models below are trained by standard data augmentations, i.e.,
| T2T-ViT_t-24\* | 64.00 | 12.69 | 82.55 | 96.06 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_3rdparty_8xb64_in1k_20210928-fe95a61b.pth) &#124; [log]()|
| Mixer-B/16\* | 59.88 | 12.61 | 76.68 | 92.25 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mlp_mixer/mlp-mixer-base-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-base-p16_3rdparty_64xb64_in1k_20211124-1377e3e0.pth) &#124; [log]()|
| Mixer-L/16\* | 208.2 | 44.57 | 72.34 | 88.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mlp_mixer/mlp-mixer-large-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-large-p16_3rdparty_64xb64_in1k_20211124-5a2519d2.pth) &#124; [log]()|
| Conformer-tiny-p16\* | 23.52 | 4.90 | 81.31 | 95.60 | [config](configs/conformer/conformer-tiny-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-tiny-p16_3rdparty_8xb128_in1k_20211206-f6860372.pth) &#124; [log]()|
| Conformer-small-p32 | 38.85 | 7.09 | 81.96 | 96.02 | [config](configs/conformer/conformer-small-p32_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth) &#124; [log]()|
| Conformer-small-p16\* | 37.67 | 10.31 | 83.32 | 96.46 | [config](configs/conformer/conformer-small-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth) &#124; [log]()|
| Conformer-base-p16\* | 83.29 | 22.89 | 83.82 | 96.59 | [config](configs/conformer/conformer-base-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth) &#124; [log]()|
Models with * are converted from other repos, others are trained by ourselves.

@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .alexnet import AlexNet
from .conformer import Conformer
from .lenet import LeNet5
from .mlp_mixer import MlpMixer
from .mobilenet_v2 import MobileNetV2
@@ -27,5 +28,5 @@ __all__ = [
'ResNeSt', 'ResNet_CIFAR', 'SEResNet', 'SEResNeXt', 'ShuffleNetV1',
'ShuffleNetV2', 'MobileNetV2', 'MobileNetV3', 'VisionTransformer',
'SwinTransformer', 'TNT', 'TIMMBackbone', 'T2T_ViT', 'Res2Net', 'RepVGG',
'MlpMixer'
'Conformer', 'MlpMixer'
]

@@ -0,0 +1,616 @@
from typing import Sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_activation_layer, build_norm_layer
from mmcv.cnn.bricks.drop import DropPath
from mmcv.cnn.utils.weight_init import trunc_normal_
from mmcls.utils import get_root_logger
from ..builder import BACKBONES
from .base_backbone import BaseBackbone, BaseModule
from .vision_transformer import TransformerEncoderLayer
class ConvBlock(BaseModule):
"""Basic convluation block used in Conformer.
This block includes three convluation modules, and supports three new
functions:
1. Returns the output of both the final layers and the second convluation
module.
2. Fuses the input of the second convluation module with an extra input
feature map.
3. Supports to add an extra convluation module to the identity connection.
Args:
in_channels (int): The number of input channels.
out_channels (int): The number of output channels.
stride (int): The stride of the second convolution module.
Defaults to 1.
groups (int): The groups of the second convolution module.
Defaults to 1.
drop_path_rate (float): The rate of the DropPath layer. Defaults to 0.
with_residual_conv (bool): Whether to add an extra convolution module
to the identity connection. Defaults to False.
norm_cfg (dict): The config of normalization layers.
Defaults to ``dict(type='BN', eps=1e-6)``.
act_cfg (dict): The config of activation functions.
Defaults to ``dict(type='ReLU', inplace=True)``.
init_cfg (dict, optional): The extra config to initialize the module.
Defaults to None.
"""
def __init__(self,
in_channels,
out_channels,
stride=1,
groups=1,
drop_path_rate=0.,
with_residual_conv=False,
norm_cfg=dict(type='BN', eps=1e-6),
act_cfg=dict(type='ReLU', inplace=True),
init_cfg=None):
super(ConvBlock, self).__init__(init_cfg=init_cfg)
expansion = 4
mid_channels = out_channels // expansion
self.conv1 = nn.Conv2d(
in_channels,
mid_channels,
kernel_size=1,
stride=1,
padding=0,
bias=False)
self.bn1 = build_norm_layer(norm_cfg, mid_channels)[1]
self.act1 = build_activation_layer(act_cfg)
self.conv2 = nn.Conv2d(
mid_channels,
mid_channels,
kernel_size=3,
stride=stride,
groups=groups,
padding=1,
bias=False)
self.bn2 = build_norm_layer(norm_cfg, mid_channels)[1]
self.act2 = build_activation_layer(act_cfg)
self.conv3 = nn.Conv2d(
mid_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
bias=False)
self.bn3 = build_norm_layer(norm_cfg, out_channels)[1]
self.act3 = build_activation_layer(act_cfg)
if with_residual_conv:
self.residual_conv = nn.Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=stride,
padding=0,
bias=False)
self.residual_bn = build_norm_layer(norm_cfg, out_channels)[1]
self.with_residual_conv = with_residual_conv
self.drop_path = DropPath(
drop_path_rate) if drop_path_rate > 0. else nn.Identity()
def zero_init_last_bn(self):
nn.init.zeros_(self.bn3.weight)
def forward(self, x, fusion_features=None, out_conv2=True):
identity = x
x = self.conv1(x)
x = self.bn1(x)
x = self.act1(x)
x = self.conv2(x) if fusion_features is None else self.conv2(
x + fusion_features)
x = self.bn2(x)
x2 = self.act2(x)
x = self.conv3(x2)
x = self.bn3(x)
if self.drop_path is not None:
x = self.drop_path(x)
if self.with_residual_conv:
identity = self.residual_conv(identity)
identity = self.residual_bn(identity)
x += identity
x = self.act3(x)
if out_conv2:
return x, x2
else:
return x
class FCUDown(BaseModule):
"""CNN feature maps -> Transformer patch embeddings."""
def __init__(self,
in_channels,
out_channels,
down_stride,
with_cls_token=True,
norm_cfg=dict(type='LN', eps=1e-6),
act_cfg=dict(type='GELU'),
init_cfg=None):
super(FCUDown, self).__init__(init_cfg=init_cfg)
self.down_stride = down_stride
self.with_cls_token = with_cls_token
self.conv_project = nn.Conv2d(
in_channels, out_channels, kernel_size=1, stride=1, padding=0)
self.sample_pooling = nn.AvgPool2d(
kernel_size=down_stride, stride=down_stride)
self.ln = build_norm_layer(norm_cfg, out_channels)[1]
self.act = build_activation_layer(act_cfg)
def forward(self, x, x_t):
x = self.conv_project(x) # [N, C, H, W]
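# avg-pool by down_stride, then flatten to patch tokens:
# [N, C, H, W] -> [N, (H/down_stride)*(W/down_stride), C]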
x = self.sample_pooling(x).flatten(2).transpose(1, 2)
x = self.ln(x)
x = self.act(x)
if self.with_cls_token:
x = torch.cat([x_t[:, 0][:, None, :], x], dim=1)
return x
class FCUUp(BaseModule):
"""Transformer patch embeddings -> CNN feature maps."""
def __init__(self,
in_channels,
out_channels,
up_stride,
with_cls_token=True,
norm_cfg=dict(type='BN', eps=1e-6),
act_cfg=dict(type='ReLU', inplace=True),
init_cfg=None):
super(FCUUp, self).__init__(init_cfg=init_cfg)
self.up_stride = up_stride
self.with_cls_token = with_cls_token
self.conv_project = nn.Conv2d(
in_channels, out_channels, kernel_size=1, stride=1, padding=0)
self.bn = build_norm_layer(norm_cfg, out_channels)[1]
self.act = build_activation_layer(act_cfg)
def forward(self, x, H, W):
B, _, C = x.shape
# [N, 197, 384] -> [N, 196, 384] -> [N, 384, 196] -> [N, 384, 14, 14]
if self.with_cls_token:
x_r = x[:, 1:].transpose(1, 2).reshape(B, C, H, W)
else:
x_r = x.transpose(1, 2).reshape(B, C, H, W)
x_r = self.act(self.bn(self.conv_project(x_r)))
return F.interpolate(
x_r, size=(H * self.up_stride, W * self.up_stride))
class ConvTransBlock(BaseModule):
"""Basic module for Conformer.
This module is a fusion of a CNN block and a transformer encoder block.
Args:
in_channels (int): The number of input channels in conv blocks.
out_channels (int): The number of output channels in conv blocks.
embed_dims (int): The embedding dimension in transformer blocks.
conv_stride (int): The stride of conv2d layers. Defaults to 1.
groups (int): The groups of conv blocks. Defaults to 1.
with_residual_conv (bool): Whether to add a conv-bn layer to the
identity connection in the conv block. Defaults to False.
down_stride (int): The stride of the downsample pooling layer.
Defaults to 4.
num_heads (int): The number of heads in transformer attention layers.
Defaults to 12.
mlp_ratio (float): The expansion ratio in transformer FFN module.
Defaults to 4.
qkv_bias (bool): Enable bias for qkv if True. Defaults to False.
with_cls_token (bool): Whether use class token or not.
Defaults to True.
drop_rate (float): The dropout rate of the output projection and
FFN in the transformer block. Defaults to 0.
attn_drop_rate (float): The dropout rate after the attention
calculation in the transformer block. Defaults to 0.
drop_path_rate (float): The drop path rate in both the conv block
and the transformer block. Defaults to 0.
last_fusion (bool): Whether this block is the last stage. If so,
the fusion feature map will be downsampled by 2. Defaults to False.
init_cfg (dict, optional): The extra config to initialize the module.
Defaults to None.
"""
def __init__(self,
in_channels,
out_channels,
embed_dims,
conv_stride=1,
groups=1,
with_residual_conv=False,
down_stride=4,
num_heads=12,
mlp_ratio=4.,
qkv_bias=False,
with_cls_token=True,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
last_fusion=False,
init_cfg=None):
super(ConvTransBlock, self).__init__(init_cfg=init_cfg)
expansion = 4
self.cnn_block = ConvBlock(
in_channels=in_channels,
out_channels=out_channels,
with_residual_conv=with_residual_conv,
stride=conv_stride,
groups=groups)
if last_fusion:
self.fusion_block = ConvBlock(
in_channels=out_channels,
out_channels=out_channels,
stride=2,
with_residual_conv=True,
groups=groups,
drop_path_rate=drop_path_rate)
else:
self.fusion_block = ConvBlock(
in_channels=out_channels,
out_channels=out_channels,
groups=groups,
drop_path_rate=drop_path_rate)
self.squeeze_block = FCUDown(
in_channels=out_channels // expansion,
out_channels=embed_dims,
down_stride=down_stride,
with_cls_token=with_cls_token)
self.expand_block = FCUUp(
in_channels=embed_dims,
out_channels=out_channels // expansion,
up_stride=down_stride,
with_cls_token=with_cls_token)
self.trans_block = TransformerEncoderLayer(
embed_dims=embed_dims,
num_heads=num_heads,
feedforward_channels=int(embed_dims * mlp_ratio),
drop_rate=drop_rate,
drop_path_rate=drop_path_rate,
attn_drop_rate=attn_drop_rate,
qkv_bias=qkv_bias,
norm_cfg=dict(type='LN', eps=1e-6))
self.down_stride = down_stride
self.embed_dim = embed_dims
self.last_fusion = last_fusion
def forward(self, cnn_input, trans_input):
x, x_conv2 = self.cnn_block(cnn_input, out_conv2=True)
_, _, H, W = x_conv2.shape
# Convert the feature map of conv2 to transformer embedding
# and concat with class token.
conv2_embedding = self.squeeze_block(x_conv2, trans_input)
trans_output = self.trans_block(conv2_embedding + trans_input)
# Convert the transformer output embedding to feature map
trans_features = self.expand_block(trans_output, H // self.down_stride,
W // self.down_stride)
x = self.fusion_block(
x, fusion_features=trans_features, out_conv2=False)
return x, trans_output
@BACKBONES.register_module()
class Conformer(BaseBackbone):
"""Conformer backbone.
A PyTorch implementation of: `Conformer: Local Features Coupling Global
Representations for Visual Recognition <https://arxiv.org/abs/2105.03889>`_
Args:
arch (str | dict): Conformer architecture. Defaults to 'tiny'.
patch_size (int): The patch size. Defaults to 16.
base_channels (int): The base number of channels in CNN network.
Defaults to 64.
mlp_ratio (float): The expansion ratio of FFN network in transformer
block. Defaults to 4.
with_cls_token (bool): Whether use class token or not.
Defaults to True.
drop_path_rate (float): Stochastic depth rate. Defaults to 0.
out_indices (Sequence | int): Output from which stages.
Defaults to -1, which means the last stage.
init_cfg (dict, optional): Initialization config dict.
Defaults to None.
"""
arch_zoo = {
**dict.fromkeys(['t', 'tiny'],
{'embed_dims': 384,
'channel_ratio': 1,
'num_heads': 6,
'depths': 12
}),
**dict.fromkeys(['s', 'small'],
{'embed_dims': 384,
'channel_ratio': 4,
'num_heads': 6,
'depths': 12
}),
**dict.fromkeys(['b', 'base'],
{'embed_dims': 576,
'channel_ratio': 6,
'num_heads': 9,
'depths': 12
}),
} # yapf: disable
_version = 1
def __init__(self,
arch='tiny',
patch_size=16,
base_channels=64,
mlp_ratio=4.,
qkv_bias=True,
with_cls_token=True,
drop_path_rate=0.,
norm_eval=True,
frozen_stages=0,
out_indices=-1,
init_cfg=None):
super().__init__(init_cfg=init_cfg)
if isinstance(arch, str):
arch = arch.lower()
assert arch in set(self.arch_zoo), \
f'Arch {arch} is not in default archs {set(self.arch_zoo)}'
self.arch_settings = self.arch_zoo[arch]
else:
essential_keys = {
'embed_dims', 'depths', 'num_heads', 'channel_ratio'
}
assert isinstance(arch, dict) and set(arch) == essential_keys, \
f'Custom arch needs a dict with keys {essential_keys}'
self.arch_settings = arch
self.num_features = self.embed_dims = self.arch_settings['embed_dims']
self.depths = self.arch_settings['depths']
self.num_heads = self.arch_settings['num_heads']
self.channel_ratio = self.arch_settings['channel_ratio']
if isinstance(out_indices, int):
out_indices = [out_indices]
assert isinstance(out_indices, Sequence), \
f'"out_indices" must by a sequence or int, ' \
f'get {type(out_indices)} instead.'
for i, index in enumerate(out_indices):
if index < 0:
out_indices[i] = self.depths + index + 1
assert out_indices[i] >= 0, f'Invalid out_indices {index}'
self.out_indices = out_indices
self.norm_eval = norm_eval
self.frozen_stages = frozen_stages
self.with_cls_token = with_cls_token
if self.with_cls_token:
self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims))
# stochastic depth decay rule
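# (the drop path rate grows linearly from 0 to drop_path_rate across blocks)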
self.trans_dpr = [
x.item() for x in torch.linspace(0, drop_path_rate, self.depths)
]
# Stem stage: get the feature maps by conv block
self.conv1 = nn.Conv2d(
3, 64, kernel_size=7, stride=2, padding=3,
bias=False) # 1 / 2 [112, 112]
self.bn1 = nn.BatchNorm2d(64)
self.act1 = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(
kernel_size=3, stride=2, padding=1) # 1 / 4 [56, 56]
# 1 stage
stage1_channels = int(base_channels * self.channel_ratio)
trans_down_stride = patch_size // 4
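# Note: the stem already downsamples the input by 4, so a patch conv stride
# of patch_size // 4 gives patch embeddings at 1/patch_size of the input
# resolution (e.g. stride 4 for patch_size=16).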
self.conv_1 = ConvBlock(
in_channels=64,
out_channels=stage1_channels,
with_residual_conv=True,
stride=1)
self.trans_patch_conv = nn.Conv2d(
64,
self.embed_dims,
kernel_size=trans_down_stride,
stride=trans_down_stride,
padding=0)
self.trans_1 = TransformerEncoderLayer(
embed_dims=self.embed_dims,
num_heads=self.num_heads,
feedforward_channels=int(self.embed_dims * mlp_ratio),
drop_path_rate=self.trans_dpr[0],
qkv_bias=qkv_bias,
norm_cfg=dict(type='LN', eps=1e-6))
# 2~4 stage
init_stage = 2
fin_stage = self.depths // 3 + 1
for i in range(init_stage, fin_stage):
self.add_module(
f'conv_trans_{i}',
ConvTransBlock(
in_channels=stage1_channels,
out_channels=stage1_channels,
embed_dims=self.embed_dims,
conv_stride=1,
with_residual_conv=False,
down_stride=trans_down_stride,
num_heads=self.num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
drop_path_rate=self.trans_dpr[i - 1],
with_cls_token=self.with_cls_token))
stage2_channels = int(base_channels * self.channel_ratio * 2)
# 5~8 stage
init_stage = fin_stage # 5
fin_stage = fin_stage + self.depths // 3 # 9
for i in range(init_stage, fin_stage):
if i == init_stage:
conv_stride = 2
in_channels = stage1_channels
else:
conv_stride = 1
in_channels = stage2_channels
with_residual_conv = True if i == init_stage else False
self.add_module(
f'conv_trans_{i}',
ConvTransBlock(
in_channels=in_channels,
out_channels=stage2_channels,
embed_dims=self.embed_dims,
conv_stride=conv_stride,
with_residual_conv=with_residual_conv,
down_stride=trans_down_stride // 2,
num_heads=self.num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
drop_path_rate=self.trans_dpr[i - 1],
with_cls_token=self.with_cls_token))
stage3_channels = int(base_channels * self.channel_ratio * 2 * 2)
# 9~12 stage
init_stage = fin_stage # 9
fin_stage = fin_stage + self.depths // 3 # 13
for i in range(init_stage, fin_stage):
if i == init_stage:
conv_stride = 2
in_channels = stage2_channels
with_residual_conv = True
else:
conv_stride = 1
in_channels = stage3_channels
with_residual_conv = False
last_fusion = (i == self.depths)
self.add_module(
f'conv_trans_{i}',
ConvTransBlock(
in_channels=in_channels,
out_channels=stage3_channels,
embed_dims=self.embed_dims,
conv_stride=conv_stride,
with_residual_conv=with_residual_conv,
down_stride=trans_down_stride // 4,
num_heads=self.num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
drop_path_rate=self.trans_dpr[i - 1],
with_cls_token=self.with_cls_token,
last_fusion=last_fusion))
self.fin_stage = fin_stage
self.pooling = nn.AdaptiveAvgPool2d(1)
self.trans_norm = nn.LayerNorm(self.embed_dims)
if self.with_cls_token:
trunc_normal_(self.cls_token, std=.02)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1.)
nn.init.constant_(m.bias, 0.)
if hasattr(m, 'zero_init_last_bn'):
m.zero_init_last_bn()
def init_weights(self):
super(Conformer, self).init_weights()
logger = get_root_logger()
if (isinstance(self.init_cfg, dict)
and self.init_cfg['type'] == 'Pretrained'):
# Suppress default init if use pretrained model.
return
else:
logger.info(f'No pre-trained weights for '
f'{self.__class__.__name__}, '
f'training starts from scratch')
self.apply(self._init_weights)
def forward(self, x):
output = []
B = x.shape[0]
if self.with_cls_token:
cls_tokens = self.cls_token.expand(B, -1, -1)
# stem
x_base = self.maxpool(self.act1(self.bn1(self.conv1(x))))
# stage 1: [N, 64, 56, 56] -> [N, stage1_channels, 56, 56]
x = self.conv_1(x_base, out_conv2=False)
x_t = self.trans_patch_conv(x_base).flatten(2).transpose(1, 2)
if self.with_cls_token:
x_t = torch.cat([cls_tokens, x_t], dim=1)
x_t = self.trans_1(x_t)
# 2 ~ final
for i in range(2, self.fin_stage):
stage = getattr(self, f'conv_trans_{i}')
x, x_t = stage(x, x_t)
if i in self.out_indices:
if self.with_cls_token:
output.append([
self.pooling(x).flatten(1),
self.trans_norm(x_t)[:, 0]
])
else:
# if no class token, use the mean patch token
# as the transformer feature.
output.append([
self.pooling(x).flatten(1),
self.trans_norm(x_t).mean(dim=1)
])
return tuple(output)

@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .cls_head import ClsHead
from .conformer_head import ConformerHead
from .linear_head import LinearClsHead
from .multi_label_head import MultiLabelClsHead
from .multi_label_linear_head import MultiLabelLinearClsHead
@@ -8,5 +9,5 @@ from .vision_transformer_head import VisionTransformerClsHead
__all__ = [
'ClsHead', 'LinearClsHead', 'StackedLinearClsHead', 'MultiLabelClsHead',
'MultiLabelLinearClsHead', 'VisionTransformerClsHead'
'MultiLabelLinearClsHead', 'VisionTransformerClsHead', 'ConformerHead'
]

@@ -0,0 +1,103 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn.utils.weight_init import trunc_normal_
from ..builder import HEADS
from .cls_head import ClsHead
@HEADS.register_module()
class ConformerHead(ClsHead):
"""Linear classifier head.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
init_cfg (dict | optional): The extra init config of layers.
Defaults to use dict(type='Normal', layer='Linear', std=0.01).
"""
def __init__(
self,
num_classes,
in_channels, # [conv_dim, trans_dim]
init_cfg=dict(type='Normal', layer='Linear', std=0.01),
*args,
**kwargs):
super(ConformerHead, self).__init__(init_cfg=None, *args, **kwargs)
self.in_channels = in_channels
self.num_classes = num_classes
self.init_cfg = init_cfg
if self.num_classes <= 0:
raise ValueError(
f'num_classes={num_classes} must be a positive integer')
self.conv_cls_head = nn.Linear(self.in_channels[0], num_classes)
self.trans_cls_head = nn.Linear(self.in_channels[1], num_classes)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
def init_weights(self):
super(ConformerHead, self).init_weights()
if (isinstance(self.init_cfg, dict)
and self.init_cfg['type'] == 'Pretrained'):
# Suppress default init if use pretrained model.
return
else:
self.apply(self._init_weights)
def simple_test(self, x):
"""Test without augmentation."""
if isinstance(x, tuple):
x = x[-1]
assert isinstance(x,
list) # There are two outputs in the Conformer model
conv_cls_score = self.conv_cls_head(x[0])
tran_cls_score = self.trans_cls_head(x[1])
cls_score = conv_cls_score + tran_cls_score
pred = F.softmax(cls_score, dim=1) if cls_score is not None else None
return self.post_process(pred)
def forward_train(self, x, gt_label):
if isinstance(x, tuple):
x = x[-1]
assert isinstance(x, list) and len(x) == 2, \
'There should be two outputs in the Conformer model'
conv_cls_score = self.conv_cls_head(x[0])
tran_cls_score = self.trans_cls_head(x[1])
losses = self.loss([conv_cls_score, tran_cls_score], gt_label)
return losses
def loss(self, cls_score, gt_label):
num_samples = len(cls_score[0])
losses = dict()
# average the classification loss over the two classifier heads
loss = sum([
self.compute_loss(score, gt_label, avg_factor=num_samples) /
len(cls_score) for score in cls_score
])
if self.cal_acc:
# compute accuracy
acc = self.compute_accuracy(cls_score[0] + cls_score[1], gt_label)
assert len(acc) == len(self.topk)
losses['accuracy'] = {
f'top-{k}': a
for k, a in zip(self.topk, acc)
}
losses['loss'] = loss
return losses

@@ -13,3 +13,4 @@ Import:
- configs/vision_transformer/metafile.yml
- configs/t2t_vit/metafile.yml
- configs/mlp_mixer/metafile.yml
- configs/conformer/metafile.yml

@@ -0,0 +1,92 @@
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
import pytest
import torch
from torch.nn.modules import GroupNorm
from torch.nn.modules.batchnorm import _BatchNorm
from mmcls.models.backbones import Conformer
def is_norm(modules):
"""Check if is one of the norms."""
if isinstance(modules, (GroupNorm, _BatchNorm)):
return True
return False
def check_norm_state(modules, train_state):
"""Check if norm layer is in correct train state."""
for mod in modules:
if isinstance(mod, _BatchNorm):
if mod.training != train_state:
return False
return True
def test_conformer_backbone():
cfg_ori = dict(
arch='T',
drop_path_rate=0.1,
)
with pytest.raises(AssertionError):
# test invalid arch
cfg = deepcopy(cfg_ori)
cfg['arch'] = 'unknown'
Conformer(**cfg)
with pytest.raises(AssertionError):
# test arch without essential keys
cfg = deepcopy(cfg_ori)
cfg['arch'] = {'embed_dims': 24, 'channel_ratio': 6, 'num_heads': 9}
Conformer(**cfg)
# Test the Conformer tiny model with patch size of 16
model = Conformer(**cfg_ori)
model.init_weights()
model.train()
assert check_norm_state(model.modules(), True)
imgs = torch.randn(3, 3, 224, 224)
conv_feature, transformer_feature = model(imgs)[-1]
assert conv_feature.shape == (3, 64 * 1 * 4
) # base_channels * channel_ratio * 4
assert transformer_feature.shape == (3, 384)
# Test custom arch Conformer without output cls token
cfg = deepcopy(cfg_ori)
cfg['arch'] = {
'embed_dims': 128,
'depths': 15,
'num_heads': 16,
'channel_ratio': 3,
}
cfg['with_cls_token'] = False
cfg['base_channels'] = 32
model = Conformer(**cfg)
conv_feature, transformer_feature = model(imgs)[-1]
assert conv_feature.shape == (3, 32 * 3 * 4)
assert transformer_feature.shape == (3, 128)
# Test Conformer with multiple out indices
cfg = deepcopy(cfg_ori)
cfg['out_indices'] = [4, 8, 12]
model = Conformer(**cfg)
outs = model(imgs)
assert len(outs) == 3
# stage 1
conv_feature, transformer_feature = outs[0]
assert conv_feature.shape == (3, 64 * 1)
assert transformer_feature.shape == (3, 384)
# stage 2
conv_feature, transformer_feature = outs[1]
assert conv_feature.shape == (3, 64 * 1 * 2)
assert transformer_feature.shape == (3, 384)
# stage 3
conv_feature, transformer_feature = outs[2]
assert conv_feature.shape == (3, 64 * 1 * 4)
assert transformer_feature.shape == (3, 384)