diff --git a/.circleci/test.yml b/.circleci/test.yml index 76f9f70d8..c8acb4829 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -61,8 +61,9 @@ jobs: command: | pip install git+https://github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv>=2.0.0rc1' + mim install 'mmcv>=2.0.0rc3' pip install git+https://github.com/open-mmlab/mmclassification@dev-1.x + pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x pip install -r requirements/tests.txt -r requirements/optional.txt - run: name: Build and install @@ -96,18 +97,20 @@ jobs: command: | git clone -b main --depth 1 https://github.com/open-mmlab/mmengine.git /home/circleci/mmengine git clone -b dev-1.x --depth 1 https://github.com/open-mmlab/mmclassification.git /home/circleci/mmclassification + git clone -b dev-3.x --depth 1 https://github.com/open-mmlab/mmdetection.git /home/circleci/mmdetection - run: name: Build Docker image command: | docker build .circleci/docker -t mmseg:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >> - docker run --gpus all -t -d -v /home/circleci/project:/mmseg -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmclassification:/mmclassification -w /mmseg --name mmseg mmseg:gpu + docker run --gpus all -t -d -v /home/circleci/project:/mmseg -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmclassification:/mmclassification -v /home/circleci/mmdetection:/mmdetection -w /mmseg --name mmseg mmseg:gpu - run: name: Install mmseg dependencies command: | docker exec mmseg pip install -e /mmengine docker exec mmseg pip install -U openmim - docker exec mmseg mim install 'mmcv>=2.0.0rc1' + docker exec mmseg mim install 'mmcv>=2.0.0rc3' docker exec mmseg pip install -e /mmclassification + docker exec mmseg pip install -e /mmdetection docker exec mmseg pip install -r requirements/tests.txt -r requirements/optional.txt - run: name: Build and install diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 644eaf651..97cfda589 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -20,11 +20,7 @@ jobs: python -m pip install pre-commit pre-commit install - name: Linting - run: | - sudo apt-add-repository ppa:brightbox/ruby-ng -y - sudo apt-get update - sudo apt-get install -y ruby2.7 - pre-commit run --all-files + run: pre-commit run --all-files - name: Check docstring coverage run: | python -m pip install interrogate diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index 42a9dc0c4..b4a4a4424 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -44,8 +44,9 @@ jobs: python -V pip install -U openmim pip install git+https://github.com/open-mmlab/mmengine.git - mim install 'mmcv>=2.0.0rc1' + mim install 'mmcv>=2.0.0rc3' pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install unittest dependencies run: pip install -r requirements/tests.txt -r requirements/optional.txt - name: Build and install @@ -92,8 +93,9 @@ jobs: python -V pip install -U openmim pip install git+https://github.com/open-mmlab/mmengine.git - mim install 'mmcv>=2.0.0rc1' + mim install 'mmcv>=2.0.0rc3' pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install unittest dependencies run: pip 
install -r requirements/tests.txt -r requirements/optional.txt - name: Build and install @@ -155,8 +157,9 @@ jobs: python -V pip install -U openmim pip install git+https://github.com/open-mmlab/mmengine.git - mim install 'mmcv>=2.0.0rc1' + mim install 'mmcv>=2.0.0rc3' pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install unittest dependencies run: pip install -r requirements/tests.txt -r requirements/optional.txt - name: Build and install @@ -187,8 +190,9 @@ jobs: python -V pip install -U openmim pip install git+https://github.com/open-mmlab/mmengine.git - mim install 'mmcv>=2.0.0rc1' + mim install 'mmcv>=2.0.0rc3' pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install unittest dependencies run: pip install -r requirements/tests.txt -r requirements/optional.txt - name: Build and install diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml index 30e50a962..302c4689f 100644 --- a/.github/workflows/pr_stage_test.yml +++ b/.github/workflows/pr_stage_test.yml @@ -40,8 +40,9 @@ jobs: run: | pip install -U openmim pip install git+https://github.com/open-mmlab/mmengine.git - mim install 'mmcv>=2.0.0rc1' + mim install 'mmcv>=2.0.0rc3' pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install unittest dependencies run: pip install -r requirements/tests.txt -r requirements/optional.txt - name: Build and install @@ -92,8 +93,9 @@ jobs: python -V pip install -U openmim pip install git+https://github.com/open-mmlab/mmengine.git - mim install 'mmcv>=2.0.0rc1' + mim install 'mmcv>=2.0.0rc3' pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install unittest dependencies run: pip install -r requirements/tests.txt -r requirements/optional.txt - name: Build and install @@ -124,8 +126,9 @@ jobs: python -V pip install -U openmim pip install git+https://github.com/open-mmlab/mmengine.git - mim install 'mmcv>=2.0.0rc1' + mim install 'mmcv>=2.0.0rc3' pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x + pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install unittest dependencies run: pip install -r requirements/tests.txt -r requirements/optional.txt - name: Build and install diff --git a/.gitignore b/.gitignore index f5841a1be..787d13ec6 100644 --- a/.gitignore +++ b/.gitignore @@ -105,6 +105,7 @@ venv.bak/ # mypy .mypy_cache/ +data .vscode .idea diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 34b120968..03b537683 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ repos: - - repo: https://gitlab.com/pycqa/flake8.git + - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: - id: flake8 diff --git a/README.md b/README.md index 8a0bc52ec..056f9029b 100644 --- a/README.md +++ b/README.md @@ -62,11 +62,10 @@ The 1.x branch works with **PyTorch 1.6+**. ## What's New -v1.0.0rc1 was released in 2/11/2022. +v1.0.0rc2 was released in 6/12/2022. Please refer to [changelog.md](docs/en/notes/changelog.md) for details and release history. 
-- Support PoolFormer ([#2191](https://github.com/open-mmlab/mmsegmentation/pull/2191)) -- Add Decathlon dataset ([#2227](https://github.com/open-mmlab/mmsegmentation/pull/2227)) +- Support MaskFormer and Mask2Former ([#2215](https://github.com/open-mmlab/mmsegmentation/pull/2215), [2255](https://github.com/open-mmlab/mmsegmentation/pull/2255)) ## Installation @@ -139,6 +138,8 @@ Supported methods: - [x] [Segmenter (ICCV'2021)](configs/segmenter) - [x] [SegFormer (NeurIPS'2021)](configs/segformer) - [x] [K-Net (NeurIPS'2021)](configs/knet) +- [x] [MaskFormer (NeurIPS'2021)](configs/maskformer) +- [x] [Mask2Former (CVPR'2022)](configs/mask2former) Supported datasets: @@ -194,6 +195,7 @@ This project is released under the [Apache 2.0 license](LICENSE). - [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. - [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. +- [MMEval](https://github.com/open-mmlab/mmeval): A unified evaluation library for multiple machine learning libraries. - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. diff --git a/README_zh-CN.md b/README_zh-CN.md index 975fca4ee..72abc867a 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -61,7 +61,7 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O ## 更新日志 -最新版本 v1.0.0rc1 在 2022.11.2 发布。 +最新版本 v1.0.0rc2 在 2022.12.6 发布。 如果想了解更多版本更新细节和历史信息,请阅读[更新日志](docs/en/notes/changelog.md)。 ## 安装 @@ -134,6 +134,8 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O - [x] [Segmenter (ICCV'2021)](configs/segmenter) - [x] [SegFormer (NeurIPS'2021)](configs/segformer) - [x] [K-Net (NeurIPS'2021)](configs/knet) +- [x] [MaskFormer (NeurIPS'2021)](configs/maskformer) +- [x] [Mask2Former (CVPR'2022)](configs/mask2former) 已支持的数据集: @@ -186,6 +188,7 @@ MMSegmentation 是一个由来自不同高校和企业的研发人员共同参 - [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练库 - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 - [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 +- [MMEval](https://github.com/open-mmlab/mmeval): 统一开放的跨框架算法评测库 - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 diff --git a/configs/mask2former/README.md b/configs/mask2former/README.md new file mode 100644 index 000000000..8881b0d66 --- /dev/null +++ b/configs/mask2former/README.md @@ -0,0 +1,72 @@ +# Mask2Former + +[Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) + +## Introduction + + + +Official Repo + +Code Snippet + +## Abstract + + + +Image segmentation is about grouping pixels with different semantics, e.g., category or instance membership, where each choice of semantics defines a task. While only the semantics of each task differ, current research focuses on designing specialized architectures for each task. 
We present Masked-attention Mask Transformer (Mask2Former), a new architecture capable of addressing any image segmentation task (panoptic, instance or semantic). Its key components include masked attention, which extracts localized features by constraining cross-attention within predicted mask regions. In addition to reducing the research effort by at least three times, it outperforms the best specialized architectures by a significant margin on four popular datasets. Most notably, Mask2Former sets a new state-of-the-art for panoptic segmentation (57.8 PQ on COCO), instance segmentation (50.1 AP on COCO) and semantic segmentation (57.7 mIoU on ADE20K). + +```bibtex +@inproceedings{cheng2021mask2former, + title={Masked-attention Mask Transformer for Universal Image Segmentation}, + author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar}, + journal={CVPR}, + year={2022} +} +@inproceedings{cheng2021maskformer, + title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, + author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov}, + journal={NeurIPS}, + year={2021} +} +``` + +### Usage + +- Mask2Former model needs to install [MMDetection](https://github.com/open-mmlab/mmdetection) first. + +```shell +pip install "mmdet>=3.0.0rc4" +``` + +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | +| ----------- | -------------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -----------------------------------------------------------------------------------------------------------------------------------------------------------: | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Mask2Former | R-50-D32 | 512x1024 | 90000 | 5806 | 9.17 | 80.44 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802.json) | +| Mask2Former | R-101-D32 | 512x1024 | 90000 | 6971 | 7.11 | 80.80 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-8ad528ea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628.json)) | +| Mask2Former | Swin-T | 512x1024 | 90000 | 6511 | 7.18 | 81.71 | - | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-290b34af.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501.json)) | +| Mask2Former | Swin-S | 512x1024 | 90000 | 8282 | 5.57 | 82.57 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-7c98854a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802.json)) | +| Mask2Former | Swin-B (in22k) | 512x1024 | 90000 | 11152 | 4.32 | 83.52 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-59a4379a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030.json)) | +| Mask2Former | Swin-L (in22k) | 512x1024 | 90000 | 16207 | 2.86 | 83.65 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-dc2c2ddd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901.json)) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | +| ----------- | -------------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Mask2Former | R-50-D32 | 512x512 | 160000 | 3385 | 26.59 | 47.87 | - | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-4c62652d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055.json)) | +| Mask2Former | R-101-D32 | 512x512 | 160000 | 4190 | 22.97 | 48.60 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b1169bc0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905.json)) | +| Mask2Former | Swin-T | 512x512 | 160000 | 3826 | 23.82 | 48.66 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-4341520b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230.json)) | +| Mask2Former | Swin-S | 512x512 | 160000 | 5034 | 19.69 | 51.24 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-ab263c11.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905.json)) | +| Mask2Former | Swin-B | 640x640 | 160000 | 5795 | 12.48 | 52.44 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-35e3a2c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118.json)) | +| Mask2Former | Swin-B (in22k) | 640x640 | 160000 | 5795 | 12.43 | 53.90 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-622e093b.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230.json)) | +| Mask2Former | Swin-L (in22k) | 640x640 | 160000 | 9077 | 8.81 | 56.01 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-5cc76a78.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933.json)) | + +Note: + +- All Mask2Former experiments were run on 8 A100 GPUs with 2 samples per GPU. +- As mentioned in [the official repo](https://github.com/facebookresearch/Mask2Former/issues/5), the results of Mask2Former are not very stable; the Mask2Former (Swin-S) result on ADE20K in the table is the median of 5 training runs, following the author's suggestion. +- The ResNet backbones used in Mask2Former models are standard `ResNet` rather than `ResNetV1c`. +- Test-time augmentation is not supported in MMSegmentation 1.x yet; we will add "ms+flip" results as soon as possible. diff --git a/configs/mask2former/mask2former.yml b/configs/mask2former/mask2former.yml new file mode 100644 index 000000000..78655fc52 --- /dev/null +++ b/configs/mask2former/mask2former.yml @@ -0,0 +1,290 @@ +Collections: +- Name: Mask2Former + Metadata: + Training Data: + - Usage + - Cityscapes + - ADE20K + Paper: + URL: https://arxiv.org/abs/2112.01527 + Title: Masked-attention Mask Transformer for Universal Image Segmentation + README: configs/mask2former/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Version: 3.x + Converted From: + Code: https://github.com/facebookresearch/Mask2Former +Models: +- Name: mask2former_r50_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Metadata: + backbone: R-50-D32 + crop size: (512,1024) + lr schd: 90000 + inference time (ms/im): + - value: 109.05 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,1024) + Training Memory (GB): 5806.0 + Results: + - Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.44 + Config: configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth +- Name: mask2former_r101_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Metadata: + backbone: R-101-D32 + crop size: (512,1024) + lr schd: 90000 + inference time (ms/im): + - value: 140.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,1024) + Training Memory (GB): 6971.0 + Results: + - Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.8 + Config: configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-8ad528ea.pth +- Name: mask2former_swin-t_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Metadata: + backbone: Swin-T + crop size: (512,1024) + lr schd: 90000 + inference time (ms/im): + - value: 139.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,1024) + Training Memory (GB): 6511.0 + Results: + - Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 81.71 + Config: configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-290b34af.pth +- Name: mask2former_swin-s_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Metadata: + backbone: Swin-S + crop size: (512,1024) + lr schd: 90000 + inference time (ms/im): + - value: 179.53 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,1024) + Training Memory (GB): 8282.0 + Results: + - Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 82.57 + Config: configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-7c98854a.pth +- Name: mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Metadata: + backbone: Swin-B (in22k) + crop size: (512,1024) + lr schd: 90000 + inference time (ms/im): + - value: 231.48 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,1024) + Training Memory (GB): 11152.0 + Results: + - Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 83.52 + Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-59a4379a.pth +- Name: mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Metadata: + backbone: Swin-L (in22k) + crop size: (512,1024) + lr schd: 90000 + inference time (ms/im): + - value: 349.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,1024) + Training Memory (GB): 16207.0 + Results: + - Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 83.65 + Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-dc2c2ddd.pth +- Name: mask2former_r50_8xb2-160k_ade20k-512x512 + In Collection: Mask2Former + Metadata: + backbone: R-50-D32 + crop size: (512,512) + lr schd: 160000 + inference time (ms/im): + - value: 37.61 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,512) + Training Memory (GB): 3385.0 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 47.87 + Config: 
configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-4c62652d.pth +- Name: mask2former_r101_8xb2-160k_ade20k-512x512 + In Collection: Mask2Former + Metadata: + backbone: R-101-D32 + crop size: (512,512) + lr schd: 160000 + inference time (ms/im): + - value: 43.54 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,512) + Training Memory (GB): 4190.0 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.6 + Config: configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b1169bc0.pth +- Name: mask2former_swin-t_8xb2-160k_ade20k-512x512 + In Collection: Mask2Former + Metadata: + backbone: Swin-T + crop size: (512,512) + lr schd: 160000 + inference time (ms/im): + - value: 41.98 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,512) + Training Memory (GB): 3826.0 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.66 + Config: configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-4341520b.pth +- Name: mask2former_swin-s_8xb2-160k_ade20k-512x512 + In Collection: Mask2Former + Metadata: + backbone: Swin-S + crop size: (512,512) + lr schd: 160000 + inference time (ms/im): + - value: 50.79 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,512) + Training Memory (GB): 5034.0 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 51.24 + Config: configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-ab263c11.pth +- Name: mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640 + In Collection: Mask2Former + Metadata: + backbone: Swin-B + crop size: (640,640) + lr schd: 160000 + inference time (ms/im): + - value: 80.13 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (640,640) + Training Memory (GB): 5795.0 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 52.44 + Config: configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-35e3a2c7.pth +- Name: mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640 + In Collection: Mask2Former + Metadata: + backbone: Swin-B (in22k) + crop size: (640,640) + lr schd: 160000 + inference time (ms/im): + - value: 80.45 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (640,640) + Training Memory (GB): 5795.0 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 53.9 + Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-622e093b.pth +- Name: mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640 + In Collection: Mask2Former + Metadata: + backbone: Swin-L (in22k) + crop size: (640,640) + lr schd: 160000 + inference time (ms/im): + - value: 113.51 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (640,640) + Training Memory (GB): 9077.0 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 56.01 + Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-5cc76a78.pth diff --git a/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py b/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py new file mode 100644 index 000000000..48f6c12d1 --- /dev/null +++ b/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,7 @@ +_base_ = ['./mask2former_r50_8xb2-160k_ade20k-512x512.py'] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 000000000..275a7dab5 --- /dev/null +++ b/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,7 @@ +_base_ = ['./mask2former_r50_8xb2-90k_cityscapes-512x1024.py'] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py b/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py new file mode 100644 index 000000000..598cabfb6 --- /dev/null +++ b/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,207 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/datasets/ade20k.py'] + +custom_imports = dict(imports='mmdet.models', allow_failed_imports=False) + +crop_size = (512, 512) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=crop_size, + test_cfg=dict(size_divisor=32)) +num_classes = 150 +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='ResNet', + depth=50, + deep_stem=False, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN', requires_grad=False), + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + decode_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type='mmdet.MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='mmdet.DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='mmdet.BaseTransformerLayer', + 
attn_cfgs=dict( + type='mmdet.MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', num_feats=128, + normalize=True), + transformer_decoder=dict( + type='mmdet.DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='mmdet.MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))), + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +# dataset config +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict( + type='RandomChoiceResize', + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type='ResizeShortestEdge', + max_size=2048), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +train_dataloader = dict(batch_size=2, dataset=dict(pipeline=train_pipeline)) + +# optimizer +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optimizer = dict( + type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999)) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=160000, + by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=160000, val_interval=5000) +val_cfg = dict(type='ValLoop') +test_cfg = 
dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=5000, + save_best='mIoU'), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 000000000..f92dda98a --- /dev/null +++ b/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,206 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/datasets/cityscapes.py'] + +custom_imports = dict(imports='mmdet.models', allow_failed_imports=False) + +crop_size = (512, 1024) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=crop_size, + test_cfg=dict(size_divisor=32)) +num_classes = 19 +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='ResNet', + depth=50, + deep_stem=False, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN', requires_grad=False), + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + decode_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type='mmdet.MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='mmdet.DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='mmdet.BaseTransformerLayer', + attn_cfgs=dict( + type='mmdet.MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', num_feats=128, + normalize=True), + transformer_decoder=dict( + type='mmdet.DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='mmdet.MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 
'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))), + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +# dataset config +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomChoiceResize', + scales=[int(1024 * x * 0.1) for x in range(5, 21)], + resize_type='ResizeShortestEdge', + max_size=4096), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# optimizer +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optimizer = dict( + type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999)) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=90000, + by_epoch=False) +] + +# training schedule for 90k +train_cfg = dict(type='IterBasedTrainLoop', max_iters=90000, val_interval=5000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=5000, + save_best='mIoU'), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
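+# For example (assuming MMEngine's linear LR scaling rule), enabling this with 4 GPUs x 2 samples per GPU would scale the optimizer lr by 8 / 16 = 0.5.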
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py b/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py new file mode 100644 index 000000000..56112dfa3 --- /dev/null +++ b/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py @@ -0,0 +1,237 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/datasets/ade20k_640x640.py' +] + +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_20220317-55b0104a.pth' # noqa +custom_imports = dict(imports='mmdet.models', allow_failed_imports=False) + +crop_size = (640, 640) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=crop_size) +num_classes = 150 + +depths = [2, 2, 18, 2] +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=128, + depths=depths, + num_heads=[4, 8, 16, 32], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict( + type='Mask2FormerHead', + in_channels=[128, 256, 512, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type='mmdet.MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='mmdet.DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='mmdet.BaseTransformerLayer', + attn_cfgs=dict( + type='mmdet.MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', num_feats=128, + normalize=True), + transformer_decoder=dict( + type='mmdet.DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='mmdet.MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', 
+ use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))), + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +# dataset config +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict( + type='RandomChoiceResize', + scales=[int(x * 0.1 * 640) for x in range(5, 21)], + resize_type='ResizeShortestEdge', + max_size=2560), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +train_dataloader = dict(batch_size=2, dataset=dict(pipeline=train_pipeline)) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999)) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) + +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=160000, + by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=160000, val_interval=5000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=5000, + save_best='mIoU'), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py b/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py new file mode 100644 index 000000000..f39a3c590 --- /dev/null +++ b/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py @@ -0,0 +1,5 @@ +_base_ = ['./mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py'] + +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_22k_20220317-e5c09f74.pth' # noqa +model = dict( + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained))) diff --git a/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 000000000..0c229c145 --- /dev/null +++ b/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,42 @@ +_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_22k_20220317-e5c09f74.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + pretrain_img_size=384, + embed_dims=128, + depths=depths, + num_heads=[4, 8, 16, 32], + window_size=12, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(in_channels=[128, 256, 512, 1024])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py b/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py new file mode 100644 index 000000000..f2657e884 --- /dev/null +++ b/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py @@ -0,0 +1,9 @@ +_base_ = ['./mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window12_384_22k_20220412-6580f57d.pth' # noqa + +model = dict( + backbone=dict( + embed_dims=192, + num_heads=[6, 12, 24, 48], + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(num_queries=100, in_channels=[192, 384, 768, 1536])) diff --git 
a/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 000000000..01a7b9988 --- /dev/null +++ b/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,42 @@ +_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window12_384_22k_20220412-6580f57d.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + pretrain_img_size=384, + embed_dims=192, + depths=depths, + num_heads=[6, 12, 24, 48], + window_size=12, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(in_channels=[192, 384, 768, 1536])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py b/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py new file mode 100644 index 000000000..a7796d569 --- /dev/null +++ b/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t_8xb2-160k_ade20k-512x512.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# 
optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 000000000..5f75544b1 --- /dev/null +++ b/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py b/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py new file mode 100644 index 000000000..9de3d242e --- /dev/null +++ b/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,52 @@ +_base_ = ['./mask2former_r50_8xb2-160k_ade20k-512x512.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa +depths = [2, 2, 6, 2] +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(in_channels=[96, 192, 384, 768])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + 
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 000000000..0abda6430 --- /dev/null +++ b/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,52 @@ +_base_ = ['./mask2former_r50_8xb2-90k_cityscapes-512x1024.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa +depths = [2, 2, 6, 2] +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(in_channels=[96, 192, 384, 768])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md new file mode 100644 index 000000000..5e33d17af --- /dev/null +++ b/configs/maskformer/README.md @@ -0,0 +1,60 @@ +# MaskFormer + +[MaskFormer: Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) + +## Introduction + + + +Official Repo + +Code Snippet + +## Abstract + + + +Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. 
Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models. + +
+ +```bibtex +@article{cheng2021per, + title={Per-pixel classification is not all you need for semantic segmentation}, + author={Cheng, Bowen and Schwing, Alex and Kirillov, Alexander}, + journal={Advances in Neural Information Processing Systems}, + volume={34}, + pages={17864--17875}, + year={2021} +} +``` + +### Usage + +- MaskFormer model needs to install [MMDetection](https://github.com/open-mmlab/mmdetection) first. + +```shell +pip install "mmdet>=3.0.0rc4" +``` + +## Results and models + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | +| ---------- | --------- | --------- | ------- | -------- | -------------- | ----- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| MaskFormer | R-50-D32 | 512x512 | 160000 | 3.29 | 42.20 | 44.29 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-cbd39cc1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724.json) | +| MaskFormer | R-101-D32 | 512x512 | 160000 | 4.12 | 34.90 | 45.11 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-c8e0931d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053.json) | +| MaskFormer | Swin-T | 512x512 | 160000 | 3.73 | 40.53 | 46.69 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-03550716.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813.json) | +| MaskFormer | Swin-S | 512x512 | 160000 | 5.33 | 26.98 | 49.36 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-5ab67e58.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710.json) | + +Note: + +- All experiments of MaskFormer are implemented with 8 V100 (32G) GPUs with 2 samplers per GPU. +- The results of MaskFormer are relatively not stable. The accuracy (mIoU) of model with `R-101-D32` is from 44.7 to 46.0, and with `Swin-S` is from 49.0 to 49.8. +- The ResNet backbones utilized in MaskFormer models are standard `ResNet` rather than `ResNetV1c`. +- Test time augmentation is not supported in MMSegmentation 1.x version yet, we would add "ms+flip" results as soon as possible. diff --git a/configs/maskformer/maskformer.yml b/configs/maskformer/maskformer.yml new file mode 100644 index 000000000..1b3d398e3 --- /dev/null +++ b/configs/maskformer/maskformer.yml @@ -0,0 +1,101 @@ +Collections: +- Name: MaskFormer + Metadata: + Training Data: + - Usage + - ADE20K + Paper: + URL: https://arxiv.org/abs/2107.06278 + Title: 'MaskFormer: Per-Pixel Classification is Not All You Need for Semantic + Segmentation' + README: configs/maskformer/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/dense_heads/maskformer_head.py#L21 + Version: dev-3.x + Converted From: + Code: https://github.com/facebookresearch/MaskFormer/ +Models: +- Name: maskformer_r50-d32_8xb2-160k_ade20k-512x512 + In Collection: MaskFormer + Metadata: + backbone: R-50-D32 + crop size: (512,512) + lr schd: 160000 + inference time (ms/im): + - value: 23.7 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,512) + Training Memory (GB): 3.29 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 44.29 + Config: configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-cbd39cc1.pth +- Name: maskformer_r101-d32_8xb2-160k_ade20k-512x512 + In Collection: MaskFormer + Metadata: + backbone: R-101-D32 + crop size: (512,512) + lr schd: 160000 + inference time (ms/im): + - value: 28.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,512) + Training Memory (GB): 4.12 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.11 + Config: configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-c8e0931d.pth +- Name: maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512 + In Collection: MaskFormer + Metadata: + backbone: Swin-T + crop size: (512,512) + lr schd: 160000 + inference time (ms/im): + - value: 24.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,512) + Training Memory (GB): 3.73 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.69 + Config: configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-03550716.pth +- Name: maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512 + In Collection: MaskFormer + Metadata: + backbone: Swin-S + crop size: 
(512,512) + lr schd: 160000 + inference time (ms/im): + - value: 37.06 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,512) + Training Memory (GB): 5.33 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 49.36 + Config: configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-5ab67e58.pth diff --git a/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py b/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py new file mode 100644 index 000000000..04bd37546 --- /dev/null +++ b/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,7 @@ +_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py b/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py new file mode 100644 index 000000000..7d8f65722 --- /dev/null +++ b/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,143 @@ +_base_ = [ + '../_base_/datasets/ade20k.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] +norm_cfg = dict(type='SyncBN', requires_grad=True) +crop_size = (512, 512) +data_preprocessor = dict( + type='SegDataPreProcessor', + size=crop_size, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255) +# model_cfg +num_classes = 150 +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_cfg=norm_cfg, + norm_eval=True, + style='pytorch', + contract_dilation=True, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + decode_head=dict( + type='MaskFormerHead', + in_channels=[256, 512, 1024, + 2048], # input channels of pixel_decoder modules + feat_channels=256, + in_index=[0, 1, 2, 3], + num_classes=150, + out_channels=256, + num_queries=100, + pixel_decoder=dict( + type='mmdet.PixelDecoder', + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU')), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', num_feats=128, + normalize=True), + transformer_decoder=dict( + type='mmdet.DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='mmdet.MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.1, + proj_drop=0.1, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.1, + dropout_layer=None, + add_identity=True), + # the following parameter was not used, + # just make current api happy + feedforward_channels=2048, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + 
type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=20.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=1.0), + train_cfg=dict( + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=1.0), + dict( + type='mmdet.FocalLossCost', + weight=20.0, + binary_input=True), + dict( + type='mmdet.DiceCost', + weight=1.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))), + # training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole'), +) +# optimizer +optimizer = dict( + type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.0001) +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict(custom_keys={ + 'backbone': dict(lr_mult=0.1), + })) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=160000, + by_epoch=False) +] + +# In MaskFormer implementation we use batch size 2 per GPU as default +train_dataloader = dict(batch_size=2, num_workers=2) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py b/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 000000000..2cbc038ac --- /dev/null +++ b/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,79 @@ +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa +_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py' +backbone_norm_cfg = dict(type='LN', requires_grad=True) +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + pretrain_img_size=224, + embed_dims=96, + patch_size=4, + window_size=7, + mlp_ratio=4, + depths=depths, + num_heads=[3, 6, 12, 24], + strides=(4, 2, 2, 2), + out_indices=(0, 1, 2, 3), + qkv_bias=True, + qk_scale=None, + patch_norm=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + use_abs_pos_embed=False, + act_cfg=dict(type='GELU'), + norm_cfg=backbone_norm_cfg, + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)), + decode_head=dict( + type='MaskFormerHead', + in_channels=[96, 192, 384, + 768], # input channels of pixel_decoder modules + )) + +# optimizer +optimizer = dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01) +# set all layers in backbone to lr_mult=1.0 +# set all norm layers, position_embeding, +# query_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=1.0, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +embed_multi = dict(decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer 
+optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict(custom_keys=custom_keys)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + eta_min=0.0, + power=1.0, + begin=1500, + end=160000, + by_epoch=False, + ) +] diff --git a/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py b/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 000000000..aa242dbe3 --- /dev/null +++ b/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,81 @@ +_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py' + +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa +backbone_norm_cfg = dict(type='LN', requires_grad=True) +depths = [2, 2, 6, 2] +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + pretrain_img_size=224, + embed_dims=96, + patch_size=4, + window_size=7, + mlp_ratio=4, + depths=depths, + num_heads=[3, 6, 12, 24], + strides=(4, 2, 2, 2), + out_indices=(0, 1, 2, 3), + qkv_bias=True, + qk_scale=None, + patch_norm=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + use_abs_pos_embed=False, + act_cfg=dict(type='GELU'), + norm_cfg=backbone_norm_cfg, + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)), + decode_head=dict( + type='MaskFormerHead', + in_channels=[96, 192, 384, + 768], # input channels of pixel_decoder modules + )) + +# optimizer +optimizer = dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01) + +# set all layers in backbone to lr_mult=1.0 +# set all norm layers, position_embeding, +# query_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=1.0, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +embed_multi = dict(decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict(custom_keys=custom_keys)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + eta_min=0.0, + power=1.0, + begin=1500, + end=160000, + by_epoch=False, + ) +] diff --git a/demo/MMSegmentation_Tutorial.ipynb b/demo/MMSegmentation_Tutorial.ipynb index f679f997a..89d6e5261 100644 --- a/demo/MMSegmentation_Tutorial.ipynb +++ b/demo/MMSegmentation_Tutorial.ipynb @@ -33,7 +33,7 @@ "## Install MMSegmentation\n", "This step may take several minutes. \n", "\n", - "We use PyTorch 1.10 and CUDA 11.1 for this tutorial. You may install other versions by change the version number in pip install command. " + "We use PyTorch 1.12 and CUDA 11.3 for this tutorial. 
You may install other versions by change the version number in pip install command. " ] }, { @@ -67,13 +67,13 @@ "outputs": [], "source": [ "# Install PyTorch\n", - "!conda install pytorch=1.10.0 torchvision cudatoolkit=11.1 -c pytorch\n", + "!conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch\n", "# Install mim\n", "!pip install -U openmim\n", "# Install mmengine\n", "!mim install mmengine\n", "# Install MMCV\n", - "!mim install 'mmcv >= 2.0.0rc1'" + "!mim install 'mmcv >= 2.0.0rc1'\n" ] }, { @@ -500,16 +500,17 @@ }, "outputs": [], "source": [ - "from mmseg.apis import inference_model, show_result_pyplot\n", + "from mmseg.apis import init_model, inference_model, show_result_pyplot\n", "\n", - "model=runner.model\n", - "model.cfg=cfg\n", + "# Init the model from the config and the checkpoint\n", + "checkpoint_path = './work_dirs/tutorial/iter_200.pth'\n", + "model = init_model(cfg, checkpoint_path, 'cuda:0')\n", "\n", "img = mmcv.imread('iccv09Data/images/6000124.jpg')\n", "result = inference_model(model, img)\n", "plt.figure(figsize=(8, 6))\n", - "vis_result = show_result_pyplot(model, img, result, palette)\n", - "plt.imshow(mmcv.bgr2rgb(vis_result))" + "vis_result = show_result_pyplot(model, img, result)\n", + "plt.imshow(mmcv.bgr2rgb(vis_result))\n" ] } ], @@ -522,7 +523,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3.7.13 ('pt1.12')", + "display_name": "Python 3.8.5 ('tensorflow')", "language": "python", "name": "python3" }, @@ -536,7 +537,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.13" + "version": "3.8.5" }, "pycharm": { "stem_cell": { @@ -549,7 +550,7 @@ }, "vscode": { "interpreter": { - "hash": "ffdb7915c29738c259ec7ee5d0d1b9253c264f1fd267d45dd77f1a420396c120" + "hash": "20d4b83e0c8b3730b580c42434163d64f4b735d580303a8fade7c849d4d29eba" } } }, diff --git a/docker/Dockerfile b/docker/Dockerfile index 56e70ebf4..9ee49ab35 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,7 +1,7 @@ ARG PYTORCH="1.11.0" ARG CUDA="11.3" ARG CUDNN="8" -ARG MMCV="2.0.0rc1" +ARG MMCV="2.0.0rc3" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile index e43baebd8..bb150076d 100644 --- a/docker/serve/Dockerfile +++ b/docker/serve/Dockerfile @@ -3,8 +3,8 @@ ARG CUDA="11.3" ARG CUDNN="8" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel -ARG MMCV="2.0.0rc1" -ARG MMSEG="1.0.0rc1" +ARG MMCV="2.0.0rc3" +ARG MMSEG="1.0.0rc2" ENV PYTHONUNBUFFERED TRUE diff --git a/docs/en/advanced_guides/transforms.md b/docs/en/advanced_guides/transforms.md index 3c5aa6f57..d42d61a9e 100644 --- a/docs/en/advanced_guides/transforms.md +++ b/docs/en/advanced_guides/transforms.md @@ -1,5 +1,13 @@ # Data Transforms +In this tutorial, we introduce the design of transforms pipeline in MMSegmentation. + +The structure of this guide is as follows: + +- [Data Transforms](#data-transforms) + - [Design of Data pipelines](#design-of-data-pipelines) + - [Customization data transformation](#customization-data-transformation) + ## Design of Data pipelines Following typical conventions, we use `Dataset` and `DataLoader` for data loading @@ -10,13 +18,31 @@ we introduce a new `DataContainer` type in MMCV to help collect and distribute data of different size. See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details. -The data preparation pipeline and the dataset is decomposed. 
Usually a dataset +In 1.x version of MMSegmentation, all data transformations are inherited from [`BaseTransform`](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/transforms/base.py#L6). +The input and output types of transformations are both dict. A simple example is as follows: + +```python +>>> from mmseg.datasets.transforms import LoadAnnotations +>>> transforms = LoadAnnotations() +>>> img_path = './data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png.png' +>>> gt_path = './data/cityscapes/gtFine/train/aachen/aachen_000015_000019_gtFine_instanceTrainIds.png' +>>> results = dict( +>>> img_path=img_path, +>>> seg_map_path=gt_path, +>>> reduce_zero_label=False, +>>> seg_fields=[]) +>>> data_dict = transforms(results) +>>> print(data_dict.keys()) +dict_keys(['img_path', 'seg_map_path', 'reduce_zero_label', 'seg_fields', 'gt_seg_map']) +``` + +The data preparation pipeline and the dataset are decomposed. Usually a dataset defines how to process the annotations and a data pipeline defines all the steps to prepare a data dict. -A pipeline consists of a sequence of operations. Each operation takes a dict as input and also output a dict for the next transform. +A pipeline consists of a sequence of operations. Each operation takes a dict as input and also outputs a dict for the next transform. The operations are categorized into data loading, pre-processing, formatting and test-time augmentation. -Here is an pipeline example for PSPNet. +Here is a pipeline example for PSPNet. ```python crop_size = (512, 1024) @@ -37,53 +63,110 @@ test_pipeline = [ dict(type='LoadImageFromFile'), dict(type='Resize', scale=(2048, 1024), keep_ratio=True), # add loading annotation after ``Resize`` because ground truth - # does not need to do resize data transform + # does not need to resize data transform dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] ``` -For each operation, we list the related dict fields that are added/updated/removed. -Before pipelines, the information we can directly obtain from the datasets are img_path, seg_map_path. +For each operation, we list the related dict fields that are `added`/`updated`/`removed`. +Before pipelines, the information we can directly obtain from the datasets are `img_path` and `seg_map_path`. ### Data loading -`LoadImageFromFile` +`LoadImageFromFile`: Load an image from file. -- add: img, img_shape, ori_shape +- add: `img`, `img_shape`, `ori_shape` -`LoadAnnotations` +`LoadAnnotations`: Load semantic segmentation maps provided by dataset. -- add: seg_fields, gt_seg_map +- add: `seg_fields`, `gt_seg_map` ### Pre-processing -`RandomResize` +`RandomResize`: Random resize image & segmentation map. -- add: scale, scale_factor, keep_ratio -- update: img, img_shape, gt_seg_map +- add: `scale`, `scale_factor`, `keep_ratio` +- update: `img`, `img_shape`, `gt_seg_map` -`Resize` +`Resize`: Resize image & segmentation map. -- add: scale, scale_factor, keep_ratio -- update: img, gt_seg_map, img_shape +- add: `scale`, `scale_factor`, `keep_ratio` +- update: `img`, `gt_seg_map`, `img_shape` -`RandomCrop` +`RandomCrop`: Random crop image & segmentation map. -- update: img, pad_shape, gt_seg_map +- update: `img`, `gt_seg_map`, `img_shape`. -`RandomFlip` +`RandomFlip`: Flip the image & segmentation map. 
-- add: flip, flip_direction -- update: img, gt_seg_map +- add: `flip`, `flip_direction` +- update: `img`, `gt_seg_map` -`PhotoMetricDistortion` +`PhotoMetricDistortion`: Apply photometric distortion to the image sequentially; +every transformation is applied with a probability of 0.5. +Random contrast is applied either second or second to last (mode 0 or 1 below, respectively). -- update: img +``` +1. random brightness +2. random contrast (mode 0) +3. convert color from BGR to HSV +4. random saturation +5. random hue +6. convert color from HSV to BGR +7. random contrast (mode 1) +``` + +- update: `img` ### Formatting -`PackSegInputs` +`PackSegInputs`: Pack the input data for semantic segmentation. -- add: inputs, data_sample +- add: `inputs`, `data_sample` - remove: keys specified by `meta_keys` (merged into the metainfo of data_sample), all other keys + +## Customization data transformation + +A customized data transformation must inherit from `BaseTransform` and implement the `transform` function. +Here we use a simple flipping transformation as an example: + +```python +import mmcv +from mmcv.transforms import BaseTransform, TRANSFORMS + +@TRANSFORMS.register_module() +class MyFlip(BaseTransform): + def __init__(self, direction: str): + super().__init__() + self.direction = direction + + def transform(self, results: dict) -> dict: + img = results['img'] + results['img'] = mmcv.imflip(img, direction=self.direction) + return results +``` + +Thus, we can instantiate a `MyFlip` object and use it to process the data dict. + +```python +import numpy as np + +transform = MyFlip(direction='horizontal') +data_dict = {'img': np.random.rand(224, 224, 3)} +data_dict = transform(data_dict) +processed_img = data_dict['img'] +``` + +Alternatively, we can use the `MyFlip` transformation in the data pipeline of our config file. + +```python +pipeline = [ + ... + dict(type='MyFlip', direction='horizontal'), + ... +] +``` + +Note that if you want to use `MyFlip` in your config, you must ensure the file containing `MyFlip` is imported at runtime.
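One way to guarantee that import, sketched here as an assumption rather than a line taken from the patch, is the `custom_imports` field supported by MMEngine-style configs; the module name `my_transforms` is hypothetical.

```python
# Hypothetical config fragment: import the module that defines and registers
# `MyFlip` before the pipeline is built, so the TRANSFORMS registry can
# resolve the string 'MyFlip'.
custom_imports = dict(imports=['my_transforms'], allow_failed_imports=False)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='MyFlip', direction='horizontal'),
    dict(type='PackSegInputs')
]
```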
diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index 9c468ab9e..a808f57c6 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,47 @@ # Changelog of v1.x +## v1.0.0rc2(6/12/2022) + +### Highlights + +- Support MaskFormer ([#2215](https://github.com/open-mmlab/mmsegmentation/pull/2215)) +- Support Mask2Former ([#2255](https://github.com/open-mmlab/mmsegmentation/pull/2255)) + +### Features + +- Add ResizeShortestEdge transform ([#2339](https://github.com/open-mmlab/mmsegmentation/pull/2339)) +- Support padding in data pre-processor for model testing([#2290](https://github.com/open-mmlab/mmsegmentation/pull/2290)) +- Fix the problem of post-processing not removing padding ([#2367](https://github.com/open-mmlab/mmsegmentation/pull/2367)) + +### Bug fix + +- Fix links in README ([#2024](https://github.com/open-mmlab/mmsegmentation/pull/2024)) +- Fix swin load state_dict ([#2304](https://github.com/open-mmlab/mmsegmentation/pull/2304)) +- Fix typo of BaseSegDataset docstring ([#2322](https://github.com/open-mmlab/mmsegmentation/pull/2322)) +- Fix the bug in the visualization step ([#2326](https://github.com/open-mmlab/mmsegmentation/pull/2326)) +- Fix ignore class id from -1 to 255 in BaseSegDataset ([#2332](https://github.com/open-mmlab/mmsegmentation/pull/2332)) +- Fix KNet IterativeDecodeHead bug ([#2334](https://github.com/open-mmlab/mmsegmentation/pull/2334)) +- Add input argument for datasets ([#2379](https://github.com/open-mmlab/mmsegmentation/pull/2379)) +- Fix typo in warning on binary classification ([#2382](https://github.com/open-mmlab/mmsegmentation/pull/2382)) + +### Enhancement + +- Fix ci for 1.x ([#2011](https://github.com/open-mmlab/mmsegmentation/pull/2011), [#2019](https://github.com/open-mmlab/mmsegmentation/pull/2019)) +- Fix lint and pre-commit hook ([#2308](https://github.com/open-mmlab/mmsegmentation/pull/2308)) +- Add `data` string in .gitignore file in dev-1.x branch ([#2336](https://github.com/open-mmlab/mmsegmentation/pull/2336)) +- Make scipy as a default dependency in runtime ([#2362](https://github.com/open-mmlab/mmsegmentation/pull/2362)) +- Delete mmcls in runtime.txt ([#2368](https://github.com/open-mmlab/mmsegmentation/pull/2368)) + +### Documentation + +- Update configuration documentation ([#2048](https://github.com/open-mmlab/mmsegmentation/pull/2048)) +- Update inference documentation ([#2052](https://github.com/open-mmlab/mmsegmentation/pull/2052)) +- Update train test documentation ([#2061](https://github.com/open-mmlab/mmsegmentation/pull/2061)) +- Update get started documentatin ([#2148](https://github.com/open-mmlab/mmsegmentation/pull/2148)) +- Update transforms documentation ([#2088](https://github.com/open-mmlab/mmsegmentation/pull/2088)) +- Add MMEval projects like in README ([#2259](https://github.com/open-mmlab/mmsegmentation/pull/2259)) +- Translate the visualization.md ([#2298](https://github.com/open-mmlab/mmsegmentation/pull/2298)) + ## v1.0.0rc1 (2/11/2022) ### Highlights diff --git a/docs/en/notes/faq.md b/docs/en/notes/faq.md index 1efb481d8..4868d138a 100644 --- a/docs/en/notes/faq.md +++ b/docs/en/notes/faq.md @@ -6,31 +6,32 @@ We list some common troubles faced by many users and their corresponding solutio The compatible MMSegmentation and MMCV versions are as below. Please install the correct version of MMCV to avoid installation issues. 
-| MMSegmentation version | MMCV version | MMClassification version | -| :--------------------: | :-------------------------: | :----------------------: | -| 1.0.0rc1 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 | -| 1.0.0rc0 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 | -| master | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.24.1 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.23.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.22.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.21.1 | mmcv-full>=1.4.4, \<=1.6.0 | Not required | -| 0.20.2 | mmcv-full>=1.3.13, \<=1.6.0 | Not required | -| 0.19.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | -| 0.18.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | -| 0.17.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.16.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.15.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.14.1 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.14.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | -| 0.13.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | -| 0.12.0 | mmcv-full>=1.1.4, \<1.3.2 | Not required | -| 0.11.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | -| 0.10.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | -| 0.9.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | -| 0.8.0 | mmcv-full>=1.1.4, \<1.2.0 | Not required | -| 0.7.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | -| 0.6.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | +| MMSegmentation version | MMCV version | MMClassification (optional) version | MMDetection (optional) version | +| :--------------------: | :-------------------------: | :---------------------------------: | :----------------------------: | +| 1.0.0rc2 | mmcv >= 2.0.0rc3 | mmcls>=1.0.0rc0 | mmdet>=3.0.0rc4 | +| 1.0.0rc1 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 | Not required | +| 1.0.0rc0 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 | Not required | +| master | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required | +| 0.24.1 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required | +| 0.23.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required | +| 0.22.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required | +| 0.21.1 | mmcv-full>=1.4.4, \<=1.6.0 | Not required | Not required | +| 0.20.2 | mmcv-full>=1.3.13, \<=1.6.0 | Not required | Not required | +| 0.19.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | Not required | +| 0.18.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | Not required | +| 0.17.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required | +| 0.16.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required | +| 0.15.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required | +| 0.14.1 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required | +| 0.14.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | Not required | +| 0.13.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | Not required | +| 0.12.0 | mmcv-full>=1.1.4, \<1.3.2 | Not required | Not required | +| 0.11.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | Not required | +| 0.10.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | Not required | +| 0.9.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | Not required | +| 0.8.0 | mmcv-full>=1.1.4, \<1.2.0 | Not required | Not required | +| 0.7.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | Not required | +| 0.6.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | Not required | ## How to know the number of GPUs needed to train the model diff --git a/docs/zh_cn/user_guides/visualization.md 
b/docs/zh_cn/user_guides/visualization.md index f2b2fc02e..ac8b9e289 100644 --- a/docs/zh_cn/user_guides/visualization.md +++ b/docs/zh_cn/user_guides/visualization.md @@ -1 +1,173 @@ # 可视化 + +MMSegmentation 1.x 提供了简便的方式监控训练时的状态以及可视化在模型预测时的数据。 + +## 训练状态监控 + +MMSegmentation 1.x 使用 TensorBoard 来监控训练时候的状态。 + +### TensorBoard 的配置 + +安装 TensorBoard 的过程可以按照 [官方安装指南](https://www.tensorflow.org/install) ,具体的步骤如下: + +```shell +pip install tensorboardX +pip install future tensorboard +``` + +在配置文件 `default_runtime.py` 的 `vis_backend` 中添加 `TensorboardVisBackend`。 + +```python +vis_backends = [dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend')] +visualizer = dict( + type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer') +``` + +### 检查 TensorBoard 中的标量 + +启动训练实验的命令如下 + +```shell +python tools/train.py configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py --work-dir work_dir/test_visual +``` + +开始训练后找到 `work_dir` 中的 `vis_data` 路径,例如:本次特定测试的 vis_data 路径如下所示: + +```shell +work_dirs/test_visual/20220810_115248/vis_data +``` + +vis_data 路径中的标量文件包括了学习率、损失函数和 data_time 等,还记录了指标结果,您可以参考 MMEngine 中的 [记录日志教程](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/logging.html) 中的日志教程来帮助记录自己定义的数据。 Tensorboard 的可视化结果使用下面的命令执行: + +```shell +tensorboard --logdir work_dirs/test_visual/20220810_115248/vis_data +``` + +## 数据和结果的可视化 + +### 模型测试或验证期间的可视化数据样本 + +MMSegmentation 提供了 `SegVisualizationHook` ,它是一个可以用于可视化 ground truth 和在模型测试和验证期间的预测分割结果的[钩子](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/hook.html) 。 它的配置在 `default_hooks` 中,更多详细信息请参见 [执行器教程](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/runner.html)。 + +例如,在 `_base_/schedules/schedule_20k.py` 中,修改 `SegVisualizationHook` 配置,将 `draw` 设置为 `True` 以启用网络推理结果的存储,`interval` 表示预测结果的采样间隔, 设置为 1 时,将保存网络的每个推理结果。 `interval` 默认设置为 50: + +```python +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook', draw=True, interval=1)) + +``` + +启动训练实验后,可视化结果将在 validation loop 存储到本地文件夹中,或者在一个数据集上启动评估模型时,预测结果将存储在本地。本地的可视化的存储结果保存在 `$WORK_DIRS/vis_data` 下的 `vis_image` 中,例如: + +```shell +work_dirs/test_visual/20220810_115248/vis_data/vis_image +``` + +另外,如果在 `vis_backends` 中添加 `TensorboardVisBackend` ,如 [TensorBoard 的配置](#tensorboard-configuration),我们还可以运行下面的命令在 TensorBoard 中查看它们: + +```shell +tensorboard --logdir work_dirs/test_visual/20220810_115248/vis_data +``` + +### 可视化单个数据样本 + +如果你想可视化单个样本数据,我们建议使用 `SegLocalVisualizer` 。 + +`SegLocalVisualizer`是继承自 MMEngine 中`Visualizer` 类的子类,适用于 MMSegmentation 可视化,有关`Visualizer`的详细信息请参考在 MMEngine 中的[可视化教程](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html) 。 + +以下是一个关于 `SegLocalVisualizer` 的示例,首先你可以使用下面的命令下载这个案例中的数据: + +
+ +
+ +```shell +wget https://user-images.githubusercontent.com/24582831/189833109-eddad58f-f777-4fc0-b98a-6bd429143b06.png --output-document aachen_000000_000019_leftImg8bit.png +wget https://user-images.githubusercontent.com/24582831/189833143-15f60f8a-4d1e-4cbb-a6e7-5e2233869fac.png --output-document aachen_000000_000019_gtFine_labelTrainIds.png +``` + +然后你可以找到他们本地的路径和使用下面的脚本文件对其进行可视化: + +```python +import mmcv +import os.path as osp +import torch + +# `PixelData` 是 MMEngine 中用于定义像素级标注或预测的数据结构。 +# 请参考下面的MMEngine数据结构教程文件: +# https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/data_element.html#pixeldata + +from mmengine.structures import PixelData + +# `SegDataSample` 是在 MMSegmentation 中定义的不同组件之间的数据结构接口, +# 它包括 ground truth、语义分割的预测结果和预测逻辑。 +# 详情请参考下面的 `SegDataSample` 教程文件: +# https://github.com/open-mmlab/mmsegmentation/blob/1.x/docs/en/advanced_guides/structures.md + +from mmseg.structures import SegDataSample +from mmseg.visualization import SegLocalVisualizer + +out_file = 'out_file_cityscapes' +save_dir = './work_dirs' + +image = mmcv.imread( + osp.join( + osp.dirname(__file__), + './aachen_000000_000019_leftImg8bit.png' + ), + 'color') +sem_seg = mmcv.imread( + osp.join( + osp.dirname(__file__), + './aachen_000000_000019_gtFine_labelTrainIds.png' # noqa + ), + 'unchanged') +sem_seg = torch.from_numpy(sem_seg) +gt_sem_seg_data = dict(data=sem_seg) +gt_sem_seg = PixelData(**gt_sem_seg_data) +data_sample = SegDataSample() +data_sample.gt_sem_seg = gt_sem_seg + +seg_local_visualizer = SegLocalVisualizer( + vis_backends=[dict(type='LocalVisBackend')], + save_dir=save_dir) + +# 数据集的元信息通常包括类名的 `classes` 和 +# 用于可视化每个前景颜色的 `palette` 。 +# 所有类名和调色板都在此文件中定义: +# https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/utils/class_names.py + +seg_local_visualizer.dataset_meta = dict( + classes=('road', 'sidewalk', 'building', 'wall', 'fence', + 'pole', 'traffic light', 'traffic sign', + 'vegetation', 'terrain', 'sky', 'person', 'rider', + 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle'), + palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70], + [102, 102, 156], [190, 153, 153], [153, 153, 153], + [250, 170, 30], [220, 220, 0], [107, 142, 35], + [152, 251, 152], [70, 130, 180], [220, 20, 60], + [255, 0, 0], [0, 0, 142], [0, 0, 70], + [0, 60, 100], [0, 80, 100], [0, 0, 230], + [119, 11, 32]]) + +# 当`show=True`时,直接显示结果, +# 当 `show=False`时,结果将保存在本地文件夹中。 + +seg_local_visualizer.add_datasample(out_file, image, + data_sample, show=False) +``` + +可视化后的图像结果和它的对应的 ground truth 图像可以在 `./work_dirs/vis_data/vis_image/` 路径找到,文件名字是:`out_file_cityscapes_0.png` : + +
+ +
+ +如果你想知道更多的关于可视化的使用指引,你可以参考 MMEngine 中的[可视化教程](<[https://mmengine.readthedocs.io/en/latest/advanced_tutorials/visualization.html](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/advanced_tutorials/visualization.md)>) diff --git a/mmseg/__init__.py b/mmseg/__init__.py index 8a8593bc9..b39501352 100644 --- a/mmseg/__init__.py +++ b/mmseg/__init__.py @@ -7,7 +7,7 @@ from packaging.version import parse from .version import __version__, version_info -MMCV_MIN = '2.0.0rc1' +MMCV_MIN = '2.0.0rc3' MMCV_MAX = '2.1.0' MMENGINE_MIN = '0.1.0' MMENGINE_MAX = '1.0.0' diff --git a/mmseg/datasets/__init__.py b/mmseg/datasets/__init__.py index bf506eafa..bd04a5a67 100644 --- a/mmseg/datasets/__init__.py +++ b/mmseg/datasets/__init__.py @@ -22,7 +22,8 @@ from .transforms import (CLAHE, AdjustGamma, GenerateEdge, LoadAnnotations, LoadBiomedicalImageFromFile, LoadImageFromNDArray, PackSegInputs, PhotoMetricDistortion, RandomCrop, RandomCutOut, RandomMosaic, RandomRotate, Rerange, - ResizeToMultiple, RGB2Gray, SegRescale) + ResizeShortestEdge, ResizeToMultiple, RGB2Gray, + SegRescale) from .voc import PascalVOCDataset __all__ = [ @@ -36,5 +37,5 @@ __all__ = [ 'RandomCutOut', 'RandomMosaic', 'PackSegInputs', 'ResizeToMultiple', 'LoadImageFromNDArray', 'LoadBiomedicalImageFromFile', 'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge', - 'DecathlonDataset', 'LIPDataset' + 'DecathlonDataset', 'LIPDataset', 'ResizeShortestEdge' ] diff --git a/mmseg/datasets/ade.py b/mmseg/datasets/ade.py index 1f97fcb2c..e9bdae742 100644 --- a/mmseg/datasets/ade.py +++ b/mmseg/datasets/ade.py @@ -80,9 +80,13 @@ class ADE20KDataset(BaseSegDataset): [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], [102, 255, 0], [92, 0, 255]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) -> None: super().__init__( - img_suffix='.jpg', - seg_map_suffix='.png', - reduce_zero_label=True, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) diff --git a/mmseg/datasets/basesegdataset.py b/mmseg/datasets/basesegdataset.py index 4476a1eda..e97f8ca9d 100644 --- a/mmseg/datasets/basesegdataset.py +++ b/mmseg/datasets/basesegdataset.py @@ -65,7 +65,7 @@ class BaseSegDataset(BaseDataset): instantiation. In some cases, such as visualization, only the meta information of the dataset is needed, which is not necessary to load annotation file. ``Basedataset`` can skip load annotations to - save time by set ``lazy_init=False``. Defaults to False. + save time by set ``lazy_init=True``. Defaults to False. max_refetch (int, optional): If ``Basedataset.prepare_data`` get a None img. The maximum extra number of cycles to get a valid image. Defaults to 1000. 
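The dataset diffs in this part of the patch (ADE20K above, and the datasets that follow) stop hard-coding `img_suffix`, `seg_map_suffix` and `reduce_zero_label` and instead expose them as constructor arguments with the old values as defaults. A minimal config sketch of what that allows; the data paths and the `_sem.png` suffix are illustrative assumptions, not values from any shipped dataset:

```python
# Hypothetical: reuse ADE20KDataset on ADE20K-style data whose annotation
# maps end in '_sem.png' by overriding the new keyword arguments directly,
# instead of writing a dataset subclass.
train_dataloader = dict(
    batch_size=4,
    num_workers=4,
    dataset=dict(
        type='ADE20KDataset',
        data_root='data/my_ade_like_dataset',
        data_prefix=dict(
            img_path='images/training',
            seg_map_path='annotations/training'),
        img_suffix='.jpg',
        seg_map_suffix='_sem.png',
        reduce_zero_label=True,
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations'),
            dict(type='PackSegInputs')
        ]))
```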
@@ -179,7 +179,7 @@ class BaseSegDataset(BaseDataset): f'subset of classes {old_classes} in METAINFO.') for i, c in enumerate(old_classes): if c not in new_classes: - label_map[i] = -1 + label_map[i] = 255 else: label_map[i] = new_classes.index(c) return label_map diff --git a/mmseg/datasets/chase_db1.py b/mmseg/datasets/chase_db1.py index 71139f2aa..5cc1fc567 100644 --- a/mmseg/datasets/chase_db1.py +++ b/mmseg/datasets/chase_db1.py @@ -17,10 +17,14 @@ class ChaseDB1Dataset(BaseSegDataset): classes=('background', 'vessel'), palette=[[120, 120, 120], [6, 230, 230]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='_1stHO.png', + reduce_zero_label=False, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='_1stHO.png', - reduce_zero_label=False, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) assert self.file_client.exists(self.data_prefix['img_path']) diff --git a/mmseg/datasets/coco_stuff.py b/mmseg/datasets/coco_stuff.py index b0891a387..1e1574d97 100644 --- a/mmseg/datasets/coco_stuff.py +++ b/mmseg/datasets/coco_stuff.py @@ -91,6 +91,9 @@ class COCOStuffDataset(BaseSegDataset): [192, 192, 0], [128, 64, 96], [192, 32, 64], [192, 64, 128], [64, 192, 96], [64, 160, 64], [64, 64, 0]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='_labelTrainIds.png', + **kwargs) -> None: super().__init__( - img_suffix='.jpg', seg_map_suffix='_labelTrainIds.png', **kwargs) + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/mmseg/datasets/dark_zurich.py b/mmseg/datasets/dark_zurich.py index c59249a02..9b5393fa9 100644 --- a/mmseg/datasets/dark_zurich.py +++ b/mmseg/datasets/dark_zurich.py @@ -7,8 +7,9 @@ from .cityscapes import CityscapesDataset class DarkZurichDataset(CityscapesDataset): """DarkZurichDataset dataset.""" - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='_rgb_anon.png', + seg_map_suffix='_gt_labelTrainIds.png', + **kwargs) -> None: super().__init__( - img_suffix='_rgb_anon.png', - seg_map_suffix='_gt_labelTrainIds.png', - **kwargs) + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/mmseg/datasets/drive.py b/mmseg/datasets/drive.py index fbaf729be..c42e18e71 100644 --- a/mmseg/datasets/drive.py +++ b/mmseg/datasets/drive.py @@ -17,10 +17,14 @@ class DRIVEDataset(BaseSegDataset): classes=('background', 'vessel'), palette=[[120, 120, 120], [6, 230, 230]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='_manual1.png', + reduce_zero_label=False, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='_manual1.png', - reduce_zero_label=False, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) assert self.file_client.exists(self.data_prefix['img_path']) diff --git a/mmseg/datasets/hrf.py b/mmseg/datasets/hrf.py index f8d330918..0df6ccc49 100644 --- a/mmseg/datasets/hrf.py +++ b/mmseg/datasets/hrf.py @@ -17,10 +17,14 @@ class HRFDataset(BaseSegDataset): classes=('background', 'vessel'), palette=[[120, 120, 120], [6, 230, 230]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='.png', - reduce_zero_label=False, + img_suffix=img_suffix, + 
seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) assert self.file_client.exists(self.data_prefix['img_path']) diff --git a/mmseg/datasets/isaid.py b/mmseg/datasets/isaid.py index a91a12b3b..d75cfcb7e 100644 --- a/mmseg/datasets/isaid.py +++ b/mmseg/datasets/isaid.py @@ -23,10 +23,14 @@ class iSAIDDataset(BaseSegDataset): [0, 0, 127], [0, 0, 191], [0, 0, 255], [0, 191, 127], [0, 127, 191], [0, 127, 255], [0, 100, 155]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='_instance_color_RGB.png', + ignore_index=255, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='_instance_color_RGB.png', - ignore_index=255, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + ignore_index=ignore_index, **kwargs) assert self.file_client.exists(self.data_prefix['img_path']) diff --git a/mmseg/datasets/isprs.py b/mmseg/datasets/isprs.py index 78df4e385..30af53c56 100644 --- a/mmseg/datasets/isprs.py +++ b/mmseg/datasets/isprs.py @@ -17,9 +17,13 @@ class ISPRSDataset(BaseSegDataset): palette=[[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0], [255, 255, 0], [255, 0, 0]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='.png', - reduce_zero_label=True, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) diff --git a/mmseg/datasets/lip.py b/mmseg/datasets/lip.py index 40a703ffd..3a32a193a 100644 --- a/mmseg/datasets/lip.py +++ b/mmseg/datasets/lip.py @@ -39,5 +39,9 @@ class LIPDataset(BaseSegDataset): [255, 170, 0], )) - def __init__(self, **kwargs) -> None: - super().__init__(img_suffix='.jpg', seg_map_suffix='.png', **kwargs) + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/mmseg/datasets/loveda.py b/mmseg/datasets/loveda.py index b5d80f25a..5c16db503 100644 --- a/mmseg/datasets/loveda.py +++ b/mmseg/datasets/loveda.py @@ -17,9 +17,13 @@ class LoveDADataset(BaseSegDataset): palette=[[255, 255, 255], [255, 0, 0], [255, 255, 0], [0, 0, 255], [159, 129, 183], [0, 255, 0], [255, 195, 128]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='.png', - reduce_zero_label=True, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) diff --git a/mmseg/datasets/night_driving.py b/mmseg/datasets/night_driving.py index 5e542194f..3ead91ec7 100644 --- a/mmseg/datasets/night_driving.py +++ b/mmseg/datasets/night_driving.py @@ -7,8 +7,9 @@ from .cityscapes import CityscapesDataset class NightDrivingDataset(CityscapesDataset): """NightDrivingDataset dataset.""" - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='_leftImg8bit.png', + seg_map_suffix='_gtCoarse_labelTrainIds.png', + **kwargs) -> None: super().__init__( - img_suffix='_leftImg8bit.png', - seg_map_suffix='_gtCoarse_labelTrainIds.png', - **kwargs) + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/mmseg/datasets/pascal_context.py b/mmseg/datasets/pascal_context.py index a98372352..a6b2fba7b 100644 --- a/mmseg/datasets/pascal_context.py +++ 
b/mmseg/datasets/pascal_context.py @@ -45,10 +45,14 @@ class PascalContextDataset(BaseSegDataset): [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]]) - def __init__(self, ann_file: str, **kwargs) -> None: + def __init__(self, + ann_file: str, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: super().__init__( - img_suffix='.jpg', - seg_map_suffix='.png', + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, ann_file=ann_file, reduce_zero_label=False, **kwargs) @@ -95,12 +99,17 @@ class PascalContextDataset59(BaseSegDataset): [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]]) - def __init__(self, ann_file: str, **kwargs): + def __init__(self, + ann_file: str, + img_suffix='.jpg', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs): super().__init__( - img_suffix='.jpg', - seg_map_suffix='.png', + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, ann_file=ann_file, - reduce_zero_label=True, + reduce_zero_label=reduce_zero_label, **kwargs) assert self.file_client.exists( self.data_prefix['img_path']) and osp.isfile(self.ann_file) diff --git a/mmseg/datasets/potsdam.py b/mmseg/datasets/potsdam.py index 808cf6ec7..6892de3dd 100644 --- a/mmseg/datasets/potsdam.py +++ b/mmseg/datasets/potsdam.py @@ -17,9 +17,13 @@ class PotsdamDataset(BaseSegDataset): palette=[[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0], [255, 255, 0], [255, 0, 0]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='.png', - reduce_zero_label=True, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) diff --git a/mmseg/datasets/stare.py b/mmseg/datasets/stare.py index 485470277..2bfce2344 100644 --- a/mmseg/datasets/stare.py +++ b/mmseg/datasets/stare.py @@ -16,10 +16,14 @@ class STAREDataset(BaseSegDataset): classes=('background', 'vessel'), palette=[[120, 120, 120], [6, 230, 230]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='.ah.png', + reduce_zero_label=False, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='.ah.png', - reduce_zero_label=False, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) assert self.file_client.exists(self.data_prefix['img_path']) diff --git a/mmseg/datasets/transforms/__init__.py b/mmseg/datasets/transforms/__init__.py index 09f6c655a..656f806e1 100644 --- a/mmseg/datasets/transforms/__init__.py +++ b/mmseg/datasets/transforms/__init__.py @@ -5,13 +5,15 @@ from .loading import (LoadAnnotations, LoadBiomedicalAnnotation, LoadImageFromNDArray) from .transforms import (CLAHE, AdjustGamma, GenerateEdge, PhotoMetricDistortion, RandomCrop, RandomCutOut, - RandomMosaic, RandomRotate, Rerange, ResizeToMultiple, - RGB2Gray, SegRescale) + RandomMosaic, RandomRotate, Rerange, + ResizeShortestEdge, ResizeToMultiple, RGB2Gray, + SegRescale) __all__ = [ 'LoadAnnotations', 'RandomCrop', 'SegRescale', 'PhotoMetricDistortion', 'RandomRotate', 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray', 'RandomCutOut', 'RandomMosaic', 'PackSegInputs', 'ResizeToMultiple', 'LoadImageFromNDArray', 'LoadBiomedicalImageFromFile', - 'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge' + 
'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge', + 'ResizeShortestEdge' ] diff --git a/mmseg/datasets/transforms/transforms.py b/mmseg/datasets/transforms/transforms.py index 46d3a66e0..772eddb4a 100644 --- a/mmseg/datasets/transforms/transforms.py +++ b/mmseg/datasets/transforms/transforms.py @@ -1226,3 +1226,87 @@ class GenerateEdge(BaseTransform): repr_str += f'edge_width={self.edge_width}, ' repr_str += f'ignore_index={self.ignore_index})' return repr_str + + +@TRANSFORMS.register_module() +class ResizeShortestEdge(BaseTransform): + """Resize the image and mask while keeping the aspect ratio unchanged. + + Modified from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/augmentation_impl.py#L130 # noqa:E501 + Copyright (c) Facebook, Inc. and its affiliates. + Licensed under the Apache-2.0 License + + This transform attempts to scale the shorter edge to the given + `scale`, as long as the longer edge does not exceed `max_size`. + If `max_size` is reached, then downscale so that the longer + edge does not exceed `max_size`. + + Required Keys: + + - img + - gt_seg_map (optional) + + Modified Keys: + + - img + - img_shape + - gt_seg_map (optional)) + + Added Keys: + + - scale + - scale_factor + - keep_ratio + + + Args: + scale (Union[int, Tuple[int, int]]): The target short edge length. + If it's tuple, will select the min value as the short edge length. + max_size (int): The maximum allowed longest edge length. + """ + + def __init__(self, scale: Union[int, Tuple[int, int]], + max_size: int) -> None: + super().__init__() + self.scale = scale + self.max_size = max_size + + # Create a empty Resize object + self.resize = TRANSFORMS.build({ + 'type': 'Resize', + 'scale': 0, + 'keep_ratio': True + }) + + def _get_output_shape(self, img, short_edge_length) -> Tuple[int, int]: + """Compute the target image shape with the given `short_edge_length`. + + Args: + img (np.ndarray): The input image. + short_edge_length (Union[int, Tuple[int, int]]): The target short + edge length. If it's tuple, will select the min value as the + short edge length. 
+ """ + h, w = img.shape[:2] + if isinstance(short_edge_length, int): + size = short_edge_length * 1.0 + elif isinstance(short_edge_length, tuple): + size = min(short_edge_length) * 1.0 + scale = size / min(h, w) + if h < w: + new_h, new_w = size, scale * w + else: + new_h, new_w = scale * h, size + + if max(new_h, new_w) > self.max_size: + scale = self.max_size * 1.0 / max(new_h, new_w) + new_h *= scale + new_w *= scale + + new_h = int(new_h + 0.5) + new_w = int(new_w + 0.5) + return (new_w, new_h) + + def transform(self, results: Dict) -> Dict: + self.resize.scale = self._get_output_shape(results['img'], self.scale) + return self.resize(results) diff --git a/mmseg/datasets/voc.py b/mmseg/datasets/voc.py index 1defcbf6c..66f223078 100644 --- a/mmseg/datasets/voc.py +++ b/mmseg/datasets/voc.py @@ -24,10 +24,14 @@ class PascalVOCDataset(BaseSegDataset): [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]]) - def __init__(self, ann_file, **kwargs) -> None: + def __init__(self, + ann_file, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: super().__init__( - img_suffix='.jpg', - seg_map_suffix='.png', + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, ann_file=ann_file, **kwargs) assert self.file_client.exists( diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 4553f72d8..e5da71e72 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -11,6 +11,7 @@ from mmengine.model import BaseModule, ModuleList from mmengine.model.weight_init import (constant_init, kaiming_init, trunc_normal_) from mmengine.runner.checkpoint import _load_checkpoint +from scipy import interpolate from torch.nn.modules.batchnorm import _BatchNorm from torch.nn.modules.utils import _pair as to_2tuple @@ -18,11 +19,6 @@ from mmseg.registry import MODELS from ..utils import PatchEmbed from .vit import TransformerEncoderLayer as VisionTransformerEncoderLayer -try: - from scipy import interpolate -except ImportError: - interpolate = None - class BEiTAttention(BaseModule): """Window based multi-head self-attention (W-MSA) module with relative diff --git a/mmseg/models/backbones/swin.py b/mmseg/models/backbones/swin.py index 57ab99085..c0ace3c13 100644 --- a/mmseg/models/backbones/swin.py +++ b/mmseg/models/backbones/swin.py @@ -13,7 +13,7 @@ from mmengine.logging import print_log from mmengine.model import BaseModule, ModuleList from mmengine.model.weight_init import (constant_init, trunc_normal_, trunc_normal_init) -from mmengine.runner import CheckpointLoader, load_state_dict +from mmengine.runner import CheckpointLoader from mmengine.utils import to_2tuple from mmseg.registry import MODELS @@ -732,7 +732,7 @@ class SwinTransformer(BaseModule): nH2, L2).permute(1, 0).contiguous() # load state_dict - load_state_dict(self, state_dict, strict=False, logger=None) + self.load_state_dict(state_dict, strict=False) def forward(self, x): x, hw_shape = self.patch_embed(x) diff --git a/mmseg/models/data_preprocessor.py b/mmseg/models/data_preprocessor.py index 34087d0c0..deef365a9 100644 --- a/mmseg/models/data_preprocessor.py +++ b/mmseg/models/data_preprocessor.py @@ -48,18 +48,24 @@ class SegDataPreProcessor(BaseDataPreprocessor): rgb_to_bgr (bool): whether to convert image from RGB to RGB. Defaults to False. batch_augments (list[dict], optional): Batch-level augmentations + test_cfg (dict, optional): The padding size config in testing, if not + specify, will use `size` and `size_divisor` params as default. 
+ Defaults to None, only supports keys `size` or `size_divisor`. """ - def __init__(self, - mean: Sequence[Number] = None, - std: Sequence[Number] = None, - size: Optional[tuple] = None, - size_divisor: Optional[int] = None, - pad_val: Number = 0, - seg_pad_val: Number = 255, - bgr_to_rgb: bool = False, - rgb_to_bgr: bool = False, - batch_augments: Optional[List[dict]] = None): + def __init__( + self, + mean: Sequence[Number] = None, + std: Sequence[Number] = None, + size: Optional[tuple] = None, + size_divisor: Optional[int] = None, + pad_val: Number = 0, + seg_pad_val: Number = 255, + bgr_to_rgb: bool = False, + rgb_to_bgr: bool = False, + batch_augments: Optional[List[dict]] = None, + test_cfg: dict = None, + ): super().__init__() self.size = size self.size_divisor = size_divisor @@ -86,6 +92,9 @@ class SegDataPreProcessor(BaseDataPreprocessor): # TODO: support batch augmentations. self.batch_augments = batch_augments + # Support different padding methods in testing + self.test_cfg = test_cfg + def forward(self, data: dict, training: bool = False) -> Dict[str, Any]: """Perform normalization、padding and bgr2rgb conversion based on ``BaseDataPreprocessor``. @@ -122,10 +131,21 @@ class SegDataPreProcessor(BaseDataPreprocessor): if self.batch_augments is not None: inputs, data_samples = self.batch_augments( inputs, data_samples) - return dict(inputs=inputs, data_samples=data_samples) else: assert len(inputs) == 1, ( 'Batch inference is not support currently, ' 'as the image size might be different in a batch') - return dict( - inputs=torch.stack(inputs, dim=0), data_samples=data_samples) + # pad images when testing + if self.test_cfg: + inputs, padded_samples = stack_batch( + inputs=inputs, + size=self.test_cfg.get('size', None), + size_divisor=self.test_cfg.get('size_divisor', None), + pad_val=self.pad_val, + seg_pad_val=self.seg_pad_val) + for data_sample, pad_info in zip(data_samples, padded_samples): + data_sample.set_metainfo({**pad_info}) + else: + inputs = torch.stack(inputs, dim=0) + + return dict(inputs=inputs, data_samples=data_samples) diff --git a/mmseg/models/decode_heads/__init__.py b/mmseg/models/decode_heads/__init__.py index 8add7615c..b18152d7d 100644 --- a/mmseg/models/decode_heads/__init__.py +++ b/mmseg/models/decode_heads/__init__.py @@ -15,6 +15,8 @@ from .gc_head import GCHead from .isa_head import ISAHead from .knet_head import IterativeDecodeHead, KernelUpdateHead, KernelUpdator from .lraspp_head import LRASPPHead +from .mask2former_head import Mask2FormerHead +from .maskformer_head import MaskFormerHead from .nl_head import NLHead from .ocr_head import OCRHead from .point_head import PointHead @@ -36,5 +38,5 @@ __all__ = [ 'PointHead', 'APCHead', 'DMHead', 'LRASPPHead', 'SETRUPHead', 'SETRMLAHead', 'DPTHead', 'SETRMLAHead', 'SegmenterMaskTransformerHead', 'SegformerHead', 'ISAHead', 'STDCHead', 'IterativeDecodeHead', - 'KernelUpdateHead', 'KernelUpdator' + 'KernelUpdateHead', 'KernelUpdator', 'MaskFormerHead', 'Mask2FormerHead' ] diff --git a/mmseg/models/decode_heads/decode_head.py b/mmseg/models/decode_heads/decode_head.py index c7223f944..0803715f8 100644 --- a/mmseg/models/decode_heads/decode_head.py +++ b/mmseg/models/decode_heads/decode_head.py @@ -120,7 +120,7 @@ class BaseDecodeHead(BaseModule, metaclass=ABCMeta): warnings.warn('For binary segmentation, we suggest using' '`out_channels = 1` to define the output' 'channels of segmentor, and use `threshold`' - 'to convert seg_logist into a prediction' + 'to convert `seg_logits` into a prediction' 'applying a 
threshold') out_channels = num_classes diff --git a/mmseg/models/decode_heads/knet_head.py b/mmseg/models/decode_heads/knet_head.py index 181ae0216..82d3a2807 100644 --- a/mmseg/models/decode_heads/knet_head.py +++ b/mmseg/models/decode_heads/knet_head.py @@ -413,6 +413,9 @@ class IterativeDecodeHead(BaseDecodeHead): def __init__(self, num_stages, kernel_generate_head, kernel_update_head, **kwargs): + # ``IterativeDecodeHead`` would skip initialization of + # ``BaseDecodeHead`` which would be called when building + # ``self.kernel_generate_head``. super(BaseDecodeHead, self).__init__(**kwargs) assert num_stages == len(kernel_update_head) self.num_stages = num_stages @@ -422,6 +425,7 @@ class IterativeDecodeHead(BaseDecodeHead): self.num_classes = self.kernel_generate_head.num_classes self.input_transform = self.kernel_generate_head.input_transform self.ignore_index = self.kernel_generate_head.ignore_index + self.out_channels = self.num_classes for head_cfg in kernel_update_head: self.kernel_update_head.append(MODELS.build(head_cfg)) diff --git a/mmseg/models/decode_heads/mask2former_head.py b/mmseg/models/decode_heads/mask2former_head.py new file mode 100644 index 000000000..0ea742430 --- /dev/null +++ b/mmseg/models/decode_heads/mask2former_head.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +try: + from mmdet.models.dense_heads import \ + Mask2FormerHead as MMDET_Mask2FormerHead +except ModuleNotFoundError: + MMDET_Mask2FormerHead = None + +from mmengine.structures import InstanceData +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.structures.seg_data_sample import SegDataSample +from mmseg.utils import ConfigType, SampleList + + +@MODELS.register_module() +class Mask2FormerHead(MMDET_Mask2FormerHead): + """Implements the Mask2Former head. + + See `Mask2Former: Masked-attention Mask Transformer for Universal Image + Segmentation `_ for details. + + Args: + num_classes (int): Number of classes. Default: 150. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + ignore_index (int): The label index to be ignored. Default: 255. + """ + + def __init__(self, + num_classes, + align_corners=False, + ignore_index=255, + **kwargs): + super().__init__(**kwargs) + + self.num_classes = num_classes + self.align_corners = align_corners + self.out_channels = num_classes + self.ignore_index = ignore_index + + feat_channels = kwargs['feat_channels'] + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + + def _seg_data_to_instance_data(self, batch_data_samples: SampleList): + """Perform forward propagation to convert paradigm from MMSegmentation + to MMDetection to ensure ``MMDET_Mask2FormerHead`` could be called + normally. Specifically, ``batch_gt_instances`` would be added. + + Args: + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + + Returns: + tuple[Tensor]: A tuple contains two lists. + + - batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + unique ground truth label id of images, with + shape (num_gt, ) and ``masks``, each is ground truth + masks of each instances of a image, shape (num_gt, h, w). + - batch_img_metas (list[dict]): List of image meta information. 
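The new head above imports `Mask2FormerHead` from MMDetection inside a try/except, so the imported name silently becomes `None` when mmdet is missing and instantiation then fails with a less obvious error. A hedged sketch of an explicit availability check; the helper name is made up, `is_installed` comes from mmengine, and the version bound mirrors the `requirements/mminstall.txt` change later in this diff:

# Hedged sketch, not part of the patch: fail fast when mmdet is unavailable.
from mmengine.utils import is_installed


def require_mmdet() -> None:  # hypothetical helper
    """Raise a readable error if the MaskFormer/Mask2Former heads cannot be built."""
    if not is_installed('mmdet'):
        raise ImportError(
            'MaskFormerHead and Mask2FormerHead wrap their MMDetection '
            'counterparts; install it first, e.g. mim install "mmdet>=3.0.0rc4".')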
+ """ + batch_img_metas = [] + batch_gt_instances = [] + + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + gt_sem_seg = data_sample.gt_sem_seg.data + classes = torch.unique( + gt_sem_seg, + sorted=False, + return_inverse=False, + return_counts=False) + + # remove ignored region + gt_labels = classes[classes != self.ignore_index] + + masks = [] + for class_id in gt_labels: + masks.append(gt_sem_seg == class_id) + + if len(masks) == 0: + gt_masks = torch.zeros( + (0, gt_sem_seg.shape[-2], + gt_sem_seg.shape[-1])).to(gt_sem_seg).long() + else: + gt_masks = torch.stack(masks).squeeze(1).long() + + instance_data = InstanceData(labels=gt_labels, masks=gt_masks) + batch_gt_instances.append(instance_data) + return batch_gt_instances, batch_img_metas + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList, + train_cfg: ConfigType) -> dict: + """Perform forward propagation and loss calculation of the decoder head + on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + train_cfg (ConfigType): Training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + # batch SegDataSample to InstanceDataSample + batch_gt_instances, batch_img_metas = self._seg_data_to_instance_data( + batch_data_samples) + + # forward + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + + # loss + losses = self.loss_by_feat(all_cls_scores, all_mask_preds, + batch_gt_instances, batch_img_metas) + + return losses + + def predict(self, x: Tuple[Tensor], batch_img_metas: List[dict], + test_cfg: ConfigType) -> Tuple[Tensor]: + """Test without augmentaton. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_img_metas (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + test_cfg (ConfigType): Test config. + + Returns: + Tensor: A tensor of segmentation mask. + """ + batch_data_samples = [ + SegDataSample(metainfo=metainfo) for metainfo in batch_img_metas + ] + + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + mask_cls_results = all_cls_scores[-1] + mask_pred_results = all_mask_preds[-1] + if 'pad_shape' in batch_img_metas[0]: + size = batch_img_metas[0]['pad_shape'] + else: + size = batch_img_metas[0]['img_shape'] + # upsample mask + mask_pred_results = F.interpolate( + mask_pred_results, size=size, mode='bilinear', align_corners=False) + cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1] + mask_pred = mask_pred_results.sigmoid() + seg_logits = torch.einsum('bqc, bqhw->bchw', cls_score, mask_pred) + return seg_logits diff --git a/mmseg/models/decode_heads/maskformer_head.py b/mmseg/models/decode_heads/maskformer_head.py new file mode 100644 index 000000000..98ca92b99 --- /dev/null +++ b/mmseg/models/decode_heads/maskformer_head.py @@ -0,0 +1,173 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +try: + from mmdet.models.dense_heads import MaskFormerHead as MMDET_MaskFormerHead +except ModuleNotFoundError: + MMDET_MaskFormerHead = None + +from mmengine.structures import InstanceData +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.structures.seg_data_sample import SegDataSample +from mmseg.utils import ConfigType, SampleList + + +@MODELS.register_module() +class MaskFormerHead(MMDET_MaskFormerHead): + """Implements the MaskFormer head. + + See `Per-Pixel Classification is Not All You Need for Semantic Segmentation + `_ for details. + + Args: + num_classes (int): Number of classes. Default: 150. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + ignore_index (int): The label index to be ignored. Default: 255. + """ + + def __init__(self, + num_classes: int = 150, + align_corners: bool = False, + ignore_index: int = 255, + **kwargs) -> None: + super().__init__(**kwargs) + + self.out_channels = kwargs['out_channels'] + self.align_corners = True + self.num_classes = num_classes + self.align_corners = align_corners + self.out_channels = num_classes + self.ignore_index = ignore_index + + feat_channels = kwargs['feat_channels'] + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + + def _seg_data_to_instance_data(self, batch_data_samples: SampleList): + """Perform forward propagation to convert paradigm from MMSegmentation + to MMDetection to ensure ``MMDET_MaskFormerHead`` could be called + normally. Specifically, ``batch_gt_instances`` would be added. + + Args: + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + + Returns: + tuple[Tensor]: A tuple contains two lists. + + - batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + unique ground truth label id of images, with + shape (num_gt, ) and ``masks``, each is ground truth + masks of each instances of a image, shape (num_gt, h, w). + - batch_img_metas (list[dict]): List of image meta information. + """ + batch_img_metas = [] + batch_gt_instances = [] + for data_sample in batch_data_samples: + # Add `batch_input_shape` in metainfo of data_sample, which would + # be used in MaskFormerHead of MMDetection. + metainfo = data_sample.metainfo + metainfo['batch_input_shape'] = metainfo['img_shape'] + data_sample.set_metainfo(metainfo) + batch_img_metas.append(data_sample.metainfo) + gt_sem_seg = data_sample.gt_sem_seg.data + classes = torch.unique( + gt_sem_seg, + sorted=False, + return_inverse=False, + return_counts=False) + + # remove ignored region + gt_labels = classes[classes != self.ignore_index] + + masks = [] + for class_id in gt_labels: + masks.append(gt_sem_seg == class_id) + + if len(masks) == 0: + gt_masks = torch.zeros((0, gt_sem_seg.shape[-2], + gt_sem_seg.shape[-1])).to(gt_sem_seg) + else: + gt_masks = torch.stack(masks).squeeze(1) + + instance_data = InstanceData( + labels=gt_labels, masks=gt_masks.long()) + batch_gt_instances.append(instance_data) + return batch_gt_instances, batch_img_metas + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList, + train_cfg: ConfigType) -> dict: + """Perform forward propagation and loss calculation of the decoder head + on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. 
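`_seg_data_to_instance_data` above converts a dense semantic map into the per-instance format that MMDetection heads expect: one label and one binary mask per class present in the image. A tiny standalone illustration of that conversion on a toy tensor, using the default ignore index of 255:

# Hedged sketch of the gt conversion performed above, on a 4x4 toy map.
import torch

ignore_index = 255
gt_sem_seg = torch.tensor([[0, 0, 2, 2],
                           [0, 0, 2, 2],
                           [7, 7, 255, 255],
                           [7, 7, 255, 255]])

classes = torch.unique(gt_sem_seg)
gt_labels = classes[classes != ignore_index]          # tensor([0, 2, 7])
gt_masks = torch.stack([gt_sem_seg == c for c in gt_labels]).long()
assert gt_masks.shape == (3, 4, 4)                    # one binary mask per present class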
+ batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + train_cfg (ConfigType): Training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + # batch SegDataSample to InstanceDataSample + batch_gt_instances, batch_img_metas = self._seg_data_to_instance_data( + batch_data_samples) + + # forward + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + + # loss + losses = self.loss_by_feat(all_cls_scores, all_mask_preds, + batch_gt_instances, batch_img_metas) + + return losses + + def predict(self, x: Tuple[Tensor], batch_img_metas: List[dict], + test_cfg: ConfigType) -> Tuple[Tensor]: + """Test without augmentaton. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_img_metas (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + test_cfg (ConfigType): Test config. + + Returns: + Tensor: A tensor of segmentation mask. + """ + + batch_data_samples = [] + for metainfo in batch_img_metas: + metainfo['batch_input_shape'] = metainfo['img_shape'] + batch_data_samples.append(SegDataSample(metainfo=metainfo)) + # Forward function of MaskFormerHead from MMDetection needs + # 'batch_data_samples' as inputs, which is image shape actually. + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + mask_cls_results = all_cls_scores[-1] + mask_pred_results = all_mask_preds[-1] + + # upsample masks + img_shape = batch_img_metas[0]['batch_input_shape'] + mask_pred_results = F.interpolate( + mask_pred_results, + size=img_shape, + mode='bilinear', + align_corners=False) + + # semantic inference + cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1] + mask_pred = mask_pred_results.sigmoid() + seg_logits = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred) + return seg_logits diff --git a/mmseg/models/segmentors/base.py b/mmseg/models/segmentors/base.py index dfceddd99..1625addf6 100644 --- a/mmseg/models/segmentors/base.py +++ b/mmseg/models/segmentors/base.py @@ -159,12 +159,17 @@ class BaseSegmentor(BaseModel, metaclass=ABCMeta): if not only_prediction: img_meta = data_samples[i].metainfo # remove padding area - padding_left, padding_right, padding_top, padding_bottom = \ - img_meta.get('padding_size', [0]*4) + if 'img_padding_size' not in img_meta: + padding_size = img_meta.get('padding_size', [0] * 4) + else: + padding_size = img_meta['img_padding_size'] + padding_left, padding_right, padding_top, padding_bottom =\ + padding_size # i_seg_logits shape is 1, C, H, W after remove padding i_seg_logits = seg_logits[i:i + 1, :, padding_top:H - padding_bottom, padding_left:W - padding_right] + # resize as original shape i_seg_logits = resize( i_seg_logits, diff --git a/mmseg/utils/misc.py b/mmseg/utils/misc.py index 4413234fb..aa3089360 100644 --- a/mmseg/utils/misc.py +++ b/mmseg/utils/misc.py @@ -105,6 +105,9 @@ def stack_batch(inputs: List[torch.Tensor], }) padded_samples.append(data_sample) else: - padded_samples = None + padded_samples.append( + dict( + img_padding_size=padding_size, + pad_shape=pad_img.shape[-2:])) return torch.stack(padded_inputs, dim=0), padded_samples diff --git a/mmseg/version.py b/mmseg/version.py index 840dca669..f6ccff601 100644 --- a/mmseg/version.py +++ b/mmseg/version.py @@ -1,6 +1,6 @@ # Copyright (c) Open-MMLab. All rights reserved. 
-__version__ = '1.0.0rc1' +__version__ = '1.0.0rc2' def parse_version_info(version_str): diff --git a/model-index.yml b/model-index.yml index b087a7294..ae96bd30f 100644 --- a/model-index.yml +++ b/model-index.yml @@ -25,6 +25,8 @@ Import: - configs/isanet/isanet.yml - configs/knet/knet.yml - configs/mae/mae.yml +- configs/mask2former/mask2former.yml +- configs/maskformer/maskformer.yml - configs/mobilenet_v2/mobilenet_v2.yml - configs/mobilenet_v3/mobilenet_v3.yml - configs/nonlocal_net/nonlocal_net.yml diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt index ef11eb677..d27af8dd0 100644 --- a/requirements/mminstall.txt +++ b/requirements/mminstall.txt @@ -1,3 +1,4 @@ mmcls>=1.0.0rc0 -mmcv>=2.0.0rc1,<2.1.0 +mmcv>=2.0.0rc3,<2.1.0 +mmdet>=3.0.0rc4 mmengine>=0.1.0,<1.0.0 diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 82b392e74..3e242581e 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,5 +1,5 @@ matplotlib -mmcls>=1.0.0rc0 numpy packaging prettytable +scipy diff --git a/setup.py b/setup.py index 79ecaa54b..854dd1860 100755 --- a/setup.py +++ b/setup.py @@ -192,7 +192,6 @@ if __name__ == '__main__': extras_require={ 'all': parse_requirements('requirements.txt'), 'tests': parse_requirements('requirements/tests.txt'), - 'build': parse_requirements('requirements/build.txt'), 'optional': parse_requirements('requirements/optional.txt'), 'mim': parse_requirements('requirements/mminstall.txt'), }, diff --git a/tests/test_datasets/test_transform.py b/tests/test_datasets/test_transform.py index bd9c05ac4..0833ac183 100644 --- a/tests/test_datasets/test_transform.py +++ b/tests/test_datasets/test_transform.py @@ -10,6 +10,9 @@ from PIL import Image from mmseg.datasets.transforms import * # noqa from mmseg.datasets.transforms import PhotoMetricDistortion, RandomCrop from mmseg.registry import TRANSFORMS +from mmseg.utils import register_all_modules + +register_all_modules() def test_resize(): @@ -71,6 +74,34 @@ def test_resize(): resized_results = resize_module(results.copy()) assert max(resized_results['img_shape'][:2]) <= 1333 * 1.1 + # test RandomChoiceResize, which `resize_type` is `ResizeShortestEdge` + transform = dict( + type='RandomChoiceResize', + scales=[128, 256, 512], + resize_type='ResizeShortestEdge', + max_size=1333) + resize_module = TRANSFORMS.build(transform) + resized_results = resize_module(results.copy()) + assert resized_results['img_shape'][0] in [128, 256, 512] + + transform = dict( + type='RandomChoiceResize', + scales=[512], + resize_type='ResizeShortestEdge', + max_size=512) + resize_module = TRANSFORMS.build(transform) + resized_results = resize_module(results.copy()) + assert resized_results['img_shape'][1] == 512 + + transform = dict( + type='RandomChoiceResize', + scales=[(128, 256), (256, 512), (512, 1024)], + resize_type='ResizeShortestEdge', + max_size=1333) + resize_module = TRANSFORMS.build(transform) + resized_results = resize_module(results.copy()) + assert resized_results['img_shape'][0] in [128, 256, 512] + # test scale=None and scale_factor is tuple. 
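The tests above drive the new `ResizeShortestEdge` through `RandomChoiceResize` via `resize_type`. A hedged sketch of the same wiring in a training pipeline; with a chosen scale of 512 and a generous `max_size`, a 480x640 image is rescaled so its short edge becomes 512 and its long edge about 683, which matches the shapes used in the MaskFormer head test later in this diff:

# Hedged sketch: multi-scale "shortest edge" resizing in a train pipeline.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(
        type='RandomChoiceResize',
        scales=[int(512 * x * 0.1) for x in range(5, 21)],  # 256 ... 1024, illustrative
        resize_type='ResizeShortestEdge',
        max_size=2048),
    dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PackSegInputs')
]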
# img shape: (288, 512, 3) transform = dict( diff --git a/tests/test_models/test_backbones/test_beit.py b/tests/test_models/test_backbones/test_beit.py index cf3960894..59a12c5d0 100644 --- a/tests/test_models/test_backbones/test_beit.py +++ b/tests/test_models/test_backbones/test_beit.py @@ -140,8 +140,11 @@ def test_beit_init(): } } model = BEiT(img_size=(512, 512)) - with pytest.raises(AttributeError): - model.resize_rel_pos_embed(ckpt) + # If scipy is installed, this AttributeError would not be raised. + from mmengine.utils import is_installed + if not is_installed('scipy'): + with pytest.raises(AttributeError): + model.resize_rel_pos_embed(ckpt) # pretrained=None # init_cfg=123, whose type is unsupported diff --git a/tests/test_models/test_backbones/test_mae.py b/tests/test_models/test_backbones/test_mae.py index 562d067a7..16f52b54b 100644 --- a/tests/test_models/test_backbones/test_mae.py +++ b/tests/test_models/test_backbones/test_mae.py @@ -138,8 +138,11 @@ def test_mae_init(): } } model = MAE(img_size=(512, 512)) - with pytest.raises(AttributeError): - model.resize_rel_pos_embed(ckpt) + # If scipy is installed, this AttributeError would not be raised. + from mmengine.utils import is_installed + if not is_installed('scipy'): + with pytest.raises(AttributeError): + model.resize_rel_pos_embed(ckpt) # test resize abs pos embed ckpt = model.resize_abs_pos_embed(ckpt['state_dict']) diff --git a/tests/test_models/test_data_preprocessor.py b/tests/test_models/test_data_preprocessor.py index 6b2903ff3..d05eef1c7 100644 --- a/tests/test_models/test_data_preprocessor.py +++ b/tests/test_models/test_data_preprocessor.py @@ -46,3 +46,19 @@ class TestSegDataPreProcessor(TestCase): out = processor(data, training=True) self.assertEqual(out['inputs'].shape, (2, 3, 20, 20)) self.assertEqual(len(out['data_samples']), 2) + + # test predict with padding + processor = SegDataPreProcessor( + mean=[0, 0, 0], + std=[1, 1, 1], + size=(20, 20), + test_cfg=dict(size_divisor=15)) + data = { + 'inputs': [ + torch.randint(0, 256, (3, 11, 10)), + ], + 'data_samples': [data_sample] + } + out = processor(data, training=False) + self.assertEqual(out['inputs'].shape[2] % 15, 0) + self.assertEqual(out['inputs'].shape[3] % 15, 0) diff --git a/tests/test_models/test_heads/test_mask2former_head.py b/tests/test_models/test_heads/test_mask2former_head.py new file mode 100644 index 000000000..079e94ed9 --- /dev/null +++ b/tests/test_models/test_heads/test_mask2former_head.py @@ -0,0 +1,160 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
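The data preprocessor test above exercises the new `test_cfg` option: at inference time the single input image is padded according to `test_cfg` (`size` or `size_divisor`) rather than the training `size`. A hedged sketch of wiring this into a model config; the mean/std values are the usual ImageNet statistics, shown only for completeness:

# Hedged sketch: pad test images to a multiple of 32 while training crops stay 512x512.
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255,
    size=(512, 512),                   # used when training=True
    test_cfg=dict(size_divisor=32))    # used when training=False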
+import torch +from mmengine import Config +from mmengine.structures import PixelData + +from mmseg.models.decode_heads import Mask2FormerHead +from mmseg.structures import SegDataSample +from mmseg.utils import SampleList +from .utils import to_cuda + + +def test_mask2former_head(): + num_classes = 19 + cfg = dict( + in_channels=[96, 192, 384, 768], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type='mmdet.MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( + type='mmdet.DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='mmdet.BaseTransformerLayer', + attn_cfgs=dict( + type='mmdet.MultiScaleDeformableAttention', + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=False, + norm_cfg=None, + init_cfg=None), + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'ffn', 'norm')), + init_cfg=None), + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', + num_feats=128, + normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='mmdet.SinePositionalEncoding', num_feats=128, + normalize=True), + transformer_decoder=dict( + type='mmdet.DetrTransformerDecoder', + return_intermediate=True, + num_layers=9, + transformerlayers=dict( + type='mmdet.DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='mmdet.MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('cross_attn', 'norm', 'self_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))) + cfg = Config(cfg) + head = Mask2FormerHead(**cfg) + + inputs = [ + torch.rand((2, 96, 8, 8)), + torch.rand((2, 192, 4, 4)), + torch.rand((2, 384, 2, 2)), + torch.rand((2, 768, 1, 1)) + ] + + data_samples: SampleList = [] + for i in range(2): + data_sample = SegDataSample() + img_meta = {} + img_meta['img_shape'] = (32, 32) + img_meta['ori_shape'] = (32, 32) + data_sample.gt_sem_seg = PixelData( + data=torch.randint(0, num_classes, (1, 32, 32))) + data_sample.set_metainfo(img_meta) + data_samples.append(data_sample) + + if 
torch.cuda.is_available(): + head, inputs = to_cuda(head, inputs) + for data_sample in data_samples: + data_sample.gt_sem_seg.data = data_sample.gt_sem_seg.data.cuda() + + loss_dict = head.loss(inputs, data_samples, None) + assert isinstance(loss_dict, dict) + + batch_img_metas = [] + for data_sample in data_samples: + batch_img_metas.append(data_sample.metainfo) + + seg_logits = head.predict(inputs, batch_img_metas, None) + assert seg_logits.shape == torch.Size((2, num_classes, 32, 32)) diff --git a/tests/test_models/test_heads/test_maskformer_head.py b/tests/test_models/test_heads/test_maskformer_head.py new file mode 100644 index 000000000..fe4bf96fe --- /dev/null +++ b/tests/test_models/test_heads/test_maskformer_head.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from os.path import dirname, join + +import torch +from mmengine import Config +from mmengine.structures import PixelData + +from mmseg.registry import MODELS +from mmseg.structures import SegDataSample +from mmseg.utils import register_all_modules + + +def test_maskformer_head(): + register_all_modules() + repo_dpath = dirname(dirname(__file__)) + cfg = Config.fromfile( + join( + repo_dpath, + '../../configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py' # noqa + )) + cfg.model.train_cfg = None + decode_head = MODELS.build(cfg.model.decode_head) + inputs = (torch.randn(1, 256, 32, 32), torch.randn(1, 512, 16, 16), + torch.randn(1, 1024, 8, 8), torch.randn(1, 2048, 4, 4)) + # test inference + batch_img_metas = [ + dict( + scale_factor=(1.0, 1.0), + img_shape=(512, 683), + ori_shape=(512, 683)) + ] + test_cfg = dict(mode='whole') + output = decode_head.predict(inputs, batch_img_metas, test_cfg) + assert output.shape == (1, 150, 512, 683) + + # test training + inputs = (torch.randn(2, 256, 32, 32), torch.randn(2, 512, 16, 16), + torch.randn(2, 1024, 8, 8), torch.randn(2, 2048, 4, 4)) + batch_data_samples = [] + img_meta = { + 'img_shape': (512, 512), + 'ori_shape': (480, 640), + 'pad_shape': (512, 512), + 'scale_factor': (1.425, 1.425), + } + for _ in range(2): + data_sample = SegDataSample( + gt_sem_seg=PixelData(data=torch.ones(512, 512).long())) + data_sample.set_metainfo(img_meta) + batch_data_samples.append(data_sample) + train_cfg = {} + losses = decode_head.loss(inputs, batch_data_samples, train_cfg) + assert (loss in losses.keys() + for loss in ('loss_cls', 'loss_mask', 'loss_dice')) diff --git a/tests/test_models/test_segmentors/test_encoder_decoder.py b/tests/test_models/test_segmentors/test_encoder_decoder.py index 81f89db41..5795f513d 100644 --- a/tests/test_models/test_segmentors/test_encoder_decoder.py +++ b/tests/test_models/test_segmentors/test_encoder_decoder.py @@ -1,8 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
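One small caveat in the MaskFormer head test above: `assert (loss in losses.keys() for loss in (...))` asserts a generator object, which is always truthy, so the loss keys are not actually verified. Should a real check be wanted, a stricter formulation would be:

# Hedged suggestion, not part of the patch: actually verify the expected loss keys.
assert all(key in losses for key in ('loss_cls', 'loss_mask', 'loss_dice'))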
- +import torch from mmengine import ConfigDict +from mmengine.structures import PixelData from mmseg.models import build_segmentor +from mmseg.structures import SegDataSample from .utils import _segmentor_forward_train_test @@ -57,3 +59,42 @@ def test_encoder_decoder(): cfg.test_cfg = ConfigDict(mode='whole') segmentor = build_segmentor(cfg) _segmentor_forward_train_test(segmentor) + + +def test_postprocess_result(): + cfg = ConfigDict( + type='EncoderDecoder', + backbone=dict(type='ExampleBackbone'), + decode_head=dict(type='ExampleDecodeHead'), + train_cfg=None, + test_cfg=dict(mode='whole')) + model = build_segmentor(cfg) + + # test postprocess + data_sample = SegDataSample() + data_sample.gt_sem_seg = PixelData( + **{'data': torch.randint(0, 10, (1, 8, 8))}) + data_sample.set_metainfo({ + 'padding_size': (0, 2, 0, 2), + 'ori_shape': (8, 8) + }) + seg_logits = torch.zeros((1, 2, 10, 10)) + seg_logits[:, :, :8, :8] = 1 + data_samples = [data_sample] + + outputs = model.postprocess_result(seg_logits, data_samples) + assert outputs[0].seg_logits.data.shape == torch.Size((2, 8, 8)) + assert torch.allclose(outputs[0].seg_logits.data, torch.ones((2, 8, 8))) + + data_sample = SegDataSample() + data_sample.gt_sem_seg = PixelData( + **{'data': torch.randint(0, 10, (1, 8, 8))}) + data_sample.set_metainfo({ + 'img_padding_size': (0, 2, 0, 2), + 'ori_shape': (8, 8) + }) + + data_samples = [data_sample] + outputs = model.postprocess_result(seg_logits, data_samples) + assert outputs[0].seg_logits.data.shape == torch.Size((2, 8, 8)) + assert torch.allclose(outputs[0].seg_logits.data, torch.ones((2, 8, 8)))
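Taken together, these changes let the new mask-classification heads run through the standard inference path: `stack_batch` now records `img_padding_size` and `pad_shape` for test-time padding, and `postprocess_result` crops the padding away again, as the tests above verify. A hedged end-to-end sketch with the MaskFormer config referenced earlier in this diff; the checkpoint path is hypothetical, and `init_model`/`inference_model` are the usual mmseg 1.x high-level APIs:

# Hedged sketch: running the new MaskFormer model through mmseg's inference APIs.
from mmseg.apis import inference_model, init_model
from mmseg.utils import register_all_modules

register_all_modules()  # registers the datasets, transforms and new heads
config = 'configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'
checkpoint = 'work_dirs/maskformer_r50_ade20k/latest.pth'  # hypothetical path

model = init_model(config, checkpoint, device='cuda:0')
result = inference_model(model, 'demo/demo.png')           # demo image from the repo
print(result.pred_sem_seg.data.shape)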