Merge pull request #2385 from open-mmlab/dev-1.x

Merge the MMSegmentation 1.x development branch dev-1.x into the main branch 1.x for v1.0.0rc2
pull/2604/head v1.0.0rc2
Miao Zheng 2022-12-06 17:11:02 +08:00 committed by GitHub
commit 8a611e122d
78 changed files with 3088 additions and 184 deletions


@ -61,8 +61,9 @@ jobs:
command: |
pip install git+https://github.com/open-mmlab/mmengine.git@main
pip install -U openmim
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
pip install -r requirements/tests.txt -r requirements/optional.txt
- run:
name: Build and install
@ -96,18 +97,20 @@ jobs:
command: |
git clone -b main --depth 1 https://github.com/open-mmlab/mmengine.git /home/circleci/mmengine
git clone -b dev-1.x --depth 1 https://github.com/open-mmlab/mmclassification.git /home/circleci/mmclassification
git clone -b dev-3.x --depth 1 https://github.com/open-mmlab/mmdetection.git /home/circleci/mmdetection
- run:
name: Build Docker image
command: |
docker build .circleci/docker -t mmseg:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >>
docker run --gpus all -t -d -v /home/circleci/project:/mmseg -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmclassification:/mmclassification -w /mmseg --name mmseg mmseg:gpu
docker run --gpus all -t -d -v /home/circleci/project:/mmseg -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmclassification:/mmclassification -v /home/circleci/mmdetection:/mmdetection -w /mmseg --name mmseg mmseg:gpu
- run:
name: Install mmseg dependencies
command: |
docker exec mmseg pip install -e /mmengine
docker exec mmseg pip install -U openmim
docker exec mmseg mim install 'mmcv>=2.0.0rc1'
docker exec mmseg mim install 'mmcv>=2.0.0rc3'
docker exec mmseg pip install -e /mmclassification
docker exec mmseg pip install -e /mmdetection
docker exec mmseg pip install -r requirements/tests.txt -r requirements/optional.txt
- run:
name: Build and install


@ -20,11 +20,7 @@ jobs:
python -m pip install pre-commit
pre-commit install
- name: Linting
run: |
sudo apt-add-repository ppa:brightbox/ruby-ng -y
sudo apt-get update
sudo apt-get install -y ruby2.7
pre-commit run --all-files
run: pre-commit run --all-files
- name: Check docstring coverage
run: |
python -m pip install interrogate


@ -44,8 +44,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install
@ -92,8 +93,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install
@ -155,8 +157,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install
@ -187,8 +190,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install


@ -40,8 +40,9 @@ jobs:
run: |
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install
@ -92,8 +93,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install
@ -124,8 +126,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install

.gitignore

@ -105,6 +105,7 @@ venv.bak/
# mypy
.mypy_cache/
data
.vscode
.idea


@ -1,5 +1,5 @@
repos:
- repo: https://gitlab.com/pycqa/flake8.git
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8


@ -62,11 +62,10 @@ The 1.x branch works with **PyTorch 1.6+**.
## What's New
v1.0.0rc1 was released on 2/11/2022.
v1.0.0rc2 was released on 6/12/2022.
Please refer to [changelog.md](docs/en/notes/changelog.md) for details and release history.
- Support PoolFormer ([#2191](https://github.com/open-mmlab/mmsegmentation/pull/2191))
- Add Decathlon dataset ([#2227](https://github.com/open-mmlab/mmsegmentation/pull/2227))
- Support MaskFormer and Mask2Former ([#2215](https://github.com/open-mmlab/mmsegmentation/pull/2215), [2255](https://github.com/open-mmlab/mmsegmentation/pull/2255))
## Installation
@ -139,6 +138,8 @@ Supported methods:
- [x] [Segmenter (ICCV'2021)](configs/segmenter)
- [x] [SegFormer (NeurIPS'2021)](configs/segformer)
- [x] [K-Net (NeurIPS'2021)](configs/knet)
- [x] [MaskFormer (NeurIPS'2021)](configs/maskformer)
- [x] [Mask2Former (CVPR'2022)](configs/mask2former)
Supported datasets:
@ -194,6 +195,7 @@ This project is released under the [Apache 2.0 license](LICENSE).
- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models
- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
- [MMEval](https://github.com/open-mmlab/mmeval): A unified evaluation library for multiple machine learning libraries.
- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.


@ -61,7 +61,7 @@ MMSegmentation is an open source semantic segmentation toolbox based on PyTorch. It is a part of the O
## Changelog
The latest version, v1.0.0rc1, was released on 2022.11.2.
The latest version, v1.0.0rc2, was released on 2022.12.6.
Please refer to the [changelog](docs/en/notes/changelog.md) for more details on version updates and release history.
## Installation
@ -134,6 +134,8 @@ MMSegmentation is an open source semantic segmentation toolbox based on PyTorch. It is a part of the O
- [x] [Segmenter (ICCV'2021)](configs/segmenter)
- [x] [SegFormer (NeurIPS'2021)](configs/segformer)
- [x] [K-Net (NeurIPS'2021)](configs/knet)
- [x] [MaskFormer (NeurIPS'2021)](configs/maskformer)
- [x] [Mask2Former (CVPR'2022)](configs/mask2former)
Supported datasets:
@ -186,6 +188,7 @@ MMSegmentation is an open source project jointly contributed to by researchers and engineers from various colleges and companies
- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models
- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision
- [MIM](https://github.com/open-mmlab/mim): MIM is the unified entry point for OpenMMLab projects, algorithms, and models
- [MMEval](https://github.com/open-mmlab/mmeval): A unified and open evaluation library across machine learning frameworks
- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab object detection toolbox
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection


@ -0,0 +1,72 @@
# Mask2Former
[Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)
## Introduction
<!-- [ALGORITHM] -->
<a href="https://github.com/facebookresearch/Mask2Former">Official Repo</a>
<a href="https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py">Code Snippet</a>
## Abstract
<!-- [ABSTRACT] -->
Image segmentation is about grouping pixels with different semantics, e.g., category or instance membership, where each choice of semantics defines a task. While only the semantics of each task differ, current research focuses on designing specialized architectures for each task. We present Masked-attention Mask Transformer (Mask2Former), a new architecture capable of addressing any image segmentation task (panoptic, instance or semantic). Its key components include masked attention, which extracts localized features by constraining cross-attention within predicted mask regions. In addition to reducing the research effort by at least three times, it outperforms the best specialized architectures by a significant margin on four popular datasets. Most notably, Mask2Former sets a new state-of-the-art for panoptic segmentation (57.8 PQ on COCO), instance segmentation (50.1 AP on COCO) and semantic segmentation (57.7 mIoU on ADE20K).
```bibtex
@inproceedings{cheng2021mask2former,
title={Masked-attention Mask Transformer for Universal Image Segmentation},
author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar},
journal={CVPR},
year={2022}
}
@inproceedings{cheng2021maskformer,
title={Per-Pixel Classification is Not All You Need for Semantic Segmentation},
author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov},
journal={NeurIPS},
year={2021}
}
```
### Usage
- The Mask2Former model requires [MMDetection](https://github.com/open-mmlab/mmdetection) to be installed first.
```shell
pip install "mmdet>=3.0.0rc4"
```
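For readers who want to double-check the setup, below is a minimal, hypothetical inference sketch rather than an official snippet from this PR. It assumes the `mmseg.apis.init_model`/`inference_model` helpers of the 1.x API, the `demo/demo.png` image shipped with the repository, and the R-50 Cityscapes config and checkpoint listed in the table below.

```python
# Minimal sketch: run a released Mask2Former checkpoint on the demo image.
from mmseg.apis import inference_model, init_model

config_file = 'configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py'
checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth'

# Building the model imports `mmdet.models` through the config's `custom_imports`,
# so mmdet components such as `mmdet.MSDeformAttnPixelDecoder` resolve from the registry.
model = init_model(config_file, checkpoint_file, device='cuda:0')
result = inference_model(model, 'demo/demo.png')
print(result.pred_sem_seg)  # predicted per-pixel class indices
```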
## Results and models
### Cityscapes
| Method | Backbone | Crop Size | Lr schd | Mem (MB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
| ----------- | -------------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -----------------------------------------------------------------------------------------------------------------------------------------------------------: | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Mask2Former | R-50-D32 | 512x1024 | 90000 | 5806 | 9.17 | 80.44 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802.json) |
| Mask2Former | R-101-D32 | 512x1024 | 90000 | 6971 | 7.11 | 80.80 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-8ad528ea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628.json)) |
| Mask2Former | Swin-T | 512x1024 | 90000 | 6511 | 7.18 | 81.71 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-290b34af.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501.json)) |
| Mask2Former | Swin-S | 512x1024 | 90000 | 8282 | 5.57 | 82.57 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-7c98854a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802.json)) |
| Mask2Former | Swin-B (in22k) | 512x1024 | 90000 | 11152 | 4.32 | 83.52 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-59a4379a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030.json)) |
| Mask2Former | Swin-L (in22k) | 512x1024 | 90000 | 16207 | 2.86 | 83.65 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-dc2c2ddd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901.json)) |
### ADE20K
| Method | Backbone | Crop Size | Lr schd | Mem (MB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
| ----------- | -------------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Mask2Former | R-50-D32 | 512x512 | 160000 | 3385 | 26.59 | 47.87 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-4c62652d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055.json)) |
| Mask2Former | R-101-D32 | 512x512 | 160000 | 4190 | 22.97 | 48.60 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b1169bc0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905.json)) |
| Mask2Former | Swin-T | 512x512 | 160000 | 3826 | 23.82 | 48.66 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-4341520b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230.json)) |
| Mask2Former | Swin-S | 512x512 | 160000 | 5034 | 19.69 | 51.24 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-ab263c11.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905.json)) |
| Mask2Former | Swin-B | 640x640 | 160000 | 5795 | 12.48 | 52.44 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-35e3a2c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118.json)) |
| Mask2Former | Swin-B (in22k) | 640x640 | 160000 | 5795 | 12.43 | 53.90 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-622e093b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230.json)) |
| Mask2Former | Swin-L (in22k) | 640x640 | 160000 | 9077 | 8.81 | 56.01 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-5cc76a78.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933.json)) |
Note:
- All Mask2Former experiments were trained on 8 A100 GPUs with 2 samples per GPU.
- As mentioned in [the official repo](https://github.com/facebookresearch/Mask2Former/issues/5), the results of Mask2Former are relatively unstable; the Mask2Former (Swin-S) result on the ADE20K dataset in the table is the median of 5 training runs, following the authors' suggestion.
- The ResNet backbones used in these Mask2Former models are the standard `ResNet` rather than `ResNetV1c`; see the sketch after this list.
- Test-time augmentation is not supported in MMSegmentation 1.x yet; we will add the "ms+flip" results as soon as possible.
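The sketch below illustrates the backbone note above. The first dict mirrors the backbone section of the configs added in this PR; the `ResNetV1c` dict is an illustrative counterpart with assumed values, not code taken from this PR.

```python
# Backbone used by the Mask2Former configs in this PR: a plain torchvision-style
# ResNet stem (a single 7x7 conv), matching the `torchvision://resnet50` weights.
standard_resnet = dict(
    type='ResNet',
    depth=50,
    deep_stem=False,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'))

# Most other MMSegmentation baselines use ResNetV1c instead, whose stem replaces
# the 7x7 conv with three stacked 3x3 convs (illustrative values only).
resnet_v1c = dict(
    type='ResNetV1c',
    depth=50,
    init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnet50_v1c'))
```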


@ -0,0 +1,290 @@
Collections:
- Name: Mask2Former
Metadata:
Training Data:
- Cityscapes
- ADE20K
Paper:
URL: https://arxiv.org/abs/2112.01527
Title: Masked-attention Mask Transformer for Universal Image Segmentation
README: configs/mask2former/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py
Version: 3.x
Converted From:
Code: https://github.com/facebookresearch/Mask2Former
Models:
- Name: mask2former_r50_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: R-50-D32
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 109.05
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 5806.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 80.44
Config: configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth
- Name: mask2former_r101_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: R-101-D32
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 140.65
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 6971.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 80.8
Config: configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-8ad528ea.pth
- Name: mask2former_swin-t_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: Swin-T
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 139.28
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 6511.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 81.71
Config: configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-290b34af.pth
- Name: mask2former_swin-s_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: Swin-S
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 179.53
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 8282.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 82.57
Config: configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-7c98854a.pth
- Name: mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: Swin-B (in22k)
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 231.48
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 11152.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 83.52
Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-59a4379a.pth
- Name: mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: Swin-L (in22k)
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 349.65
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 16207.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 83.65
Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-dc2c2ddd.pth
- Name: mask2former_r50_8xb2-160k_ade20k-512x512
In Collection: Mask2Former
Metadata:
backbone: R-50-D32
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 37.61
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 3385.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 47.87
Config: configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-4c62652d.pth
- Name: mask2former_r101_8xb2-160k_ade20k-512x512
In Collection: Mask2Former
Metadata:
backbone: R-101-D32
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 43.54
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 4190.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 48.6
Config: configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b1169bc0.pth
- Name: mask2former_swin-t_8xb2-160k_ade20k-512x512
In Collection: Mask2Former
Metadata:
backbone: Swin-T
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 41.98
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 3826.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 48.66
Config: configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-4341520b.pth
- Name: mask2former_swin-s_8xb2-160k_ade20k-512x512
In Collection: Mask2Former
Metadata:
backbone: Swin-S
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 50.79
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 5034.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 51.24
Config: configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-ab263c11.pth
- Name: mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640
In Collection: Mask2Former
Metadata:
backbone: Swin-B
crop size: (640,640)
lr schd: 160000
inference time (ms/im):
- value: 80.13
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (640,640)
Training Memory (GB): 5795.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 52.44
Config: configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-35e3a2c7.pth
- Name: mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640
In Collection: Mask2Former
Metadata:
backbone: Swin-B (in22k)
crop size: (640,640)
lr schd: 160000
inference time (ms/im):
- value: 80.45
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (640,640)
Training Memory (GB): 5795.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 53.9
Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-622e093b.pth
- Name: mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640
In Collection: Mask2Former
Metadata:
backbone: Swin-L (in22k)
crop size: (640,640)
lr schd: 160000
inference time (ms/im):
- value: 113.51
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (640,640)
Training Memory (GB): 9077.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 56.01
Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-5cc76a78.pth
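Since the metafile above is plain YAML, it can also be consumed programmatically. The snippet below is a hypothetical helper, not part of this PR; it assumes PyYAML is installed and that the file lives at `configs/mask2former/mask2former.yml`.

```python
# List every Mask2Former entry in the metafile with its dataset, mIoU and weights URL.
import yaml

with open('configs/mask2former/mask2former.yml') as f:
    metafile = yaml.safe_load(f)

for model in metafile['Models']:
    result = model['Results'][0]
    print(f"{model['Name']}: {result['Dataset']} mIoU={result['Metrics']['mIoU']}")
    print(f"  weights: {model['Weights']}")
```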


@ -0,0 +1,7 @@
_base_ = ['./mask2former_r50_8xb2-160k_ade20k-512x512.py']
model = dict(
backbone=dict(
depth=101,
init_cfg=dict(type='Pretrained',
checkpoint='torchvision://resnet101')))


@ -0,0 +1,7 @@
_base_ = ['./mask2former_r50_8xb2-90k_cityscapes-512x1024.py']
model = dict(
backbone=dict(
depth=101,
init_cfg=dict(type='Pretrained',
checkpoint='torchvision://resnet101')))


@ -0,0 +1,207 @@
_base_ = ['../_base_/default_runtime.py', '../_base_/datasets/ade20k.py']
custom_imports = dict(imports='mmdet.models', allow_failed_imports=False)
crop_size = (512, 512)
data_preprocessor = dict(
type='SegDataPreProcessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_val=0,
seg_pad_val=255,
size=crop_size,
test_cfg=dict(size_divisor=32))
num_classes = 150
model = dict(
type='EncoderDecoder',
data_preprocessor=data_preprocessor,
backbone=dict(
type='ResNet',
depth=50,
deep_stem=False,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
norm_cfg=dict(type='SyncBN', requires_grad=False),
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
decode_head=dict(
type='Mask2FormerHead',
in_channels=[256, 512, 1024, 2048],
strides=[4, 8, 16, 32],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
num_transformer_feat_level=3,
align_corners=False,
pixel_decoder=dict(
type='mmdet.MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='mmdet.DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='mmdet.BaseTransformerLayer',
attn_cfgs=dict(
type='mmdet.MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='mmdet.SinePositionalEncoding',
num_feats=128,
normalize=True),
init_cfg=None),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='mmdet.SinePositionalEncoding', num_feats=128,
normalize=True),
transformer_decoder=dict(
type='mmdet.DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='mmdet.DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='mmdet.MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='mean',
loss_weight=5.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=5.0),
train_cfg=dict(
num_points=12544,
oversample_ratio=3.0,
importance_sample_ratio=0.75,
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=2.0),
dict(
type='mmdet.CrossEntropyLossCost',
weight=5.0,
use_sigmoid=True),
dict(
type='mmdet.DiceCost',
weight=5.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler'))),
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset config
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', reduce_zero_label=True),
dict(
type='RandomChoiceResize',
scales=[int(512 * x * 0.1) for x in range(5, 21)],
resize_type='ResizeShortestEdge',
max_size=2048),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='PackSegInputs')
]
train_dataloader = dict(batch_size=2, dataset=dict(pipeline=train_pipeline))
# optimizer
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
optimizer = dict(
type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999))
optim_wrapper = dict(
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(
custom_keys={
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi,
},
norm_decay_mult=0.0))
# learning policy
param_scheduler = [
dict(
type='PolyLR',
eta_min=0,
power=0.9,
begin=0,
end=160000,
by_epoch=False)
]
# training schedule for 160k
train_cfg = dict(
type='IterBasedTrainLoop', max_iters=160000, val_interval=5000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
timer=dict(type='IterTimerHook'),
logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
param_scheduler=dict(type='ParamSchedulerHook'),
checkpoint=dict(
type='CheckpointHook', by_epoch=False, interval=5000,
save_best='mIoU'),
sampler_seed=dict(type='DistSamplerSeedHook'),
visualization=dict(type='SegVisualizationHook'))
# Default setting for scaling LR automatically
# - `enable` controls whether the LR is scaled automatically
#   (disabled by default).
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)
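The `auto_scale_lr` setting at the end of this config follows the linear scaling rule used by the OpenMMLab 1.x runners: when enabled, the learning rate is multiplied by the ratio of the actual batch size to `base_batch_size`. The sketch below only illustrates that arithmetic under this assumption; it is not part of the config.

```python
# Linear LR scaling relative to base_batch_size = (8 GPUs) x (2 samples per GPU) = 16.
def scale_lr(base_lr: float, num_gpus: int, samples_per_gpu: int,
             base_batch_size: int = 16) -> float:
    """Scale the learning rate linearly with the effective batch size."""
    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size

print(scale_lr(0.0001, num_gpus=8, samples_per_gpu=2))  # 0.0001 (the base setup)
print(scale_lr(0.0001, num_gpus=4, samples_per_gpu=2))  # 5e-05  (half the GPUs, half the LR)
```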


@ -0,0 +1,206 @@
_base_ = ['../_base_/default_runtime.py', '../_base_/datasets/cityscapes.py']
custom_imports = dict(imports='mmdet.models', allow_failed_imports=False)
crop_size = (512, 1024)
data_preprocessor = dict(
type='SegDataPreProcessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_val=0,
seg_pad_val=255,
size=crop_size,
test_cfg=dict(size_divisor=32))
num_classes = 19
model = dict(
type='EncoderDecoder',
data_preprocessor=data_preprocessor,
backbone=dict(
type='ResNet',
depth=50,
deep_stem=False,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
norm_cfg=dict(type='SyncBN', requires_grad=False),
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
decode_head=dict(
type='Mask2FormerHead',
in_channels=[256, 512, 1024, 2048],
strides=[4, 8, 16, 32],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
num_transformer_feat_level=3,
align_corners=False,
pixel_decoder=dict(
type='mmdet.MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='mmdet.DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='mmdet.BaseTransformerLayer',
attn_cfgs=dict(
type='mmdet.MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='mmdet.SinePositionalEncoding',
num_feats=128,
normalize=True),
init_cfg=None),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='mmdet.SinePositionalEncoding', num_feats=128,
normalize=True),
transformer_decoder=dict(
type='mmdet.DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='mmdet.DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='mmdet.MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='mean',
loss_weight=5.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=5.0),
train_cfg=dict(
num_points=12544,
oversample_ratio=3.0,
importance_sample_ratio=0.75,
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=2.0),
dict(
type='mmdet.CrossEntropyLossCost',
weight=5.0,
use_sigmoid=True),
dict(
type='mmdet.DiceCost',
weight=5.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler'))),
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset config
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(
type='RandomChoiceResize',
scales=[int(1024 * x * 0.1) for x in range(5, 21)],
resize_type='ResizeShortestEdge',
max_size=4096),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='PackSegInputs')
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
# optimizer
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
optimizer = dict(
type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999))
optim_wrapper = dict(
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(
custom_keys={
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi,
},
norm_decay_mult=0.0))
# learning policy
param_scheduler = [
dict(
type='PolyLR',
eta_min=0,
power=0.9,
begin=0,
end=90000,
by_epoch=False)
]
# training schedule for 90k
train_cfg = dict(type='IterBasedTrainLoop', max_iters=90000, val_interval=5000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
timer=dict(type='IterTimerHook'),
logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
param_scheduler=dict(type='ParamSchedulerHook'),
checkpoint=dict(
type='CheckpointHook', by_epoch=False, interval=5000,
save_best='mIoU'),
sampler_seed=dict(type='DistSamplerSeedHook'),
visualization=dict(type='SegVisualizationHook'))
# Default setting for scaling LR automatically
# - `enable` controls whether the LR is scaled automatically
#   (disabled by default).
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)


@ -0,0 +1,237 @@
_base_ = [
'../_base_/default_runtime.py', '../_base_/datasets/ade20k_640x640.py'
]
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_20220317-55b0104a.pth' # noqa
custom_imports = dict(imports='mmdet.models', allow_failed_imports=False)
crop_size = (640, 640)
data_preprocessor = dict(
type='SegDataPreProcessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_val=0,
seg_pad_val=255,
size=crop_size)
num_classes = 150
depths = [2, 2, 18, 2]
model = dict(
type='EncoderDecoder',
data_preprocessor=data_preprocessor,
backbone=dict(
type='SwinTransformer',
pretrain_img_size=384,
embed_dims=128,
depths=depths,
num_heads=[4, 8, 16, 32],
window_size=12,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.3,
patch_norm=True,
out_indices=(0, 1, 2, 3),
with_cp=False,
frozen_stages=-1,
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(
type='Mask2FormerHead',
in_channels=[128, 256, 512, 1024],
strides=[4, 8, 16, 32],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
num_transformer_feat_level=3,
align_corners=False,
pixel_decoder=dict(
type='mmdet.MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='mmdet.DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='mmdet.BaseTransformerLayer',
attn_cfgs=dict(
type='mmdet.MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='mmdet.SinePositionalEncoding',
num_feats=128,
normalize=True),
init_cfg=None),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='mmdet.SinePositionalEncoding', num_feats=128,
normalize=True),
transformer_decoder=dict(
type='mmdet.DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='mmdet.DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='mmdet.MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='mean',
loss_weight=5.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=5.0),
train_cfg=dict(
num_points=12544,
oversample_ratio=3.0,
importance_sample_ratio=0.75,
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=2.0),
dict(
type='mmdet.CrossEntropyLossCost',
weight=5.0,
use_sigmoid=True),
dict(
type='mmdet.DiceCost',
weight=5.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler'))),
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset config
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', reduce_zero_label=True),
dict(
type='RandomChoiceResize',
scales=[int(x * 0.1 * 640) for x in range(5, 21)],
resize_type='ResizeShortestEdge',
max_size=2560),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='PackSegInputs')
]
train_dataloader = dict(batch_size=2, dataset=dict(pipeline=train_pipeline))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optimizer = dict(
type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999))
optim_wrapper = dict(
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
# learning policy
param_scheduler = [
dict(
type='PolyLR',
eta_min=0,
power=0.9,
begin=0,
end=160000,
by_epoch=False)
]
# training schedule for 160k
train_cfg = dict(
type='IterBasedTrainLoop', max_iters=160000, val_interval=5000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
timer=dict(type='IterTimerHook'),
logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
param_scheduler=dict(type='ParamSchedulerHook'),
checkpoint=dict(
type='CheckpointHook', by_epoch=False, interval=5000,
save_best='mIoU'),
sampler_seed=dict(type='DistSamplerSeedHook'),
visualization=dict(type='SegVisualizationHook'))
# Default setting for scaling LR automatically
# - `enable` controls whether the LR is scaled automatically
#   (disabled by default).
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)


@ -0,0 +1,5 @@
_base_ = ['./mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_22k_20220317-e5c09f74.pth' # noqa
model = dict(
backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained)))


@ -0,0 +1,42 @@
_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_22k_20220317-e5c09f74.pth' # noqa
depths = [2, 2, 18, 2]
model = dict(
backbone=dict(
pretrain_img_size=384,
embed_dims=128,
depths=depths,
num_heads=[4, 8, 16, 32],
window_size=12,
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(in_channels=[128, 256, 512, 1024]))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))


@ -0,0 +1,9 @@
_base_ = ['./mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window12_384_22k_20220412-6580f57d.pth' # noqa
model = dict(
backbone=dict(
embed_dims=192,
num_heads=[6, 12, 24, 48],
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(num_queries=100, in_channels=[192, 384, 768, 1536]))


@ -0,0 +1,42 @@
_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window12_384_22k_20220412-6580f57d.pth' # noqa
depths = [2, 2, 18, 2]
model = dict(
backbone=dict(
pretrain_img_size=384,
embed_dims=192,
depths=depths,
num_heads=[6, 12, 24, 48],
window_size=12,
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(in_channels=[192, 384, 768, 1536]))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))


@ -0,0 +1,37 @@
_base_ = ['./mask2former_swin-t_8xb2-160k_ade20k-512x512.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa
depths = [2, 2, 18, 2]
model = dict(
backbone=dict(
depths=depths, init_cfg=dict(type='Pretrained',
checkpoint=pretrained)))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))


@ -0,0 +1,37 @@
_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa
depths = [2, 2, 18, 2]
model = dict(
backbone=dict(
depths=depths, init_cfg=dict(type='Pretrained',
checkpoint=pretrained)))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))


@ -0,0 +1,52 @@
_base_ = ['./mask2former_r50_8xb2-160k_ade20k-512x512.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa
depths = [2, 2, 6, 2]
model = dict(
backbone=dict(
_delete_=True,
type='SwinTransformer',
embed_dims=96,
depths=depths,
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.3,
patch_norm=True,
out_indices=(0, 1, 2, 3),
with_cp=False,
frozen_stages=-1,
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(in_channels=[96, 192, 384, 768]))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))


@ -0,0 +1,52 @@
_base_ = ['./mask2former_r50_8xb2-90k_cityscapes-512x1024.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa
depths = [2, 2, 6, 2]
model = dict(
backbone=dict(
_delete_=True,
type='SwinTransformer',
embed_dims=96,
depths=depths,
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.3,
patch_norm=True,
out_indices=(0, 1, 2, 3),
with_cp=False,
frozen_stages=-1,
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(in_channels=[96, 192, 384, 768]))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))

View File

@ -0,0 +1,60 @@
# MaskFormer
[MaskFormer: Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278)
## Introduction
<!-- [ALGORITHM] -->
<a href="https://github.com/facebookresearch/MaskFormer/">Official Repo</a>
<a href="https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/dense_heads/maskformer_head.py#L21">Code Snippet</a>
## Abstract
<!-- [ABSTRACT] -->
Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/24582831/199215459-ea507126-aafe-4823-8eb1-ae6487509d5c.png" width="90%"/>
</div>
```bibtex
@article{cheng2021per,
title={Per-pixel classification is not all you need for semantic segmentation},
author={Cheng, Bowen and Schwing, Alex and Kirillov, Alexander},
journal={Advances in Neural Information Processing Systems},
volume={34},
pages={17864--17875},
year={2021}
}
```
### Usage
- The MaskFormer model requires [MMDetection](https://github.com/open-mmlab/mmdetection) to be installed first.
```shell
pip install "mmdet>=3.0.0rc4"
```
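A quick optional sanity check (a suggested snippet, not something the configs require) is to confirm that a compatible MMDetection is importable:
```python
# Verify that MMDetection is installed and new enough for the MaskFormer head.
import mmdet
print(mmdet.__version__)  # expect a version >= 3.0.0rc4
```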
## Results and models
### ADE20K
| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
| ---------- | --------- | --------- | ------- | -------- | -------------- | ----- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| MaskFormer | R-50-D32 | 512x512 | 160000 | 3.29 | 42.20 | 44.29 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-cbd39cc1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724.json) |
| MaskFormer | R-101-D32 | 512x512 | 160000 | 4.12 | 34.90 | 45.11 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-c8e0931d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053.json) |
| MaskFormer | Swin-T | 512x512 | 160000 | 3.73 | 40.53 | 46.69 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-03550716.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813.json) |
| MaskFormer | Swin-S | 512x512 | 160000 | 5.33 | 26.98 | 49.36 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-5ab67e58.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710.json) |
Note:
- All experiments of MaskFormer are implemented with 8 V100 (32G) GPUs with 2 samples per GPU.
- The results of MaskFormer are not very stable: the accuracy (mIoU) of the model with `R-101-D32` ranges from 44.7 to 46.0, and with `Swin-S` from 49.0 to 49.8.
- The ResNet backbones utilized in MaskFormer models are standard `ResNet` rather than `ResNetV1c`.
- Test-time augmentation is not supported in MMSegmentation 1.x yet; we will add "ms+flip" results as soon as possible.

View File

@ -0,0 +1,101 @@
Collections:
- Name: MaskFormer
Metadata:
Training Data:
- Usage
- ADE20K
Paper:
URL: https://arxiv.org/abs/2107.06278
Title: 'MaskFormer: Per-Pixel Classification is Not All You Need for Semantic
Segmentation'
README: configs/maskformer/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/dense_heads/maskformer_head.py#L21
Version: dev-3.x
Converted From:
Code: https://github.com/facebookresearch/MaskFormer/
Models:
- Name: maskformer_r50-d32_8xb2-160k_ade20k-512x512
In Collection: MaskFormer
Metadata:
backbone: R-50-D32
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 23.7
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 3.29
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 44.29
Config: configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-cbd39cc1.pth
- Name: maskformer_r101-d32_8xb2-160k_ade20k-512x512
In Collection: MaskFormer
Metadata:
backbone: R-101-D32
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 28.65
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 4.12
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 45.11
Config: configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-c8e0931d.pth
- Name: maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512
In Collection: MaskFormer
Metadata:
backbone: Swin-T
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 24.67
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 3.73
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 46.69
Config: configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-03550716.pth
- Name: maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512
In Collection: MaskFormer
Metadata:
backbone: Swin-S
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 37.06
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 5.33
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 49.36
Config: configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-5ab67e58.pth

View File

@ -0,0 +1,7 @@
_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'
model = dict(
backbone=dict(
depth=101,
init_cfg=dict(type='Pretrained',
checkpoint='torchvision://resnet101')))

View File

@ -0,0 +1,143 @@
_base_ = [
'../_base_/datasets/ade20k.py', '../_base_/default_runtime.py',
'../_base_/schedules/schedule_160k.py'
]
norm_cfg = dict(type='SyncBN', requires_grad=True)
crop_size = (512, 512)
data_preprocessor = dict(
type='SegDataPreProcessor',
size=crop_size,
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_val=0,
seg_pad_val=255)
# model_cfg
num_classes = 150
model = dict(
type='EncoderDecoder',
data_preprocessor=data_preprocessor,
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
dilations=(1, 1, 1, 1),
strides=(1, 2, 2, 2),
norm_cfg=norm_cfg,
norm_eval=True,
style='pytorch',
contract_dilation=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
decode_head=dict(
type='MaskFormerHead',
in_channels=[256, 512, 1024,
2048], # input channels of pixel_decoder modules
feat_channels=256,
in_index=[0, 1, 2, 3],
num_classes=150,
out_channels=256,
num_queries=100,
pixel_decoder=dict(
type='mmdet.PixelDecoder',
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU')),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='mmdet.SinePositionalEncoding', num_feats=128,
normalize=True),
transformer_decoder=dict(
type='mmdet.DetrTransformerDecoder',
return_intermediate=True,
num_layers=6,
transformerlayers=dict(
type='mmdet.DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='mmdet.MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.1,
proj_drop=0.1,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.1,
dropout_layer=None,
add_identity=True),
# the following parameter was not used,
# just make current api happy
feedforward_channels=2048,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
reduction='mean',
loss_weight=20.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=1.0),
train_cfg=dict(
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=1.0),
dict(
type='mmdet.FocalLossCost',
weight=20.0,
binary_input=True),
dict(
type='mmdet.DiceCost',
weight=1.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler'))),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'),
)
# optimizer
optimizer = dict(
type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.0001)
optim_wrapper = dict(
_delete_=True,
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(custom_keys={
'backbone': dict(lr_mult=0.1),
}))
# learning policy
param_scheduler = [
dict(
type='PolyLR',
eta_min=0,
power=0.9,
begin=0,
end=160000,
by_epoch=False)
]
# In MaskFormer implementation we use batch size 2 per GPU as default
train_dataloader = dict(batch_size=2, num_workers=2)
val_dataloader = dict(batch_size=1, num_workers=4)
test_dataloader = val_dataloader

View File

@ -0,0 +1,79 @@
checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa
_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'
backbone_norm_cfg = dict(type='LN', requires_grad=True)
depths = [2, 2, 18, 2]
model = dict(
backbone=dict(
_delete_=True,
type='SwinTransformer',
pretrain_img_size=224,
embed_dims=96,
patch_size=4,
window_size=7,
mlp_ratio=4,
depths=depths,
num_heads=[3, 6, 12, 24],
strides=(4, 2, 2, 2),
out_indices=(0, 1, 2, 3),
qkv_bias=True,
qk_scale=None,
patch_norm=True,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.3,
use_abs_pos_embed=False,
act_cfg=dict(type='GELU'),
norm_cfg=backbone_norm_cfg,
init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)),
decode_head=dict(
type='MaskFormerHead',
in_channels=[96, 192, 384,
768], # input channels of pixel_decoder modules
))
# optimizer
optimizer = dict(
type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01)
# set all layers in backbone to lr_mult=1.0
# set all norm layers, position_embedding,
# query_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=1.0, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
embed_multi = dict(decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
_delete_=True,
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(custom_keys=custom_keys))
# learning policy
param_scheduler = [
dict(
type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
dict(
type='PolyLR',
eta_min=0.0,
power=1.0,
begin=1500,
end=160000,
by_epoch=False,
)
]

View File

@ -0,0 +1,81 @@
_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'
checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa
backbone_norm_cfg = dict(type='LN', requires_grad=True)
depths = [2, 2, 6, 2]
model = dict(
backbone=dict(
_delete_=True,
type='SwinTransformer',
pretrain_img_size=224,
embed_dims=96,
patch_size=4,
window_size=7,
mlp_ratio=4,
depths=depths,
num_heads=[3, 6, 12, 24],
strides=(4, 2, 2, 2),
out_indices=(0, 1, 2, 3),
qkv_bias=True,
qk_scale=None,
patch_norm=True,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.3,
use_abs_pos_embed=False,
act_cfg=dict(type='GELU'),
norm_cfg=backbone_norm_cfg,
init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)),
decode_head=dict(
type='MaskFormerHead',
in_channels=[96, 192, 384,
768], # input channels of pixel_decoder modules
))
# optimizer
optimizer = dict(
type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01)
# set all layers in backbone to lr_mult=1.0
# set all norm layers, position_embedding,
# query_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=1.0, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
embed_multi = dict(decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
_delete_=True,
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(custom_keys=custom_keys))
# learning policy
param_scheduler = [
dict(
type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
dict(
type='PolyLR',
eta_min=0.0,
power=1.0,
begin=1500,
end=160000,
by_epoch=False,
)
]

View File

@ -33,7 +33,7 @@
"## Install MMSegmentation\n",
"This step may take several minutes. \n",
"\n",
"We use PyTorch 1.10 and CUDA 11.1 for this tutorial. You may install other versions by change the version number in pip install command. "
"We use PyTorch 1.12 and CUDA 11.3 for this tutorial. You may install other versions by change the version number in pip install command. "
]
},
{
@ -67,13 +67,13 @@
"outputs": [],
"source": [
"# Install PyTorch\n",
"!conda install pytorch=1.10.0 torchvision cudatoolkit=11.1 -c pytorch\n",
"!conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch\n",
"# Install mim\n",
"!pip install -U openmim\n",
"# Install mmengine\n",
"!mim install mmengine\n",
"# Install MMCV\n",
"!mim install 'mmcv >= 2.0.0rc1'"
"!mim install 'mmcv >= 2.0.0rc1'\n"
]
},
{
@ -500,16 +500,17 @@
},
"outputs": [],
"source": [
"from mmseg.apis import inference_model, show_result_pyplot\n",
"from mmseg.apis import init_model, inference_model, show_result_pyplot\n",
"\n",
"model=runner.model\n",
"model.cfg=cfg\n",
"# Init the model from the config and the checkpoint\n",
"checkpoint_path = './work_dirs/tutorial/iter_200.pth'\n",
"model = init_model(cfg, checkpoint_path, 'cuda:0')\n",
"\n",
"img = mmcv.imread('iccv09Data/images/6000124.jpg')\n",
"result = inference_model(model, img)\n",
"plt.figure(figsize=(8, 6))\n",
"vis_result = show_result_pyplot(model, img, result, palette)\n",
"plt.imshow(mmcv.bgr2rgb(vis_result))"
"vis_result = show_result_pyplot(model, img, result)\n",
"plt.imshow(mmcv.bgr2rgb(vis_result))\n"
]
}
],
@ -522,7 +523,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3.7.13 ('pt1.12')",
"display_name": "Python 3.8.5 ('tensorflow')",
"language": "python",
"name": "python3"
},
@ -536,7 +537,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
"version": "3.8.5"
},
"pycharm": {
"stem_cell": {
@ -549,7 +550,7 @@
},
"vscode": {
"interpreter": {
"hash": "ffdb7915c29738c259ec7ee5d0d1b9253c264f1fd267d45dd77f1a420396c120"
"hash": "20d4b83e0c8b3730b580c42434163d64f4b735d580303a8fade7c849d4d29eba"
}
}
},

View File

@ -1,7 +1,7 @@
ARG PYTORCH="1.11.0"
ARG CUDA="11.3"
ARG CUDNN="8"
ARG MMCV="2.0.0rc1"
ARG MMCV="2.0.0rc3"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

View File

@ -3,8 +3,8 @@ ARG CUDA="11.3"
ARG CUDNN="8"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
ARG MMCV="2.0.0rc1"
ARG MMSEG="1.0.0rc1"
ARG MMCV="2.0.0rc3"
ARG MMSEG="1.0.0rc2"
ENV PYTHONUNBUFFERED TRUE

View File

@ -1,5 +1,13 @@
# Data Transforms
In this tutorial, we introduce the design of transforms pipeline in MMSegmentation.
The structure of this guide is as follows:
- [Data Transforms](#data-transforms)
- [Design of Data pipelines](#design-of-data-pipelines)
- [Customization data transformation](#customization-data-transformation)
## Design of Data pipelines
Following typical conventions, we use `Dataset` and `DataLoader` for data loading
@ -10,13 +18,31 @@ we introduce a new `DataContainer` type in MMCV to help collect and distribute
data of different size.
See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details.
The data preparation pipeline and the dataset is decomposed. Usually a dataset
In the 1.x version of MMSegmentation, all data transformations inherit from [`BaseTransform`](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/transforms/base.py#L6).
The input and output types of transformations are both dict. A simple example is as follows:
```python
>>> from mmseg.datasets.transforms import LoadAnnotations
>>> transforms = LoadAnnotations()
>>> img_path = './data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png'
>>> gt_path = './data/cityscapes/gtFine/train/aachen/aachen_000015_000019_gtFine_instanceTrainIds.png'
>>> results = dict(
>>> img_path=img_path,
>>> seg_map_path=gt_path,
>>> reduce_zero_label=False,
>>> seg_fields=[])
>>> data_dict = transforms(results)
>>> print(data_dict.keys())
dict_keys(['img_path', 'seg_map_path', 'reduce_zero_label', 'seg_fields', 'gt_seg_map'])
```
The data preparation pipeline and the dataset are decomposed. Usually a dataset
defines how to process the annotations and a data pipeline defines all the steps to prepare a data dict.
A pipeline consists of a sequence of operations. Each operation takes a dict as input and also output a dict for the next transform.
A pipeline consists of a sequence of operations. Each operation takes a dict as input and also outputs a dict for the next transform.
The operations are categorized into data loading, pre-processing, formatting and test-time augmentation.
Here is an pipeline example for PSPNet.
Here is a pipeline example for PSPNet.
```python
crop_size = (512, 1024)
@ -37,53 +63,110 @@ test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='Resize', scale=(2048, 1024), keep_ratio=True),
# add loading annotation after ``Resize`` because ground truth
# does not need to do resize data transform
# does not need to resize data transform
dict(type='LoadAnnotations'),
dict(type='PackSegInputs')
]
```
For each operation, we list the related dict fields that are added/updated/removed.
Before pipelines, the information we can directly obtain from the datasets are img_path, seg_map_path.
For each operation, we list the related dict fields that are `added`/`updated`/`removed`.
Before pipelines, the information we can directly obtain from the datasets are `img_path` and `seg_map_path`.
### Data loading
`LoadImageFromFile`
`LoadImageFromFile`: Load an image from file.
- add: img, img_shape, ori_shape
- add: `img`, `img_shape`, `ori_shape`
`LoadAnnotations`
`LoadAnnotations`: Load semantic segmentation maps provided by dataset.
- add: seg_fields, gt_seg_map
- add: `seg_fields`, `gt_seg_map`
### Pre-processing
`RandomResize`
`RandomResize`: Random resize image & segmentation map.
- add: scale, scale_factor, keep_ratio
- update: img, img_shape, gt_seg_map
- add: `scale`, `scale_factor`, `keep_ratio`
- update: `img`, `img_shape`, `gt_seg_map`
`Resize`
`Resize`: Resize image & segmentation map.
- add: scale, scale_factor, keep_ratio
- update: img, gt_seg_map, img_shape
- add: `scale`, `scale_factor`, `keep_ratio`
- update: `img`, `gt_seg_map`, `img_shape`
`RandomCrop`
`RandomCrop`: Random crop image & segmentation map.
- update: img, pad_shape, gt_seg_map
- update: `img`, `gt_seg_map`, `img_shape`.
`RandomFlip`
`RandomFlip`: Flip the image & segmentation map.
- add: flip, flip_direction
- update: img, gt_seg_map
- add: `flip`, `flip_direction`
- update: `img`, `gt_seg_map`
`PhotoMetricDistortion`
`PhotoMetricDistortion`: Apply photometric distortion to image sequentially,
every transformation is applied with a probability of 0.5.
The position of random contrast is in the second or second-to-last position (mode 0 or 1 below, respectively).
- update: img
```
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
```
- update: `img`
### Formatting
`PackSegInputs`
`PackSegInputs`: Pack the inputs data for the semantic segmentation.
- add: inputs, data_sample
- add: `inputs`, `data_sample`
- remove: keys specified by `meta_keys` (merged into the metainfo of data_sample), all other keys
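To see these key changes end to end, here is a minimal sketch (the Cityscapes paths are illustrative) that chains the transforms above by hand and inspects what `PackSegInputs` finally produces:
```python
from mmcv.transforms import LoadImageFromFile
from mmseg.datasets.transforms import LoadAnnotations, PackSegInputs

results = dict(
    img_path='data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png',
    seg_map_path='data/cityscapes/gtFine/train/aachen/aachen_000000_000019_gtFine_labelTrainIds.png',
    reduce_zero_label=False,
    seg_fields=[])
# apply the transforms one after another, as a pipeline would
for transform in [LoadImageFromFile(), LoadAnnotations(), PackSegInputs()]:
    results = transform(results)
print(results.keys())  # dict_keys(['inputs', 'data_samples'])
```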
## Customization data transformation
The customized data transformation must inherit from `BaseTransform` and implement the `transform` function.
Here we use a simple flipping transformation as an example:
```python
import random
import mmcv
from mmcv.transforms import BaseTransform, TRANSFORMS
@TRANSFORMS.register_module()
class MyFlip(BaseTransform):
def __init__(self, direction: str):
super().__init__()
self.direction = direction
def transform(self, results: dict) -> dict:
img = results['img']
results['img'] = mmcv.imflip(img, direction=self.direction)
return results
```
Thus, we can instantiate a `MyFlip` object and use it to process the data dict.
```python
import numpy as np
transform = MyFlip(direction='horizontal')
data_dict = {'img': np.random.rand(224, 224, 3)}
data_dict = transform(data_dict)
processed_img = data_dict['img']
```
Or, we can use the `MyFlip` transformation in the data pipeline in our config file.
```python
pipeline = [
...
dict(type='MyFlip', direction='horizontal'),
...
]
```
Note that if you want to use `MyFlip` in a config file, you must ensure the file containing `MyFlip` is imported at runtime, for example via `custom_imports` as in the sketch below.
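A minimal sketch of that import hook, assuming `MyFlip` lives in a hypothetical module `my_project.my_flip` (the module path is an assumption, not part of the original doc):
```python
# Add to the config file so MMEngine imports the module (and thereby registers
# MyFlip in the TRANSFORMS registry) before the pipeline is built.
custom_imports = dict(imports=['my_project.my_flip'], allow_failed_imports=False)
```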

View File

@ -1,5 +1,47 @@
# Changelog of v1.x
## v1.0.0rc2(6/12/2022)
### Highlights
- Support MaskFormer ([#2215](https://github.com/open-mmlab/mmsegmentation/pull/2215))
- Support Mask2Former ([#2255](https://github.com/open-mmlab/mmsegmentation/pull/2255))
### Features
- Add ResizeShortestEdge transform ([#2339](https://github.com/open-mmlab/mmsegmentation/pull/2339))
- Support padding in data pre-processor for model testing ([#2290](https://github.com/open-mmlab/mmsegmentation/pull/2290))
- Fix the problem of post-processing not removing padding ([#2367](https://github.com/open-mmlab/mmsegmentation/pull/2367))
### Bug fix
- Fix links in README ([#2024](https://github.com/open-mmlab/mmsegmentation/pull/2024))
- Fix swin load state_dict ([#2304](https://github.com/open-mmlab/mmsegmentation/pull/2304))
- Fix typo of BaseSegDataset docstring ([#2322](https://github.com/open-mmlab/mmsegmentation/pull/2322))
- Fix the bug in the visualization step ([#2326](https://github.com/open-mmlab/mmsegmentation/pull/2326))
- Fix ignore class id from -1 to 255 in BaseSegDataset ([#2332](https://github.com/open-mmlab/mmsegmentation/pull/2332))
- Fix KNet IterativeDecodeHead bug ([#2334](https://github.com/open-mmlab/mmsegmentation/pull/2334))
- Add input argument for datasets ([#2379](https://github.com/open-mmlab/mmsegmentation/pull/2379))
- Fix typo in warning on binary classification ([#2382](https://github.com/open-mmlab/mmsegmentation/pull/2382))
### Enhancement
- Fix ci for 1.x ([#2011](https://github.com/open-mmlab/mmsegmentation/pull/2011), [#2019](https://github.com/open-mmlab/mmsegmentation/pull/2019))
- Fix lint and pre-commit hook ([#2308](https://github.com/open-mmlab/mmsegmentation/pull/2308))
- Add `data` string in .gitignore file in dev-1.x branch ([#2336](https://github.com/open-mmlab/mmsegmentation/pull/2336))
- Make scipy as a default dependency in runtime ([#2362](https://github.com/open-mmlab/mmsegmentation/pull/2362))
- Delete mmcls in runtime.txt ([#2368](https://github.com/open-mmlab/mmsegmentation/pull/2368))
### Documentation
- Update configuration documentation ([#2048](https://github.com/open-mmlab/mmsegmentation/pull/2048))
- Update inference documentation ([#2052](https://github.com/open-mmlab/mmsegmentation/pull/2052))
- Update train test documentation ([#2061](https://github.com/open-mmlab/mmsegmentation/pull/2061))
- Update get started documentation ([#2148](https://github.com/open-mmlab/mmsegmentation/pull/2148))
- Update transforms documentation ([#2088](https://github.com/open-mmlab/mmsegmentation/pull/2088))
- Add MMEval projects like in README ([#2259](https://github.com/open-mmlab/mmsegmentation/pull/2259))
- Translate the visualization.md ([#2298](https://github.com/open-mmlab/mmsegmentation/pull/2298))
## v1.0.0rc1 (2/11/2022)
### Highlights

View File

@ -6,31 +6,32 @@ We list some common troubles faced by many users and their corresponding solutio
The compatible MMSegmentation and MMCV versions are as below. Please install the correct version of MMCV to avoid installation issues.
| MMSegmentation version | MMCV version | MMClassification version |
| :--------------------: | :-------------------------: | :----------------------: |
| 1.0.0rc1 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 |
| 1.0.0rc0 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 |
| master | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 |
| 0.24.1 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 |
| 0.23.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 |
| 0.22.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 |
| 0.21.1 | mmcv-full>=1.4.4, \<=1.6.0 | Not required |
| 0.20.2 | mmcv-full>=1.3.13, \<=1.6.0 | Not required |
| 0.19.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required |
| 0.18.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required |
| 0.17.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required |
| 0.16.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required |
| 0.15.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required |
| 0.14.1 | mmcv-full>=1.3.7, \<1.3.17 | Not required |
| 0.14.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required |
| 0.13.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required |
| 0.12.0 | mmcv-full>=1.1.4, \<1.3.2 | Not required |
| 0.11.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required |
| 0.10.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required |
| 0.9.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required |
| 0.8.0 | mmcv-full>=1.1.4, \<1.2.0 | Not required |
| 0.7.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required |
| 0.6.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required |
| MMSegmentation version | MMCV version | MMClassification (optional) version | MMDetection (optional) version |
| :--------------------: | :-------------------------: | :---------------------------------: | :----------------------------: |
| 1.0.0rc2 | mmcv >= 2.0.0rc3 | mmcls>=1.0.0rc0 | mmdet>=3.0.0rc4 |
| 1.0.0rc1 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 | Not required |
| 1.0.0rc0 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 | Not required |
| master | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required |
| 0.24.1 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required |
| 0.23.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required |
| 0.22.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required |
| 0.21.1 | mmcv-full>=1.4.4, \<=1.6.0 | Not required | Not required |
| 0.20.2 | mmcv-full>=1.3.13, \<=1.6.0 | Not required | Not required |
| 0.19.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | Not required |
| 0.18.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | Not required |
| 0.17.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required |
| 0.16.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required |
| 0.15.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required |
| 0.14.1 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required |
| 0.14.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | Not required |
| 0.13.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | Not required |
| 0.12.0 | mmcv-full>=1.1.4, \<1.3.2 | Not required | Not required |
| 0.11.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | Not required |
| 0.10.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | Not required |
| 0.9.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | Not required |
| 0.8.0 | mmcv-full>=1.1.4, \<1.2.0 | Not required | Not required |
| 0.7.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | Not required |
| 0.6.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | Not required |
## How to know the number of GPUs needed to train the model

View File

@ -1 +1,173 @@
# Visualization
MMSegmentation 1.x provides a convenient way to monitor the training status and to visualize data during model prediction.
## Training status monitoring
MMSegmentation 1.x uses TensorBoard to monitor the training status.
### TensorBoard configuration
Install TensorBoard following the [official installation guide](https://www.tensorflow.org/install); the concrete steps are as follows:
```shell
pip install tensorboardX
pip install future tensorboard
```
Add `TensorboardVisBackend` to `vis_backends` in the config file `default_runtime.py`:
```python
vis_backends = [dict(type='LocalVisBackend'),
                dict(type='TensorboardVisBackend')]
visualizer = dict(
    type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer')
```
### Inspecting scalars in TensorBoard
Launch a training experiment with the following command:
```shell
python tools/train.py configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py --work-dir work_dir/test_visual
```
After training starts, find the `vis_data` path inside `work_dir`; for example, the vis_data path of this particular run is as follows:
```shell
work_dirs/test_visual/20220810_115248/vis_data
```
The scalar files in the vis_data path include the learning rate, losses, data_time and so on, and also record the metric results. You can refer to the [logging tutorial](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/logging.html) in MMEngine for help with logging your own custom data. The TensorBoard visualization is launched with the following command:
```shell
tensorboard --logdir work_dirs/test_visual/20220810_115248/vis_data
```
## Data and results visualization
### Visualizing data samples during model testing or validation
MMSegmentation provides `SegVisualizationHook`, a [hook](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/hook.html) that can visualize the ground truth and the predicted segmentation results during model testing and validation. Its configuration lives in `default_hooks`; please see the [Runner tutorial](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/runner.html) for more details.
For example, in `_base_/schedules/schedule_20k.py`, modify the `SegVisualizationHook` configuration and set `draw` to `True` to enable storing the network inference results. `interval` indicates the sampling interval of the prediction results; when it is set to 1, every inference result of the network will be saved. `interval` defaults to 50:
```python
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook', draw=True, interval=1))
```
After launching a training experiment, the visualization results are stored in a local folder during the validation loop, and when evaluating a model on a dataset, the prediction results are stored locally as well. The locally stored visualization results are kept in `vis_image` under `$WORK_DIRS/vis_data`, e.g.:
```shell
work_dirs/test_visual/20220810_115248/vis_data/vis_image
```
In addition, if `TensorboardVisBackend` is added to `vis_backends`, as in [TensorBoard configuration](#tensorboard-configuration), we can also run the following command to view the results in TensorBoard:
```shell
tensorboard --logdir work_dirs/test_visual/20220810_115248/vis_data
```
### Visualizing a single data sample
If you want to visualize a single data sample, we recommend using `SegLocalVisualizer`.
`SegLocalVisualizer` is a subclass of the `Visualizer` class in MMEngine and is designed for MMSegmentation visualization. For details about `Visualizer`, please refer to the [visualization tutorial](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html) in MMEngine.
Here is an example of `SegLocalVisualizer`. First, you can download the data used in this example with the following commands:
<div align=center>
<img src="https://user-images.githubusercontent.com/24582831/189833109-eddad58f-f777-4fc0-b98a-6bd429143b06.png" width="70%"/>
</div>
```shell
wget https://user-images.githubusercontent.com/24582831/189833109-eddad58f-f777-4fc0-b98a-6bd429143b06.png --output-document aachen_000000_000019_leftImg8bit.png
wget https://user-images.githubusercontent.com/24582831/189833143-15f60f8a-4d1e-4cbb-a6e7-5e2233869fac.png --output-document aachen_000000_000019_gtFine_labelTrainIds.png
```
Then you can find their local paths and visualize them with the script below:
```python
import mmcv
import os.path as osp
import torch
# `PixelData` is the data structure in MMEngine for pixel-level annotations or predictions.
# Please refer to the MMEngine data structure tutorial:
# https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/data_element.html#pixeldata
from mmengine.structures import PixelData
# `SegDataSample` is the data structure interface between different components in MMSegmentation.
# It includes the ground truth, the semantic segmentation prediction and the predicted logits.
# For details, please refer to the `SegDataSample` tutorial:
# https://github.com/open-mmlab/mmsegmentation/blob/1.x/docs/en/advanced_guides/structures.md
from mmseg.structures import SegDataSample
from mmseg.visualization import SegLocalVisualizer

out_file = 'out_file_cityscapes'
save_dir = './work_dirs'

image = mmcv.imread(
    osp.join(
        osp.dirname(__file__),
        './aachen_000000_000019_leftImg8bit.png'
    ),
    'color')
sem_seg = mmcv.imread(
    osp.join(
        osp.dirname(__file__),
        './aachen_000000_000019_gtFine_labelTrainIds.png'  # noqa
    ),
    'unchanged')
sem_seg = torch.from_numpy(sem_seg)
gt_sem_seg_data = dict(data=sem_seg)
gt_sem_seg = PixelData(**gt_sem_seg_data)
data_sample = SegDataSample()
data_sample.gt_sem_seg = gt_sem_seg

seg_local_visualizer = SegLocalVisualizer(
    vis_backends=[dict(type='LocalVisBackend')],
    save_dir=save_dir)

# The meta information of a dataset usually includes `classes` for class names and
# `palette` for the color of each foreground class.
# All class names and palettes are defined in this file:
# https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/utils/class_names.py
seg_local_visualizer.dataset_meta = dict(
    classes=('road', 'sidewalk', 'building', 'wall', 'fence',
             'pole', 'traffic light', 'traffic sign',
             'vegetation', 'terrain', 'sky', 'person', 'rider',
             'car', 'truck', 'bus', 'train', 'motorcycle',
             'bicycle'),
    palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70],
             [102, 102, 156], [190, 153, 153], [153, 153, 153],
             [250, 170, 30], [220, 220, 0], [107, 142, 35],
             [152, 251, 152], [70, 130, 180], [220, 20, 60],
             [255, 0, 0], [0, 0, 142], [0, 0, 70],
             [0, 60, 100], [0, 80, 100], [0, 0, 230],
             [119, 11, 32]])

# When `show=True`, the results are displayed directly;
# when `show=False`, the results are saved to a local folder.
seg_local_visualizer.add_datasample(out_file, image,
                                    data_sample, show=False)
```
The visualized image and its corresponding ground truth image can be found at `./work_dirs/vis_data/vis_image/`; the file name is `out_file_cityscapes_0.png`:
<div align=center>
<img src="https://user-images.githubusercontent.com/24582831/189835713-c0534054-4bfa-4b75-9254-0afbeb5ff02e.png" width="70%"/>
</div>
If you would like more guidance on how to use visualization, you can refer to the [visualization tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/visualization.html) in MMEngine.

View File

@ -7,7 +7,7 @@ from packaging.version import parse
from .version import __version__, version_info
MMCV_MIN = '2.0.0rc1'
MMCV_MIN = '2.0.0rc3'
MMCV_MAX = '2.1.0'
MMENGINE_MIN = '0.1.0'
MMENGINE_MAX = '1.0.0'

View File

@ -22,7 +22,8 @@ from .transforms import (CLAHE, AdjustGamma, GenerateEdge, LoadAnnotations,
LoadBiomedicalImageFromFile, LoadImageFromNDArray,
PackSegInputs, PhotoMetricDistortion, RandomCrop,
RandomCutOut, RandomMosaic, RandomRotate, Rerange,
ResizeToMultiple, RGB2Gray, SegRescale)
ResizeShortestEdge, ResizeToMultiple, RGB2Gray,
SegRescale)
from .voc import PascalVOCDataset
__all__ = [
@ -36,5 +37,5 @@ __all__ = [
'RandomCutOut', 'RandomMosaic', 'PackSegInputs', 'ResizeToMultiple',
'LoadImageFromNDArray', 'LoadBiomedicalImageFromFile',
'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge',
'DecathlonDataset', 'LIPDataset'
'DecathlonDataset', 'LIPDataset', 'ResizeShortestEdge'
]

View File

@ -80,9 +80,13 @@ class ADE20KDataset(BaseSegDataset):
[184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
[102, 255, 0], [92, 0, 255]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.jpg',
seg_map_suffix='.png',
reduce_zero_label=True,
**kwargs) -> None:
super().__init__(
img_suffix='.jpg',
seg_map_suffix='.png',
reduce_zero_label=True,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)
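Since `img_suffix`, `seg_map_suffix` and `reduce_zero_label` are now exposed as constructor arguments, a dataset whose files use non-default suffixes can be configured directly. A hedged sketch (the `data_root`, prefixes and pipeline name below are illustrative, not taken from the change itself):
```python
train_dataloader = dict(
    batch_size=4,
    dataset=dict(
        type='ADE20KDataset',
        data_root='data/my_ade20k_like_dataset',   # hypothetical dataset layout
        data_prefix=dict(
            img_path='images/training', seg_map_path='annotations/training'),
        img_suffix='.png',          # override the former hard-coded '.jpg'
        seg_map_suffix='.png',
        reduce_zero_label=True,
        pipeline=train_pipeline))   # train_pipeline defined elsewhere in the config
```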

View File

@ -65,7 +65,7 @@ class BaseSegDataset(BaseDataset):
instantiation. In some cases, such as visualization, only the meta
information of the dataset is needed, which is not necessary to
load annotation file. ``Basedataset`` can skip load annotations to
save time by set ``lazy_init=False``. Defaults to False.
save time by set ``lazy_init=True``. Defaults to False.
max_refetch (int, optional): If ``Basedataset.prepare_data`` get a
None img. The maximum extra number of cycles to get a valid
image. Defaults to 1000.
@ -179,7 +179,7 @@ class BaseSegDataset(BaseDataset):
f'subset of classes {old_classes} in METAINFO.')
for i, c in enumerate(old_classes):
if c not in new_classes:
label_map[i] = -1
label_map[i] = 255
else:
label_map[i] = new_classes.index(c)
return label_map

View File

@ -17,10 +17,14 @@ class ChaseDB1Dataset(BaseSegDataset):
classes=('background', 'vessel'),
palette=[[120, 120, 120], [6, 230, 230]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='_1stHO.png',
reduce_zero_label=False,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='_1stHO.png',
reduce_zero_label=False,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)
assert self.file_client.exists(self.data_prefix['img_path'])

View File

@ -91,6 +91,9 @@ class COCOStuffDataset(BaseSegDataset):
[192, 192, 0], [128, 64, 96], [192, 32, 64], [192, 64, 128],
[64, 192, 96], [64, 160, 64], [64, 64, 0]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.jpg',
seg_map_suffix='_labelTrainIds.png',
**kwargs) -> None:
super().__init__(
img_suffix='.jpg', seg_map_suffix='_labelTrainIds.png', **kwargs)
img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)

View File

@ -7,8 +7,9 @@ from .cityscapes import CityscapesDataset
class DarkZurichDataset(CityscapesDataset):
"""DarkZurichDataset dataset."""
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='_rgb_anon.png',
seg_map_suffix='_gt_labelTrainIds.png',
**kwargs) -> None:
super().__init__(
img_suffix='_rgb_anon.png',
seg_map_suffix='_gt_labelTrainIds.png',
**kwargs)
img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)

View File

@ -17,10 +17,14 @@ class DRIVEDataset(BaseSegDataset):
classes=('background', 'vessel'),
palette=[[120, 120, 120], [6, 230, 230]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='_manual1.png',
reduce_zero_label=False,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='_manual1.png',
reduce_zero_label=False,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)
assert self.file_client.exists(self.data_prefix['img_path'])

View File

@ -17,10 +17,14 @@ class HRFDataset(BaseSegDataset):
classes=('background', 'vessel'),
palette=[[120, 120, 120], [6, 230, 230]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=False,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=False,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)
assert self.file_client.exists(self.data_prefix['img_path'])

View File

@ -23,10 +23,14 @@ class iSAIDDataset(BaseSegDataset):
[0, 0, 127], [0, 0, 191], [0, 0, 255], [0, 191, 127],
[0, 127, 191], [0, 127, 255], [0, 100, 155]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='_instance_color_RGB.png',
ignore_index=255,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='_instance_color_RGB.png',
ignore_index=255,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
ignore_index=ignore_index,
**kwargs)
assert self.file_client.exists(self.data_prefix['img_path'])

View File

@ -17,9 +17,13 @@ class ISPRSDataset(BaseSegDataset):
palette=[[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0],
[255, 255, 0], [255, 0, 0]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)

View File

@ -39,5 +39,9 @@ class LIPDataset(BaseSegDataset):
[255, 170, 0],
))
def __init__(self, **kwargs) -> None:
super().__init__(img_suffix='.jpg', seg_map_suffix='.png', **kwargs)
def __init__(self,
img_suffix='.jpg',
seg_map_suffix='.png',
**kwargs) -> None:
super().__init__(
img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)

View File

@ -17,9 +17,13 @@ class LoveDADataset(BaseSegDataset):
palette=[[255, 255, 255], [255, 0, 0], [255, 255, 0], [0, 0, 255],
[159, 129, 183], [0, 255, 0], [255, 195, 128]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)

View File

@ -7,8 +7,9 @@ from .cityscapes import CityscapesDataset
class NightDrivingDataset(CityscapesDataset):
"""NightDrivingDataset dataset."""
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='_leftImg8bit.png',
seg_map_suffix='_gtCoarse_labelTrainIds.png',
**kwargs) -> None:
super().__init__(
img_suffix='_leftImg8bit.png',
seg_map_suffix='_gtCoarse_labelTrainIds.png',
**kwargs)
img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)

View File

@ -45,10 +45,14 @@ class PascalContextDataset(BaseSegDataset):
[255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
[255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]])
def __init__(self, ann_file: str, **kwargs) -> None:
def __init__(self,
ann_file: str,
img_suffix='.jpg',
seg_map_suffix='.png',
**kwargs) -> None:
super().__init__(
img_suffix='.jpg',
seg_map_suffix='.png',
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
ann_file=ann_file,
reduce_zero_label=False,
**kwargs)
@ -95,12 +99,17 @@ class PascalContextDataset59(BaseSegDataset):
[255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
[255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]])
def __init__(self, ann_file: str, **kwargs):
def __init__(self,
ann_file: str,
img_suffix='.jpg',
seg_map_suffix='.png',
reduce_zero_label=True,
**kwargs):
super().__init__(
img_suffix='.jpg',
seg_map_suffix='.png',
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
ann_file=ann_file,
reduce_zero_label=True,
reduce_zero_label=reduce_zero_label,
**kwargs)
assert self.file_client.exists(
self.data_prefix['img_path']) and osp.isfile(self.ann_file)

View File

@ -17,9 +17,13 @@ class PotsdamDataset(BaseSegDataset):
palette=[[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0],
[255, 255, 0], [255, 0, 0]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)

View File

@ -16,10 +16,14 @@ class STAREDataset(BaseSegDataset):
classes=('background', 'vessel'),
palette=[[120, 120, 120], [6, 230, 230]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='.ah.png',
reduce_zero_label=False,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='.ah.png',
reduce_zero_label=False,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)
assert self.file_client.exists(self.data_prefix['img_path'])

View File

@ -5,13 +5,15 @@ from .loading import (LoadAnnotations, LoadBiomedicalAnnotation,
LoadImageFromNDArray)
from .transforms import (CLAHE, AdjustGamma, GenerateEdge,
PhotoMetricDistortion, RandomCrop, RandomCutOut,
RandomMosaic, RandomRotate, Rerange, ResizeToMultiple,
RGB2Gray, SegRescale)
RandomMosaic, RandomRotate, Rerange,
ResizeShortestEdge, ResizeToMultiple, RGB2Gray,
SegRescale)
__all__ = [
'LoadAnnotations', 'RandomCrop', 'SegRescale', 'PhotoMetricDistortion',
'RandomRotate', 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray',
'RandomCutOut', 'RandomMosaic', 'PackSegInputs', 'ResizeToMultiple',
'LoadImageFromNDArray', 'LoadBiomedicalImageFromFile',
'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge'
'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge',
'ResizeShortestEdge'
]

View File

@ -1226,3 +1226,87 @@ class GenerateEdge(BaseTransform):
repr_str += f'edge_width={self.edge_width}, '
repr_str += f'ignore_index={self.ignore_index})'
return repr_str
@TRANSFORMS.register_module()
class ResizeShortestEdge(BaseTransform):
"""Resize the image and mask while keeping the aspect ratio unchanged.
Modified from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/augmentation_impl.py#L130 # noqa:E501
Copyright (c) Facebook, Inc. and its affiliates.
Licensed under the Apache-2.0 License
This transform attempts to scale the shorter edge to the given
`scale`, as long as the longer edge does not exceed `max_size`.
If `max_size` is reached, then downscale so that the longer
edge does not exceed `max_size`.
Required Keys:
- img
- gt_seg_map (optional)
Modified Keys:
- img
- img_shape
- gt_seg_map (optional))
Added Keys:
- scale
- scale_factor
- keep_ratio
Args:
scale (Union[int, Tuple[int, int]]): The target short edge length.
If it's tuple, will select the min value as the short edge length.
max_size (int): The maximum allowed longest edge length.
"""
def __init__(self, scale: Union[int, Tuple[int, int]],
max_size: int) -> None:
super().__init__()
self.scale = scale
self.max_size = max_size
# Create an empty Resize object
self.resize = TRANSFORMS.build({
'type': 'Resize',
'scale': 0,
'keep_ratio': True
})
def _get_output_shape(self, img, short_edge_length) -> Tuple[int, int]:
"""Compute the target image shape with the given `short_edge_length`.
Args:
img (np.ndarray): The input image.
short_edge_length (Union[int, Tuple[int, int]]): The target short
edge length. If it's tuple, will select the min value as the
short edge length.
"""
h, w = img.shape[:2]
if isinstance(short_edge_length, int):
size = short_edge_length * 1.0
elif isinstance(short_edge_length, tuple):
size = min(short_edge_length) * 1.0
scale = size / min(h, w)
if h < w:
new_h, new_w = size, scale * w
else:
new_h, new_w = scale * h, size
if max(new_h, new_w) > self.max_size:
scale = self.max_size * 1.0 / max(new_h, new_w)
new_h *= scale
new_w *= scale
new_h = int(new_h + 0.5)
new_w = int(new_w + 0.5)
return (new_w, new_h)
def transform(self, results: Dict) -> Dict:
self.resize.scale = self._get_output_shape(results['img'], self.scale)
return self.resize(results)
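For reference, a hedged sketch of how `ResizeShortestEdge` could be placed in a test pipeline; the `scale` and `max_size` values here are illustrative rather than taken from a shipped config:
```python
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # scale the short edge to 1024 pixels, but cap the long edge at 2560 pixels
    dict(type='ResizeShortestEdge', scale=1024, max_size=2560),
    dict(type='LoadAnnotations'),
    dict(type='PackSegInputs')
]
```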

View File

@ -24,10 +24,14 @@ class PascalVOCDataset(BaseSegDataset):
[0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
[0, 64, 128]])
def __init__(self, ann_file, **kwargs) -> None:
def __init__(self,
ann_file,
img_suffix='.jpg',
seg_map_suffix='.png',
**kwargs) -> None:
super().__init__(
img_suffix='.jpg',
seg_map_suffix='.png',
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
ann_file=ann_file,
**kwargs)
assert self.file_client.exists(

View File

@ -11,6 +11,7 @@ from mmengine.model import BaseModule, ModuleList
from mmengine.model.weight_init import (constant_init, kaiming_init,
trunc_normal_)
from mmengine.runner.checkpoint import _load_checkpoint
from scipy import interpolate
from torch.nn.modules.batchnorm import _BatchNorm
from torch.nn.modules.utils import _pair as to_2tuple
@ -18,11 +19,6 @@ from mmseg.registry import MODELS
from ..utils import PatchEmbed
from .vit import TransformerEncoderLayer as VisionTransformerEncoderLayer
try:
from scipy import interpolate
except ImportError:
interpolate = None
class BEiTAttention(BaseModule):
"""Window based multi-head self-attention (W-MSA) module with relative

View File

@ -13,7 +13,7 @@ from mmengine.logging import print_log
from mmengine.model import BaseModule, ModuleList
from mmengine.model.weight_init import (constant_init, trunc_normal_,
trunc_normal_init)
from mmengine.runner import CheckpointLoader, load_state_dict
from mmengine.runner import CheckpointLoader
from mmengine.utils import to_2tuple
from mmseg.registry import MODELS
@ -732,7 +732,7 @@ class SwinTransformer(BaseModule):
nH2, L2).permute(1, 0).contiguous()
# load state_dict
load_state_dict(self, state_dict, strict=False, logger=None)
self.load_state_dict(state_dict, strict=False)
def forward(self, x):
x, hw_shape = self.patch_embed(x)

View File

@ -48,18 +48,24 @@ class SegDataPreProcessor(BaseDataPreprocessor):
rgb_to_bgr (bool): whether to convert image from RGB to BGR.
Defaults to False.
batch_augments (list[dict], optional): Batch-level augmentations
test_cfg (dict, optional): The padding size config in testing, if not
specify, will use `size` and `size_divisor` params as default.
Defaults to None, only supports keys `size` or `size_divisor`.
"""
def __init__(self,
mean: Sequence[Number] = None,
std: Sequence[Number] = None,
size: Optional[tuple] = None,
size_divisor: Optional[int] = None,
pad_val: Number = 0,
seg_pad_val: Number = 255,
bgr_to_rgb: bool = False,
rgb_to_bgr: bool = False,
batch_augments: Optional[List[dict]] = None):
def __init__(
self,
mean: Sequence[Number] = None,
std: Sequence[Number] = None,
size: Optional[tuple] = None,
size_divisor: Optional[int] = None,
pad_val: Number = 0,
seg_pad_val: Number = 255,
bgr_to_rgb: bool = False,
rgb_to_bgr: bool = False,
batch_augments: Optional[List[dict]] = None,
test_cfg: dict = None,
):
super().__init__()
self.size = size
self.size_divisor = size_divisor
@ -86,6 +92,9 @@ class SegDataPreProcessor(BaseDataPreprocessor):
# TODO: support batch augmentations.
self.batch_augments = batch_augments
# Support different padding methods in testing
self.test_cfg = test_cfg
def forward(self, data: dict, training: bool = False) -> Dict[str, Any]:
"""Perform normalization、padding and bgr2rgb conversion based on
``BaseDataPreprocessor``.
@ -122,10 +131,21 @@ class SegDataPreProcessor(BaseDataPreprocessor):
if self.batch_augments is not None:
inputs, data_samples = self.batch_augments(
inputs, data_samples)
return dict(inputs=inputs, data_samples=data_samples)
else:
assert len(inputs) == 1, (
'Batch inference is not support currently, '
'as the image size might be different in a batch')
return dict(
inputs=torch.stack(inputs, dim=0), data_samples=data_samples)
# pad images when testing
if self.test_cfg:
inputs, padded_samples = stack_batch(
inputs=inputs,
size=self.test_cfg.get('size', None),
size_divisor=self.test_cfg.get('size_divisor', None),
pad_val=self.pad_val,
seg_pad_val=self.seg_pad_val)
for data_sample, pad_info in zip(data_samples, padded_samples):
data_sample.set_metainfo({**pad_info})
else:
inputs = torch.stack(inputs, dim=0)
return dict(inputs=inputs, data_samples=data_samples)
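A hedged sketch of how the new `test_cfg` padding option might be used in a config; the size values and divisor are illustrative assumptions, not taken from a specific config in this change:
```python
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255,
    size=(512, 512),                 # training-time padding size
    test_cfg=dict(size_divisor=32))  # pad test inputs to a multiple of 32
```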

View File

@ -15,6 +15,8 @@ from .gc_head import GCHead
from .isa_head import ISAHead
from .knet_head import IterativeDecodeHead, KernelUpdateHead, KernelUpdator
from .lraspp_head import LRASPPHead
from .mask2former_head import Mask2FormerHead
from .maskformer_head import MaskFormerHead
from .nl_head import NLHead
from .ocr_head import OCRHead
from .point_head import PointHead
@ -36,5 +38,5 @@ __all__ = [
'PointHead', 'APCHead', 'DMHead', 'LRASPPHead', 'SETRUPHead',
'SETRMLAHead', 'DPTHead', 'SETRMLAHead', 'SegmenterMaskTransformerHead',
'SegformerHead', 'ISAHead', 'STDCHead', 'IterativeDecodeHead',
'KernelUpdateHead', 'KernelUpdator'
'KernelUpdateHead', 'KernelUpdator', 'MaskFormerHead', 'Mask2FormerHead'
]

View File

@ -120,7 +120,7 @@ class BaseDecodeHead(BaseModule, metaclass=ABCMeta):
warnings.warn('For binary segmentation, we suggest using'
'`out_channels = 1` to define the output'
'channels of segmentor, and use `threshold`'
'to convert seg_logist into a prediction'
'to convert `seg_logits` into a prediction'
'applying a threshold')
out_channels = num_classes

View File

@ -413,6 +413,9 @@ class IterativeDecodeHead(BaseDecodeHead):
def __init__(self, num_stages, kernel_generate_head, kernel_update_head,
**kwargs):
# ``IterativeDecodeHead`` would skip initialization of
# ``BaseDecodeHead`` which would be called when building
# ``self.kernel_generate_head``.
super(BaseDecodeHead, self).__init__(**kwargs)
assert num_stages == len(kernel_update_head)
self.num_stages = num_stages
@ -422,6 +425,7 @@ class IterativeDecodeHead(BaseDecodeHead):
self.num_classes = self.kernel_generate_head.num_classes
self.input_transform = self.kernel_generate_head.input_transform
self.ignore_index = self.kernel_generate_head.ignore_index
self.out_channels = self.num_classes
for head_cfg in kernel_update_head:
self.kernel_update_head.append(MODELS.build(head_cfg))

View File

@ -0,0 +1,162 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
try:
from mmdet.models.dense_heads import \
Mask2FormerHead as MMDET_Mask2FormerHead
except ModuleNotFoundError:
MMDET_Mask2FormerHead = None
from mmengine.structures import InstanceData
from torch import Tensor
from mmseg.registry import MODELS
from mmseg.structures.seg_data_sample import SegDataSample
from mmseg.utils import ConfigType, SampleList
@MODELS.register_module()
class Mask2FormerHead(MMDET_Mask2FormerHead):
"""Implements the Mask2Former head.
See `Mask2Former: Masked-attention Mask Transformer for Universal Image
Segmentation <https://arxiv.org/abs/2112.01527>`_ for details.
Args:
num_classes (int): Number of classes. Default: 150.
align_corners (bool): align_corners argument of F.interpolate.
Default: False.
ignore_index (int): The label index to be ignored. Default: 255.
"""
def __init__(self,
num_classes,
align_corners=False,
ignore_index=255,
**kwargs):
super().__init__(**kwargs)
self.num_classes = num_classes
self.align_corners = align_corners
self.out_channels = num_classes
self.ignore_index = ignore_index
feat_channels = kwargs['feat_channels']
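# Rebuild the classification layer so its output size is ``num_classes + 1``;
# the extra channel acts as the "no object" class dropped at inference.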
self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
def _seg_data_to_instance_data(self, batch_data_samples: SampleList):
"""Perform forward propagation to convert paradigm from MMSegmentation
to MMDetection to ensure ``MMDET_Mask2FormerHead`` could be called
normally. Specifically, ``batch_gt_instances`` would be added.
Args:
batch_data_samples (List[:obj:`SegDataSample`]): The Data
Samples. It usually includes information such as
`gt_sem_seg`.
Returns:
tuple: A tuple containing two lists.
- batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``labels``, the unique
ground truth label ids of the image with shape (num_gt, ),
and ``masks``, the ground truth mask of each instance with
shape (num_gt, h, w).
- batch_img_metas (list[dict]): List of image meta information.
"""
batch_img_metas = []
batch_gt_instances = []
for data_sample in batch_data_samples:
batch_img_metas.append(data_sample.metainfo)
gt_sem_seg = data_sample.gt_sem_seg.data
classes = torch.unique(
gt_sem_seg,
sorted=False,
return_inverse=False,
return_counts=False)
# remove ignored region
gt_labels = classes[classes != self.ignore_index]
masks = []
for class_id in gt_labels:
masks.append(gt_sem_seg == class_id)
if len(masks) == 0:
gt_masks = torch.zeros(
(0, gt_sem_seg.shape[-2],
gt_sem_seg.shape[-1])).to(gt_sem_seg).long()
else:
gt_masks = torch.stack(masks).squeeze(1).long()
instance_data = InstanceData(labels=gt_labels, masks=gt_masks)
batch_gt_instances.append(instance_data)
return batch_gt_instances, batch_img_metas
def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
train_cfg: ConfigType) -> dict:
"""Perform forward propagation and loss calculation of the decoder head
on the features of the upstream network.
Args:
x (tuple[Tensor]): Multi-level features from the upstream
network, each is a 4D-tensor.
batch_data_samples (List[:obj:`SegDataSample`]): The Data
Samples. It usually includes information such as
`gt_sem_seg`.
train_cfg (ConfigType): Training config.
Returns:
dict[str, Tensor]: a dictionary of loss components.
"""
# batch SegDataSample to InstanceData
batch_gt_instances, batch_img_metas = self._seg_data_to_instance_data(
batch_data_samples)
# forward
all_cls_scores, all_mask_preds = self(x, batch_data_samples)
# loss
losses = self.loss_by_feat(all_cls_scores, all_mask_preds,
batch_gt_instances, batch_img_metas)
return losses
def predict(self, x: Tuple[Tensor], batch_img_metas: List[dict],
test_cfg: ConfigType) -> Tuple[Tensor]:
"""Test without augmentaton.
Args:
x (tuple[Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
batch_img_metas (List[dict]): List of image meta information,
such as `img_shape` and `pad_shape`.
test_cfg (ConfigType): Test config.
Returns:
Tensor: A tensor of segmentation mask.
"""
batch_data_samples = [
SegDataSample(metainfo=metainfo) for metainfo in batch_img_metas
]
all_cls_scores, all_mask_preds = self(x, batch_data_samples)
mask_cls_results = all_cls_scores[-1]
mask_pred_results = all_mask_preds[-1]
if 'pad_shape' in batch_img_metas[0]:
size = batch_img_metas[0]['pad_shape']
else:
size = batch_img_metas[0]['img_shape']
# upsample mask
mask_pred_results = F.interpolate(
mask_pred_results, size=size, mode='bilinear', align_corners=False)
cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1]
mask_pred = mask_pred_results.sigmoid()
seg_logits = torch.einsum('bqc, bqhw->bchw', cls_score, mask_pred)
return seg_logits
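The last two lines of ``predict`` fuse per-query class probabilities with per-query mask probabilities into dense per-class logits. A shape-only sketch of that einsum (sizes below are illustrative):

import torch
import torch.nn.functional as F

batch, queries, classes, h, w = 2, 100, 150, 64, 64
mask_cls_results = torch.randn(batch, queries, classes + 1)  # incl. "no object"
mask_pred_results = torch.randn(batch, queries, h, w)

cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1]  # drop "no object"
mask_pred = mask_pred_results.sigmoid()
seg_logits = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred)
assert seg_logits.shape == (batch, classes, h, w)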

View File

@ -0,0 +1,173 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
try:
from mmdet.models.dense_heads import MaskFormerHead as MMDET_MaskFormerHead
except ModuleNotFoundError:
MMDET_MaskFormerHead = None
from mmengine.structures import InstanceData
from torch import Tensor
from mmseg.registry import MODELS
from mmseg.structures.seg_data_sample import SegDataSample
from mmseg.utils import ConfigType, SampleList
@MODELS.register_module()
class MaskFormerHead(MMDET_MaskFormerHead):
"""Implements the MaskFormer head.
See `Per-Pixel Classification is Not All You Need for Semantic Segmentation
<https://arxiv.org/pdf/2107.06278>`_ for details.
Args:
num_classes (int): Number of classes. Default: 150.
align_corners (bool): align_corners argument of F.interpolate.
Default: False.
ignore_index (int): The label index to be ignored. Default: 255.
"""
def __init__(self,
num_classes: int = 150,
align_corners: bool = False,
ignore_index: int = 255,
**kwargs) -> None:
super().__init__(**kwargs)
self.num_classes = num_classes
self.align_corners = align_corners
self.out_channels = num_classes
self.ignore_index = ignore_index
feat_channels = kwargs['feat_channels']
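# Rebuild the classification layer so its output size is ``num_classes + 1``;
# the extra channel acts as the "no object" class dropped at inference.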
self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
def _seg_data_to_instance_data(self, batch_data_samples: SampleList):
"""Perform forward propagation to convert paradigm from MMSegmentation
to MMDetection to ensure ``MMDET_MaskFormerHead`` could be called
normally. Specifically, ``batch_gt_instances`` would be added.
Args:
batch_data_samples (List[:obj:`SegDataSample`]): The Data
Samples. It usually includes information such as
`gt_sem_seg`.
Returns:
tuple: A tuple containing two lists.
- batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``labels``, the unique
ground truth label ids of the image with shape (num_gt, ),
and ``masks``, the ground truth mask of each instance with
shape (num_gt, h, w).
- batch_img_metas (list[dict]): List of image meta information.
"""
batch_img_metas = []
batch_gt_instances = []
for data_sample in batch_data_samples:
# Add `batch_input_shape` in metainfo of data_sample, which would
# be used in MaskFormerHead of MMDetection.
metainfo = data_sample.metainfo
metainfo['batch_input_shape'] = metainfo['img_shape']
data_sample.set_metainfo(metainfo)
batch_img_metas.append(data_sample.metainfo)
gt_sem_seg = data_sample.gt_sem_seg.data
classes = torch.unique(
gt_sem_seg,
sorted=False,
return_inverse=False,
return_counts=False)
# remove ignored region
gt_labels = classes[classes != self.ignore_index]
masks = []
for class_id in gt_labels:
masks.append(gt_sem_seg == class_id)
if len(masks) == 0:
gt_masks = torch.zeros((0, gt_sem_seg.shape[-2],
gt_sem_seg.shape[-1])).to(gt_sem_seg)
else:
gt_masks = torch.stack(masks).squeeze(1)
instance_data = InstanceData(
labels=gt_labels, masks=gt_masks.long())
batch_gt_instances.append(instance_data)
return batch_gt_instances, batch_img_metas
def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
train_cfg: ConfigType) -> dict:
"""Perform forward propagation and loss calculation of the decoder head
on the features of the upstream network.
Args:
x (tuple[Tensor]): Multi-level features from the upstream
network, each is a 4D-tensor.
batch_data_samples (List[:obj:`SegDataSample`]): The Data
Samples. It usually includes information such as
`gt_sem_seg`.
train_cfg (ConfigType): Training config.
Returns:
dict[str, Tensor]: a dictionary of loss components.
"""
# batch SegDataSample to InstanceData
batch_gt_instances, batch_img_metas = self._seg_data_to_instance_data(
batch_data_samples)
# forward
all_cls_scores, all_mask_preds = self(x, batch_data_samples)
# loss
losses = self.loss_by_feat(all_cls_scores, all_mask_preds,
batch_gt_instances, batch_img_metas)
return losses
def predict(self, x: Tuple[Tensor], batch_img_metas: List[dict],
test_cfg: ConfigType) -> Tuple[Tensor]:
"""Test without augmentaton.
Args:
x (tuple[Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
batch_img_metas (List[dict]): List of image meta information,
such as `img_shape` and `pad_shape`.
test_cfg (ConfigType): Test config.
Returns:
Tensor: A tensor of segmentation mask.
"""
batch_data_samples = []
for metainfo in batch_img_metas:
metainfo['batch_input_shape'] = metainfo['img_shape']
batch_data_samples.append(SegDataSample(metainfo=metainfo))
# The forward function of MaskFormerHead from MMDetection needs
# 'batch_data_samples' as inputs, which here only carries the image shape.
all_cls_scores, all_mask_preds = self(x, batch_data_samples)
mask_cls_results = all_cls_scores[-1]
mask_pred_results = all_mask_preds[-1]
# upsample masks
img_shape = batch_img_metas[0]['batch_input_shape']
mask_pred_results = F.interpolate(
mask_pred_results,
size=img_shape,
mode='bilinear',
align_corners=False)
# semantic inference
cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1]
mask_pred = mask_pred_results.sigmoid()
seg_logits = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred)
return seg_logits
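Both heads build their ground-truth instances with the same semantic-to-instance conversion shown above. A tiny self-contained sketch of that step (label values are made up; ``ignore_index`` assumed to be 255):

import torch

ignore_index = 255
gt_sem_seg = torch.tensor([[[0, 0, 255],
                            [2, 2, 255],
                            [2, 0, 0]]])  # (1, H, W) semantic map
classes = torch.unique(gt_sem_seg, sorted=False)
gt_labels = classes[classes != ignore_index]  # unique labels, ignore removed
gt_masks = torch.stack([gt_sem_seg == c for c in gt_labels]).squeeze(1).long()
# one binary mask per remaining label, shape (num_gt, H, W)
assert gt_masks.shape == (gt_labels.numel(), 3, 3)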

View File

@ -159,12 +159,17 @@ class BaseSegmentor(BaseModel, metaclass=ABCMeta):
if not only_prediction:
img_meta = data_samples[i].metainfo
# remove padding area
padding_left, padding_right, padding_top, padding_bottom = \
img_meta.get('padding_size', [0]*4)
if 'img_padding_size' not in img_meta:
padding_size = img_meta.get('padding_size', [0] * 4)
else:
padding_size = img_meta['img_padding_size']
padding_left, padding_right, padding_top, padding_bottom =\
padding_size
# i_seg_logits shape is 1, C, H, W after remove padding
i_seg_logits = seg_logits[i:i + 1, :,
padding_top:H - padding_bottom,
padding_left:W - padding_right]
# resize as original shape
i_seg_logits = resize(
i_seg_logits,

View File

@ -105,6 +105,9 @@ def stack_batch(inputs: List[torch.Tensor],
})
padded_samples.append(data_sample)
else:
padded_samples = None
padded_samples.append(
dict(
img_padding_size=padding_size,
pad_shape=pad_img.shape[-2:]))
return torch.stack(padded_inputs, dim=0), padded_samples
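When no data samples are passed, the padding info is now returned as plain dicts. A minimal sketch, assuming ``stack_batch`` is importable from ``mmseg.utils`` as in the data preprocessor above:

import torch
from mmseg.utils import stack_batch

inputs = [torch.rand(3, 11, 10), torch.rand(3, 15, 18)]
batched, padded_samples = stack_batch(
    inputs=inputs, size_divisor=16, pad_val=0, seg_pad_val=255)
# Every image is padded to the same size, rounded up to the divisor.
assert batched.shape[-2] % 16 == 0 and batched.shape[-1] % 16 == 0
# Without data samples, padding info comes back as dicts (the new branch).
assert 'img_padding_size' in padded_samples[0]
assert padded_samples[0]['pad_shape'] == batched.shape[-2:]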

View File

@ -1,6 +1,6 @@
# Copyright (c) Open-MMLab. All rights reserved.
__version__ = '1.0.0rc1'
__version__ = '1.0.0rc2'
def parse_version_info(version_str):

View File

@ -25,6 +25,8 @@ Import:
- configs/isanet/isanet.yml
- configs/knet/knet.yml
- configs/mae/mae.yml
- configs/mask2former/mask2former.yml
- configs/maskformer/maskformer.yml
- configs/mobilenet_v2/mobilenet_v2.yml
- configs/mobilenet_v3/mobilenet_v3.yml
- configs/nonlocal_net/nonlocal_net.yml

View File

@ -1,3 +1,4 @@
mmcls>=1.0.0rc0
mmcv>=2.0.0rc1,<2.1.0
mmcv>=2.0.0rc3,<2.1.0
mmdet>=3.0.0rc4
mmengine>=0.1.0,<1.0.0
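The raised version floors can be checked at runtime with mmengine's version helper (a sketch under the assumption that ``digit_version`` is used the usual way; the PR itself only updates the requirement files):

import mmcv
import mmdet
from mmengine.utils import digit_version

assert digit_version(mmcv.__version__) >= digit_version('2.0.0rc3')
assert digit_version(mmdet.__version__) >= digit_version('3.0.0rc4')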

View File

@ -1,5 +1,5 @@
matplotlib
mmcls>=1.0.0rc0
numpy
packaging
prettytable
scipy

View File

@ -192,7 +192,6 @@ if __name__ == '__main__':
extras_require={
'all': parse_requirements('requirements.txt'),
'tests': parse_requirements('requirements/tests.txt'),
'build': parse_requirements('requirements/build.txt'),
'optional': parse_requirements('requirements/optional.txt'),
'mim': parse_requirements('requirements/mminstall.txt'),
},

View File

@ -10,6 +10,9 @@ from PIL import Image
from mmseg.datasets.transforms import * # noqa
from mmseg.datasets.transforms import PhotoMetricDistortion, RandomCrop
from mmseg.registry import TRANSFORMS
from mmseg.utils import register_all_modules
register_all_modules()
def test_resize():
@ -71,6 +74,34 @@ def test_resize():
resized_results = resize_module(results.copy())
assert max(resized_results['img_shape'][:2]) <= 1333 * 1.1
# test RandomChoiceResize whose `resize_type` is `ResizeShortestEdge`
transform = dict(
type='RandomChoiceResize',
scales=[128, 256, 512],
resize_type='ResizeShortestEdge',
max_size=1333)
resize_module = TRANSFORMS.build(transform)
resized_results = resize_module(results.copy())
assert resized_results['img_shape'][0] in [128, 256, 512]
transform = dict(
type='RandomChoiceResize',
scales=[512],
resize_type='ResizeShortestEdge',
max_size=512)
resize_module = TRANSFORMS.build(transform)
resized_results = resize_module(results.copy())
assert resized_results['img_shape'][1] == 512
transform = dict(
type='RandomChoiceResize',
scales=[(128, 256), (256, 512), (512, 1024)],
resize_type='ResizeShortestEdge',
max_size=1333)
resize_module = TRANSFORMS.build(transform)
resized_results = resize_module(results.copy())
assert resized_results['img_shape'][0] in [128, 256, 512]
# test scale=None and scale_factor is tuple.
# img shape: (288, 512, 3)
transform = dict(

View File

@ -140,8 +140,11 @@ def test_beit_init():
}
}
model = BEiT(img_size=(512, 512))
with pytest.raises(AttributeError):
model.resize_rel_pos_embed(ckpt)
# If scipy is installed, this AttributeError would not be raised.
from mmengine.utils import is_installed
if not is_installed('scipy'):
with pytest.raises(AttributeError):
model.resize_rel_pos_embed(ckpt)
# pretrained=None
# init_cfg=123, whose type is unsupported

View File

@ -138,8 +138,11 @@ def test_mae_init():
}
}
model = MAE(img_size=(512, 512))
with pytest.raises(AttributeError):
model.resize_rel_pos_embed(ckpt)
# If scipy is installed, this AttributeError would not be raised.
from mmengine.utils import is_installed
if not is_installed('scipy'):
with pytest.raises(AttributeError):
model.resize_rel_pos_embed(ckpt)
# test resize abs pos embed
ckpt = model.resize_abs_pos_embed(ckpt['state_dict'])

View File

@ -46,3 +46,19 @@ class TestSegDataPreProcessor(TestCase):
out = processor(data, training=True)
self.assertEqual(out['inputs'].shape, (2, 3, 20, 20))
self.assertEqual(len(out['data_samples']), 2)
# test predict with padding
processor = SegDataPreProcessor(
mean=[0, 0, 0],
std=[1, 1, 1],
size=(20, 20),
test_cfg=dict(size_divisor=15))
data = {
'inputs': [
torch.randint(0, 256, (3, 11, 10)),
],
'data_samples': [data_sample]
}
out = processor(data, training=False)
self.assertEqual(out['inputs'].shape[2] % 15, 0)
self.assertEqual(out['inputs'].shape[3] % 15, 0)

View File

@ -0,0 +1,160 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmengine import Config
from mmengine.structures import PixelData
from mmseg.models.decode_heads import Mask2FormerHead
from mmseg.structures import SegDataSample
from mmseg.utils import SampleList
from .utils import to_cuda
def test_mask2former_head():
num_classes = 19
cfg = dict(
in_channels=[96, 192, 384, 768],
strides=[4, 8, 16, 32],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
num_transformer_feat_level=3,
align_corners=False,
pixel_decoder=dict(
type='mmdet.MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='mmdet.DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='mmdet.BaseTransformerLayer',
attn_cfgs=dict(
type='mmdet.MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='mmdet.SinePositionalEncoding',
num_feats=128,
normalize=True),
init_cfg=None),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='mmdet.SinePositionalEncoding', num_feats=128,
normalize=True),
transformer_decoder=dict(
type='mmdet.DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='mmdet.DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='mmdet.MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='mean',
loss_weight=5.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=5.0),
train_cfg=dict(
num_points=12544,
oversample_ratio=3.0,
importance_sample_ratio=0.75,
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=2.0),
dict(
type='mmdet.CrossEntropyLossCost',
weight=5.0,
use_sigmoid=True),
dict(
type='mmdet.DiceCost',
weight=5.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler')))
cfg = Config(cfg)
head = Mask2FormerHead(**cfg)
inputs = [
torch.rand((2, 96, 8, 8)),
torch.rand((2, 192, 4, 4)),
torch.rand((2, 384, 2, 2)),
torch.rand((2, 768, 1, 1))
]
data_samples: SampleList = []
for i in range(2):
data_sample = SegDataSample()
img_meta = {}
img_meta['img_shape'] = (32, 32)
img_meta['ori_shape'] = (32, 32)
data_sample.gt_sem_seg = PixelData(
data=torch.randint(0, num_classes, (1, 32, 32)))
data_sample.set_metainfo(img_meta)
data_samples.append(data_sample)
if torch.cuda.is_available():
head, inputs = to_cuda(head, inputs)
for data_sample in data_samples:
data_sample.gt_sem_seg.data = data_sample.gt_sem_seg.data.cuda()
loss_dict = head.loss(inputs, data_samples, None)
assert isinstance(loss_dict, dict)
batch_img_metas = []
for data_sample in data_samples:
batch_img_metas.append(data_sample.metainfo)
seg_logits = head.predict(inputs, batch_img_metas, None)
assert seg_logits.shape == torch.Size((2, num_classes, 32, 32))

View File

@ -0,0 +1,54 @@
# Copyright (c) OpenMMLab. All rights reserved.
from os.path import dirname, join
import torch
from mmengine import Config
from mmengine.structures import PixelData
from mmseg.registry import MODELS
from mmseg.structures import SegDataSample
from mmseg.utils import register_all_modules
def test_maskformer_head():
register_all_modules()
repo_dpath = dirname(dirname(__file__))
cfg = Config.fromfile(
join(
repo_dpath,
'../../configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py' # noqa
))
cfg.model.train_cfg = None
decode_head = MODELS.build(cfg.model.decode_head)
inputs = (torch.randn(1, 256, 32, 32), torch.randn(1, 512, 16, 16),
torch.randn(1, 1024, 8, 8), torch.randn(1, 2048, 4, 4))
# test inference
batch_img_metas = [
dict(
scale_factor=(1.0, 1.0),
img_shape=(512, 683),
ori_shape=(512, 683))
]
test_cfg = dict(mode='whole')
output = decode_head.predict(inputs, batch_img_metas, test_cfg)
assert output.shape == (1, 150, 512, 683)
# test training
inputs = (torch.randn(2, 256, 32, 32), torch.randn(2, 512, 16, 16),
torch.randn(2, 1024, 8, 8), torch.randn(2, 2048, 4, 4))
batch_data_samples = []
img_meta = {
'img_shape': (512, 512),
'ori_shape': (480, 640),
'pad_shape': (512, 512),
'scale_factor': (1.425, 1.425),
}
for _ in range(2):
data_sample = SegDataSample(
gt_sem_seg=PixelData(data=torch.ones(512, 512).long()))
data_sample.set_metainfo(img_meta)
batch_data_samples.append(data_sample)
train_cfg = {}
losses = decode_head.loss(inputs, batch_data_samples, train_cfg)
assert all(loss in losses.keys()
for loss in ('loss_cls', 'loss_mask', 'loss_dice'))

View File

@ -1,8 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmengine import ConfigDict
from mmengine.structures import PixelData
from mmseg.models import build_segmentor
from mmseg.structures import SegDataSample
from .utils import _segmentor_forward_train_test
@ -57,3 +59,42 @@ def test_encoder_decoder():
cfg.test_cfg = ConfigDict(mode='whole')
segmentor = build_segmentor(cfg)
_segmentor_forward_train_test(segmentor)
def test_postprocess_result():
cfg = ConfigDict(
type='EncoderDecoder',
backbone=dict(type='ExampleBackbone'),
decode_head=dict(type='ExampleDecodeHead'),
train_cfg=None,
test_cfg=dict(mode='whole'))
model = build_segmentor(cfg)
# test postprocess
data_sample = SegDataSample()
data_sample.gt_sem_seg = PixelData(
**{'data': torch.randint(0, 10, (1, 8, 8))})
data_sample.set_metainfo({
'padding_size': (0, 2, 0, 2),
'ori_shape': (8, 8)
})
seg_logits = torch.zeros((1, 2, 10, 10))
seg_logits[:, :, :8, :8] = 1
data_samples = [data_sample]
outputs = model.postprocess_result(seg_logits, data_samples)
assert outputs[0].seg_logits.data.shape == torch.Size((2, 8, 8))
assert torch.allclose(outputs[0].seg_logits.data, torch.ones((2, 8, 8)))
data_sample = SegDataSample()
data_sample.gt_sem_seg = PixelData(
**{'data': torch.randint(0, 10, (1, 8, 8))})
data_sample.set_metainfo({
'img_padding_size': (0, 2, 0, 2),
'ori_shape': (8, 8)
})
data_samples = [data_sample]
outputs = model.postprocess_result(seg_logits, data_samples)
assert outputs[0].seg_logits.data.shape == torch.Size((2, 8, 8))
assert torch.allclose(outputs[0].seg_logits.data, torch.ones((2, 8, 8)))