Merge branch 'dev-1.x' of github.com:open-mmlab/mmsegmentation into 1.x

2025-06-03 22:03:48 +08:00 · 2023-02-01 21:34:56 +08:00 · 2023-02-01 21:34:56 +08:00 · a35e1c4232
commit a35e1c4232
parent 1c836be767 7ac0888d9f
54 changed files with 321 additions and 353 deletions
--- a/.circleci/test.yml
+++ b/.circleci/test.yml
@ -61,9 +61,9 @@ jobs:
          command: |
            pip install git+https://github.com/open-mmlab/mmengine.git@main
            pip install -U openmim
-            mim install 'mmcv==2.0.0rc3'
+            mim install 'mmcv>=2.0.0rc4'
            pip install git+https://github.com/open-mmlab/mmclassification@dev-1.x
-            mim install 'mmdet==3.0.0rc5'
+            pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
            pip install -r requirements/tests.txt -r requirements/optional.txt
      - run:
          name: Build and install
@ -97,6 +97,7 @@ jobs:
          command: |
            git clone -b main --depth 1 https://github.com/open-mmlab/mmengine.git /home/circleci/mmengine
            git clone -b dev-1.x --depth 1 https://github.com/open-mmlab/mmclassification.git /home/circleci/mmclassification
+            git clone -b dev-3.x --depth 1 https://github.com/open-mmlab/mmdetection.git /home/circleci/mmdetection
      - run:
          name: Build Docker image
          command: |
@ -107,9 +108,9 @@ jobs:
          command: |
            docker exec mmseg pip install -e /mmengine
            docker exec mmseg pip install -U openmim
-            docker exec mmseg mim install 'mmcv==2.0.0rc3'
+            docker exec mmseg mim install 'mmcv>=2.0.0rc4'
            docker exec mmseg pip install -e /mmclassification
-            docker exec mmseg mim install 'mmdet==3.0.0rc5'
+            docker exec mmseg pip install -e /mmdetection
            docker exec mmseg pip install -r requirements/tests.txt -r requirements/optional.txt
      - run:
          name: Build and install
--- a/.github/workflows/merge_stage_test.yml
+++ b/.github/workflows/merge_stage_test.yml
@ -44,9 +44,9 @@ jobs:
          python -V
          pip install -U openmim
          pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
          pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
      - name: Install unittest dependencies
        run: pip install -r requirements/tests.txt -r requirements/optional.txt
      - name: Build and install
@ -100,9 +100,9 @@ jobs:
          python -V
          pip install -U openmim
          pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
          pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
      - name: Install unittest dependencies
        run: pip install -r requirements/tests.txt -r requirements/optional.txt
      - name: Build and install
@ -166,9 +166,9 @@ jobs:
          python -V
          pip install -U openmim
          pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
          pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
      - name: Install unittest dependencies
        run: pip install -r requirements/tests.txt -r requirements/optional.txt
      - name: Build and install
@ -209,9 +209,9 @@ jobs:
          python -V
          pip install -U openmim
          pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
          pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
      - name: Install unittest dependencies
        run: pip install -r requirements/tests.txt -r requirements/optional.txt
      - name: Build and install
@ -244,9 +244,9 @@ jobs:
          python -V
          pip install -U openmim
          pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
          pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
      - name: Install unittest dependencies
        run: pip install -r requirements/tests.txt -r requirements/optional.txt
      - name: Build and install
--- a/.github/workflows/pr_stage_test.yml
+++ b/.github/workflows/pr_stage_test.yml
@ -44,9 +44,9 @@ jobs:
        run: |
          pip install -U openmim
          pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
          pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
      - name: Install unittest dependencies
        run: pip install -r requirements/tests.txt -r requirements/optional.txt
      - name: Build and install
@ -100,9 +100,9 @@ jobs:
          python -V
          pip install -U openmim
          pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
          pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
      - name: Install unittest dependencies
        run: pip install -r requirements/tests.txt -r requirements/optional.txt
      - name: Build and install
@ -135,9 +135,9 @@ jobs:
          python -V
          pip install -U openmim
          pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
          pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
      - name: Install unittest dependencies
        run: pip install -r requirements/tests.txt -r requirements/optional.txt
      - name: Build and install
--- a/README.md
+++ b/README.md
@ -62,7 +62,7 @@ The 1.x branch works with **PyTorch 1.6+**.

 ## What's New

-v1.0.0rc4 was released on 30/01/2023.
+v1.0.0rc5 was released on 01/02/2023.
 Please refer to [changelog.md](docs/en/notes/changelog.md) for details and release history.

 - Support ISNet (ICCV'2021) in projects ([#2400](https://github.com/open-mmlab/mmsegmentation/pull/2400))
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@ -61,7 +61,7 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O

 ## 更新日志

-最新版本 v1.0.0rc4 在 2023.01.30 发布。
+最新版本 v1.0.0rc5 在 2023.02.01 发布。
 如果想了解更多版本更新细节和历史信息，请阅读[更新日志](docs/en/notes/changelog.md)。

 ## 安装
--- a/configs/_base_/datasets/ade20k.py
+++ b/configs/_base_/datasets/ade20k.py
@ -25,7 +25,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/ade20k_640x640.py
+++ b/configs/_base_/datasets/ade20k_640x640.py
@ -25,7 +25,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/chase_db1.py
+++ b/configs/_base_/datasets/chase_db1.py
@ -26,7 +26,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/cityscapes.py
+++ b/configs/_base_/datasets/cityscapes.py
@ -25,7 +25,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/coco-stuff10k.py
+++ b/configs/_base_/datasets/coco-stuff10k.py
@ -25,7 +25,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/coco-stuff164k.py
+++ b/configs/_base_/datasets/coco-stuff164k.py
@ -25,7 +25,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/drive.py
+++ b/configs/_base_/datasets/drive.py
@ -26,7 +26,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/hrf.py
+++ b/configs/_base_/datasets/hrf.py
@ -26,7 +26,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/isaid.py
+++ b/configs/_base_/datasets/isaid.py
@ -32,7 +32,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/loveda.py
+++ b/configs/_base_/datasets/loveda.py
@ -25,7 +25,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/pascal_context_59.py
+++ b/configs/_base_/datasets/pascal_context_59.py
@ -28,7 +28,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/pascal_voc12.py
+++ b/configs/_base_/datasets/pascal_voc12.py
@ -25,7 +25,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/pascal_voc12_aug.py
+++ b/configs/_base_/datasets/pascal_voc12_aug.py
@ -27,7 +27,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/potsdam.py
+++ b/configs/_base_/datasets/potsdam.py
@ -25,7 +25,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/stare.py
+++ b/configs/_base_/datasets/stare.py
@ -26,7 +26,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/_base_/datasets/vaihingen.py
+++ b/configs/_base_/datasets/vaihingen.py
@ -25,7 +25,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/configs/erfnet/README.md
+++ b/configs/erfnet/README.md
@ -41,12 +41,14 @@ Semantic segmentation is a challenging task that addresses most of the perceptio

 ### Cityscapes

-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) |  mIoU | mIoU(ms+flip) | config                                                                                                                         | download                                                                                                                                                                                                                                                                                                                                                     |
-| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| ERFNet | ERFNet   | 512x1024  |  160000 | 6.04     | 15.26          | 71.08 | 72.6          | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20211126_082056-03d333ed.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20211126_082056.log.json) |
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config                                                                                                                         | download                                                                                                                                                                                                                                                                                                                                                     |
+| ------ | -------- | --------- | ------: | -------- | -------------- | ---: | ------------- | ------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ERFNet | ERFNet   | 512x1024  |  160000 | 6.04     | 15.26          | 72.5 | 74.75         | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145-dc90157a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145.log.json) |

 Note:

 - The model is trained from scratch.

 - Last deconvolution layer in the [original paper](https://github.com/Eromera/erfnet_pytorch/blob/master/train/erfnet.py#L123) is replaced by a naive `FCNHead` decoder head and a bilinear upsampling layer, found more effective and efficient.
+
+- This model performance is sensitive to the seed values used, please refer to the log file for the specific settings of the seed. If you choose a different seed, the results might differ from the table results.
--- a/configs/erfnet/erfnet.yml
+++ b/configs/erfnet/erfnet.yml
@ -31,7 +31,7 @@ Models:
  - Task: Semantic Segmentation
    Dataset: Cityscapes
    Metrics:
-      mIoU: 71.08
-      mIoU(ms+flip): 72.6
+      mIoU: 72.5
+      mIoU(ms+flip): 74.75
  Config: configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20211126_082056-03d333ed.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145-dc90157a.pth
--- a/configs/mask2former/README.md
+++ b/configs/mask2former/README.md
@ -45,24 +45,24 @@ pip install "mmdet>=3.0.0rc4"

 | Method      | Backbone       | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU  | mIoU(ms+flip) |                                                                                                                                                       config | download                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
 | ----------- | -------------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -----------------------------------------------------------------------------------------------------------------------------------------------------------: | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| Mask2Former | R-50-D32       | 512x1024  | 90000   |     5806 | 9.17           | 80.44 |             - |                      [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802.json)                                                                                      |
-| Mask2Former | R-101-D32      | 512x1024  | 90000   |     6971 | 7.11           | 80.80 |             - |                     [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-8ad528ea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628.json))                                                                                 |
-| Mask2Former | Swin-T         | 512x1024  | 90000   |     6511 | 7.18           | 81.71 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-290b34af.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501.json))                                                                         |
-| Mask2Former | Swin-S         | 512x1024  | 90000   |     8282 | 5.57           | 82.57 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-7c98854a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802.json))                                                                         |
-| Mask2Former | Swin-B (in22k) | 512x1024  | 90000   |    11152 | 4.32           | 83.52 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-59a4379a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030.json)) |
-| Mask2Former | Swin-L (in22k) | 512x1024  | 90000   |    16207 | 2.86           | 83.65 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-dc2c2ddd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901.json)) |
+| Mask2Former | R-50-D32       | 512x1024  | 90000   |     5806 | 9.17           | 80.44 |             - |                      [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-ffd9d750.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802.json)                                                                                      |
+| Mask2Former | R-101-D32      | 512x1024  | 90000   |     6971 | 7.11           | 80.80 |             - |                     [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-43e68666.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628.json))                                                                                 |
+| Mask2Former | Swin-T         | 512x1024  | 90000   |     6511 | 7.18           | 81.71 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-36c59341.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501.json))                                                                         |
+| Mask2Former | Swin-S         | 512x1024  | 90000   |     8282 | 5.57           | 82.57 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-9ab177f6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802.json))                                                                         |
+| Mask2Former | Swin-B (in22k) | 512x1024  | 90000   |    11152 | 4.32           | 83.52 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-9a86a225.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030.json)) |
+| Mask2Former | Swin-L (in22k) | 512x1024  | 90000   |    16207 | 2.86           | 83.65 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-28ad20f1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901.json)) |

 ### ADE20K

 | Method      | Backbone       | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU  | mIoU(ms+flip) |                                                                                                                                                   config | download                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
 | ----------- | -------------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| Mask2Former | R-50-D32       | 512x512   | 160000  |     3385 | 26.59          | 47.87 |             - |                      [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-4c62652d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055.json))                                                                                     |
-| Mask2Former | R-101-D32      | 512x512   | 160000  |     4190 | 22.97          | 48.60 |             - |                     [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b1169bc0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905.json))                                                                                 |
-| Mask2Former | Swin-T         | 512x512   | 160000  |     3826 | 23.82          | 48.66 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-4341520b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230.json))                                                                         |
-| Mask2Former | Swin-S         | 512x512   | 160000  |     5034 | 19.69          | 51.24 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-ab263c11.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905.json))                                                                         |
-| Mask2Former | Swin-B         | 640x640   | 160000  |     5795 | 12.48          | 52.44 |             - |  [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-35e3a2c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118.json))     |
-| Mask2Former | Swin-B (in22k) | 640x640   | 160000  |     5795 | 12.43          | 53.90 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-622e093b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230.json)) |
-| Mask2Former | Swin-L (in22k) | 640x640   | 160000  |     9077 | 8.81           | 56.01 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-5cc76a78.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933.json)) |
+| Mask2Former | R-50-D32       | 512x512   | 160000  |     3385 | 26.59          | 47.87 |             - |                      [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-2d1f55f1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055.json))                                                                                     |
+| Mask2Former | R-101-D32      | 512x512   | 160000  |     4190 | 22.97          | 48.60 |             - |                     [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b7135890.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905.json))                                                                                 |
+| Mask2Former | Swin-T         | 512x512   | 160000  |     3826 | 23.82          | 48.66 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-7d64e5dd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230.json))                                                                         |
+| Mask2Former | Swin-S         | 512x512   | 160000  |     5034 | 19.69          | 51.24 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-e715144e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905.json))                                                                         |
+| Mask2Former | Swin-B         | 640x640   | 160000  |     5795 | 12.48          | 52.44 |             - |  [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-a4a086d2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118.json))     |
+| Mask2Former | Swin-B (in22k) | 640x640   | 160000  |     5795 | 12.43          | 53.90 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-7ec0f569.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230.json)) |
+| Mask2Former | Swin-L (in22k) | 640x640   | 160000  |     9077 | 8.81           | 56.01 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-7120c214.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933.json)) |

 Note:

--- a/configs/mask2former/mask2former.yml
+++ b/configs/mask2former/mask2former.yml
@ -35,7 +35,7 @@ Models:
    Metrics:
      mIoU: 80.44
  Config: configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-ffd9d750.pth
 - Name: mask2former_r101_8xb2-90k_cityscapes-512x1024
  In Collection: Mask2Former
  Metadata:
@ -56,7 +56,7 @@ Models:
    Metrics:
      mIoU: 80.8
  Config: configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-8ad528ea.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-43e68666.pth
 - Name: mask2former_swin-t_8xb2-90k_cityscapes-512x1024
  In Collection: Mask2Former
  Metadata:
@ -77,7 +77,7 @@ Models:
    Metrics:
      mIoU: 81.71
  Config: configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-290b34af.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-36c59341.pth
 - Name: mask2former_swin-s_8xb2-90k_cityscapes-512x1024
  In Collection: Mask2Former
  Metadata:
@ -98,7 +98,7 @@ Models:
    Metrics:
      mIoU: 82.57
  Config: configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-7c98854a.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-9ab177f6.pth
 - Name: mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024
  In Collection: Mask2Former
  Metadata:
@ -119,7 +119,7 @@ Models:
    Metrics:
      mIoU: 83.52
  Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-59a4379a.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-9a86a225.pth
 - Name: mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024
  In Collection: Mask2Former
  Metadata:
@ -140,7 +140,7 @@ Models:
    Metrics:
      mIoU: 83.65
  Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-dc2c2ddd.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-28ad20f1.pth
 - Name: mask2former_r50_8xb2-160k_ade20k-512x512
  In Collection: Mask2Former
  Metadata:
@ -161,7 +161,7 @@ Models:
    Metrics:
      mIoU: 47.87
  Config: configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-4c62652d.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-2d1f55f1.pth
 - Name: mask2former_r101_8xb2-160k_ade20k-512x512
  In Collection: Mask2Former
  Metadata:
@ -182,7 +182,7 @@ Models:
    Metrics:
      mIoU: 48.6
  Config: configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b1169bc0.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b7135890.pth
 - Name: mask2former_swin-t_8xb2-160k_ade20k-512x512
  In Collection: Mask2Former
  Metadata:
@ -203,7 +203,7 @@ Models:
    Metrics:
      mIoU: 48.66
  Config: configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-4341520b.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-7d64e5dd.pth
 - Name: mask2former_swin-s_8xb2-160k_ade20k-512x512
  In Collection: Mask2Former
  Metadata:
@ -224,7 +224,7 @@ Models:
    Metrics:
      mIoU: 51.24
  Config: configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-ab263c11.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-e715144e.pth
 - Name: mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640
  In Collection: Mask2Former
  Metadata:
@ -245,7 +245,7 @@ Models:
    Metrics:
      mIoU: 52.44
  Config: configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-35e3a2c7.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-a4a086d2.pth
 - Name: mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640
  In Collection: Mask2Former
  Metadata:
@ -266,7 +266,7 @@ Models:
    Metrics:
      mIoU: 53.9
  Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-622e093b.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-7ec0f569.pth
 - Name: mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640
  In Collection: Mask2Former
  Metadata:
@ -287,4 +287,4 @@ Models:
    Metrics:
      mIoU: 56.01
  Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-5cc76a78.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-7120c214.pth
--- a/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py
+++ b/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py
@ -41,65 +41,58 @@ model = dict(
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
-            encoder=dict(
-                type='mmdet.DetrTransformerEncoder',
+            encoder=dict(  # DeformableDetrTransformerEncoder
                num_layers=6,
-                transformerlayers=dict(
-                    type='mmdet.BaseTransformerLayer',
-                    attn_cfgs=dict(
-                        type='mmdet.MultiScaleDeformableAttention',
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                        embed_dims=256,
                        num_heads=8,
                        num_levels=3,
                        num_points=4,
                        im2col_step=64,
                        dropout=0.0,
-                        batch_first=False,
+                        batch_first=True,
                        norm_cfg=None,
                        init_cfg=None),
-                    ffn_cfgs=dict(
-                        type='FFN',
+                    ffn_cfg=dict(
                        embed_dims=256,
                        feedforward_channels=1024,
                        num_fcs=2,
                        ffn_drop=0.0,
-                        act_cfg=dict(type='ReLU', inplace=True)),
-                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                        act_cfg=dict(type='ReLU', inplace=True))),
                init_cfg=None),
-            positional_encoding=dict(
-                type='mmdet.SinePositionalEncoding',
-                num_feats=128,
-                normalize=True),
+            positional_encoding=dict(  # SinePositionalEncoding
+                num_feats=128, normalize=True),
            init_cfg=None),
        enforce_decoder_input_project=False,
-        positional_encoding=dict(
-            type='mmdet.SinePositionalEncoding', num_feats=128,
-            normalize=True),
-        transformer_decoder=dict(
-            type='mmdet.DetrTransformerDecoder',
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
            return_intermediate=True,
            num_layers=9,
-            transformerlayers=dict(
-                type='mmdet.DetrTransformerDecoderLayer',
-                attn_cfgs=dict(
-                    type='mmdet.MultiheadAttention',
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.0,
                    proj_drop=0.0,
                    dropout_layer=None,
-                    batch_first=False),
-                ffn_cfgs=dict(
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.0,
+                    proj_drop=0.0,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.0,
                    dropout_layer=None,
-                    add_identity=True),
-                feedforward_channels=2048,
-                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
-                                 'ffn', 'norm')),
+                    add_identity=True)),
            init_cfg=None),
        loss_cls=dict(
            type='mmdet.CrossEntropyLoss',
--- a/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
+++ b/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
@ -41,65 +41,58 @@ model = dict(
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
-            encoder=dict(
-                type='mmdet.DetrTransformerEncoder',
+            encoder=dict(  # DeformableDetrTransformerEncoder
                num_layers=6,
-                transformerlayers=dict(
-                    type='mmdet.BaseTransformerLayer',
-                    attn_cfgs=dict(
-                        type='mmdet.MultiScaleDeformableAttention',
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                        embed_dims=256,
                        num_heads=8,
                        num_levels=3,
                        num_points=4,
                        im2col_step=64,
                        dropout=0.0,
-                        batch_first=False,
+                        batch_first=True,
                        norm_cfg=None,
                        init_cfg=None),
-                    ffn_cfgs=dict(
-                        type='FFN',
+                    ffn_cfg=dict(
                        embed_dims=256,
                        feedforward_channels=1024,
                        num_fcs=2,
                        ffn_drop=0.0,
-                        act_cfg=dict(type='ReLU', inplace=True)),
-                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                        act_cfg=dict(type='ReLU', inplace=True))),
                init_cfg=None),
-            positional_encoding=dict(
-                type='mmdet.SinePositionalEncoding',
-                num_feats=128,
-                normalize=True),
+            positional_encoding=dict(  # SinePositionalEncoding
+                num_feats=128, normalize=True),
            init_cfg=None),
        enforce_decoder_input_project=False,
-        positional_encoding=dict(
-            type='mmdet.SinePositionalEncoding', num_feats=128,
-            normalize=True),
-        transformer_decoder=dict(
-            type='mmdet.DetrTransformerDecoder',
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
            return_intermediate=True,
            num_layers=9,
-            transformerlayers=dict(
-                type='mmdet.DetrTransformerDecoderLayer',
-                attn_cfgs=dict(
-                    type='mmdet.MultiheadAttention',
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.0,
                    proj_drop=0.0,
                    dropout_layer=None,
-                    batch_first=False),
-                ffn_cfgs=dict(
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.0,
+                    proj_drop=0.0,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.0,
                    dropout_layer=None,
-                    add_identity=True),
-                feedforward_channels=2048,
-                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
-                                 'ffn', 'norm')),
+                    add_identity=True)),
            init_cfg=None),
        loss_cls=dict(
            type='mmdet.CrossEntropyLoss',
--- a/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py
+++ b/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py
@ -53,65 +53,58 @@ model = dict(
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
-            encoder=dict(
-                type='mmdet.DetrTransformerEncoder',
+            encoder=dict(  # DeformableDetrTransformerEncoder
                num_layers=6,
-                transformerlayers=dict(
-                    type='mmdet.BaseTransformerLayer',
-                    attn_cfgs=dict(
-                        type='mmdet.MultiScaleDeformableAttention',
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                        embed_dims=256,
                        num_heads=8,
                        num_levels=3,
                        num_points=4,
                        im2col_step=64,
                        dropout=0.0,
-                        batch_first=False,
+                        batch_first=True,
                        norm_cfg=None,
                        init_cfg=None),
-                    ffn_cfgs=dict(
-                        type='FFN',
+                    ffn_cfg=dict(
                        embed_dims=256,
                        feedforward_channels=1024,
                        num_fcs=2,
                        ffn_drop=0.0,
-                        act_cfg=dict(type='ReLU', inplace=True)),
-                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                        act_cfg=dict(type='ReLU', inplace=True))),
                init_cfg=None),
-            positional_encoding=dict(
-                type='mmdet.SinePositionalEncoding',
-                num_feats=128,
-                normalize=True),
+            positional_encoding=dict(  # SinePositionalEncoding
+                num_feats=128, normalize=True),
            init_cfg=None),
        enforce_decoder_input_project=False,
-        positional_encoding=dict(
-            type='mmdet.SinePositionalEncoding', num_feats=128,
-            normalize=True),
-        transformer_decoder=dict(
-            type='mmdet.DetrTransformerDecoder',
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
            return_intermediate=True,
            num_layers=9,
-            transformerlayers=dict(
-                type='mmdet.DetrTransformerDecoderLayer',
-                attn_cfgs=dict(
-                    type='mmdet.MultiheadAttention',
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.0,
                    proj_drop=0.0,
                    dropout_layer=None,
-                    batch_first=False),
-                ffn_cfgs=dict(
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.0,
+                    proj_drop=0.0,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.0,
                    dropout_layer=None,
-                    add_identity=True),
-                feedforward_channels=2048,
-                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
-                                 'ffn', 'norm')),
+                    add_identity=True)),
            init_cfg=None),
        loss_cls=dict(
            type='mmdet.CrossEntropyLoss',
--- a/configs/maskformer/README.md
+++ b/configs/maskformer/README.md
@ -47,10 +47,10 @@ pip install "mmdet>=3.0.0rc4"

 | Method     | Backbone  | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU  | mIoU(ms+flip) | config                                                                                                                                       | download                                                                                                                                                                                                                                                                                                                                                                                                     |
 | ---------- | --------- | --------- | ------- | -------- | -------------- | ----- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| MaskFormer | R-50-D32  | 512x512   | 160000  | 3.29     | 42.20          | 44.29 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py)        | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-cbd39cc1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724.json)                             |
-| MaskFormer | R-101-D32 | 512x512   | 160000  | 4.12     | 34.90          | 45.11 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py)       | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-c8e0931d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053.json)                         |
-| MaskFormer | Swin-T    | 512x512   | 160000  | 3.73     | 40.53          | 46.69 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-03550716.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813.json) |
-| MaskFormer | Swin-S    | 512x512   | 160000  | 5.33     | 26.98          | 49.36 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-5ab67e58.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710.json) |
+| MaskFormer | R-50-D32  | 512x512   | 160000  | 3.29     | 42.20          | 44.29 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py)        | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-3a9cfe45.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724.json)                             |
+| MaskFormer | R-101-D32 | 512x512   | 160000  | 4.12     | 34.90          | 45.11 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py)       | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-84adbfcb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053.json)                         |
+| MaskFormer | Swin-T    | 512x512   | 160000  | 3.73     | 40.53          | 46.69 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-f14e7ce0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813.json) |
+| MaskFormer | Swin-S    | 512x512   | 160000  | 5.33     | 26.98          | 49.36 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-723512c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710.json) |

 Note:

--- a/configs/maskformer/maskformer.yml
+++ b/configs/maskformer/maskformer.yml
@ -35,7 +35,7 @@ Models:
    Metrics:
      mIoU: 44.29
  Config: configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-cbd39cc1.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-3a9cfe45.pth
 - Name: maskformer_r101-d32_8xb2-160k_ade20k-512x512
  In Collection: MaskFormer
  Metadata:
@ -56,7 +56,7 @@ Models:
    Metrics:
      mIoU: 45.11
  Config: configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-c8e0931d.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-84adbfcb.pth
 - Name: maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512
  In Collection: MaskFormer
  Metadata:
@ -77,7 +77,7 @@ Models:
    Metrics:
      mIoU: 46.69
  Config: configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-03550716.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-f14e7ce0.pth
 - Name: maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512
  In Collection: MaskFormer
  Metadata:
@ -98,4 +98,4 @@ Models:
    Metrics:
      mIoU: 49.36
  Config: configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-5ab67e58.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-723512c7.pth
--- a/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
+++ b/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
@ -43,36 +43,34 @@ model = dict(
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU')),
        enforce_decoder_input_project=False,
-        positional_encoding=dict(
-            type='mmdet.SinePositionalEncoding', num_feats=128,
-            normalize=True),
-        transformer_decoder=dict(
-            type='mmdet.DetrTransformerDecoder',
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # DetrTransformerDecoder
            return_intermediate=True,
            num_layers=6,
-            transformerlayers=dict(
-                type='mmdet.DetrTransformerDecoderLayer',
-                attn_cfgs=dict(
-                    type='mmdet.MultiheadAttention',
+            layer_cfg=dict(  # DetrTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.1,
                    proj_drop=0.1,
                    dropout_layer=None,
-                    batch_first=False),
-                ffn_cfgs=dict(
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.1,
+                    proj_drop=0.1,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.1,
                    dropout_layer=None,
-                    add_identity=True),
-                # the following parameter was not used,
-                # just make current api happy
-                feedforward_channels=2048,
-                operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
-                                 'ffn', 'norm')),
+                    add_identity=True)),
            init_cfg=None),
        loss_cls=dict(
            type='mmdet.CrossEntropyLoss',
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -1,7 +1,7 @@
 ARG PYTORCH="1.11.0"
 ARG CUDA="11.3"
 ARG CUDNN="8"
-ARG MMCV="2.0.0rc3"
+ARG MMCV="2.0.0rc4"

 FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

--- a/docker/serve/Dockerfile
+++ b/docker/serve/Dockerfile
@ -3,8 +3,8 @@ ARG CUDA="11.3"
 ARG CUDNN="8"
 FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

-ARG MMCV="2.0.0rc3"
-ARG MMSEG="1.0.0rc4"
+ARG MMCV="2.0.0rc4"
+ARG MMSEG="1.0.0rc5"

 ENV PYTHONUNBUFFERED TRUE

--- a/docs/en/migration/interface.md
+++ b/docs/en/migration/interface.md
@ -237,7 +237,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
    dict(
        type='TestTimeAug',
        transforms=[
--- a/docs/en/notes/changelog.md
+++ b/docs/en/notes/changelog.md
@ -1,5 +1,15 @@
 # Changelog of v1.x

+## v1.0.0rc5(02/01/2023)
+
+### Bug fix
+
+- Fix MaskFormer and Mask2Former when install mmdet from source ([#2532](https://github.com/open-mmlab/mmsegmentation/pull/2532))
+- Support new fileio interface in `MMCV>=2.0.0rc4` ([#2543](https://github.com/open-mmlab/mmsegmentation/pull/2543))
+- Fix ERFNet URL in dev-1.x branch ([#2537](https://github.com/open-mmlab/mmsegmentation/pull/2537))
+- Fix misleading `List[Tensor]` types ([#2546](https://github.com/open-mmlab/mmsegmentation/pull/2546))
+- Rename typing.py to typing_utils.py ([#2548](https://github.com/open-mmlab/mmsegmentation/pull/2548))
+
 ## v1.0.0rc4(01/30/2023)

 ### Highlights
--- a/docs/en/notes/faq.md
+++ b/docs/en/notes/faq.md
@ -4,37 +4,20 @@ We list some common troubles faced by many users and their corresponding solutio

 ## Installation

-The compatible MMSegmentation and MMCV versions are as below. Please install the correct version of MMCV to avoid installation issues.
+The compatible MMSegmentation, MMCV and MMEngine versions are as below. Please install the correct versions of them to avoid installation issues.

-| MMSegmentation version |          MMCV version          | MMClassification (optional) version | MMDetection (optional) version |
-| :--------------------: | :----------------------------: | :---------------------------------: | :----------------------------: |
-|   1.x/dev-1.x branch   |        mmcv == 2.0.0rc3        |           mmcls>=1.0.0rc0           | mmdet>=3.0.0rc4, \<=3.0.0rc5>  |
-|        1.0.0rc4        |        mmcv == 2.0.0rc3        |           mmcls>=1.0.0rc0           | mmdet>=3.0.0rc4, \<=3.0.0rc5>  |
-|        1.0.0rc3        |        mmcv == 2.0.0rc3        |           mmcls>=1.0.0rc0           | mmdet>=3.0.0rc4  \<=3.0.0rc5>  |
-|        1.0.0rc2        |        mmcv == 2.0.0rc3        |           mmcls>=1.0.0rc0           | mmdet>=3.0.0rc4  \<=3.0.0rc5>  |
-|        1.0.0rc1        | mmcv >= 2.0.0rc1, \<=2.0.0rc3> |           mmcls>=1.0.0rc0           |          Not required          |
-|        1.0.0rc0        | mmcv >= 2.0.0rc1, \<=2.0.0rc3> |           mmcls>=1.0.0rc0           |          Not required          |
-|         master         |   mmcv-full>=1.4.4, \<=1.6.0   |       mmcls>=0.20.1, \<=1.0.0       |          Not required          |
-|         0.24.1         |   mmcv-full>=1.4.4, \<=1.6.0   |       mmcls>=0.20.1, \<=1.0.0       |          Not required          |
-|         0.23.0         |   mmcv-full>=1.4.4, \<=1.6.0   |       mmcls>=0.20.1, \<=1.0.0       |          Not required          |
-|         0.22.0         |   mmcv-full>=1.4.4, \<=1.6.0   |       mmcls>=0.20.1, \<=1.0.0       |          Not required          |
-|         0.21.1         |   mmcv-full>=1.4.4, \<=1.6.0   |            Not required             |          Not required          |
-|         0.20.2         |  mmcv-full>=1.3.13, \<=1.6.0   |            Not required             |          Not required          |
-|         0.19.0         |  mmcv-full>=1.3.13, \<1.3.17   |            Not required             |          Not required          |
-|         0.18.0         |  mmcv-full>=1.3.13, \<1.3.17   |            Not required             |          Not required          |
-|         0.17.0         |   mmcv-full>=1.3.7, \<1.3.17   |            Not required             |          Not required          |
-|         0.16.0         |   mmcv-full>=1.3.7, \<1.3.17   |            Not required             |          Not required          |
-|         0.15.0         |   mmcv-full>=1.3.7, \<1.3.17   |            Not required             |          Not required          |
-|         0.14.1         |   mmcv-full>=1.3.7, \<1.3.17   |            Not required             |          Not required          |
-|         0.14.0         |   mmcv-full>=1.3.1, \<1.3.2    |            Not required             |          Not required          |
-|         0.13.0         |   mmcv-full>=1.3.1, \<1.3.2    |            Not required             |          Not required          |
-|         0.12.0         |   mmcv-full>=1.1.4, \<1.3.2    |            Not required             |          Not required          |
-|         0.11.0         |   mmcv-full>=1.1.4, \<1.3.0    |            Not required             |          Not required          |
-|         0.10.0         |   mmcv-full>=1.1.4, \<1.3.0    |            Not required             |          Not required          |
-|         0.9.0          |   mmcv-full>=1.1.4, \<1.3.0    |            Not required             |          Not required          |
-|         0.8.0          |   mmcv-full>=1.1.4, \<1.2.0    |            Not required             |          Not required          |
-|         0.7.0          |   mmcv-full>=1.1.2, \<1.2.0    |            Not required             |          Not required          |
-|         0.6.0          |   mmcv-full>=1.1.2, \<1.2.0    |            Not required             |          Not required          |
+| MMSegmentation version |          MMCV version          | MMEngine version  | MMClassification (optional) version | MMDetection (optional) version |
+| :--------------------: | :----------------------------: | :---------------: | :---------------------------------: | :----------------------------: |
+|     dev-1.x branch     |        mmcv >= 2.0.0rc4        | MMEngine >= 0.2.0 |           mmcls>=1.0.0rc0           |         mmdet>3.0.0rc5         |
+|       1.x branch       |        mmcv >= 2.0.0rc4        | MMEngine >= 0.2.0 |           mmcls>=1.0.0rc0           |         mmdet>3.0.0rc5         |
+|        1.0.0rc5        |        mmcv >= 2.0.0rc4        | MMEngine >= 0.2.0 |           mmcls>=1.0.0rc0           |         mmdet>3.0.0rc5         |
+|        1.0.0rc4        |        mmcv == 2.0.0rc3        | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |  mmdet>=3.0.0rc4, \<=3.0.0rc5  |
+|        1.0.0rc3        |        mmcv == 2.0.0rc3        | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |  mmdet>=3.0.0rc4  \<=3.0.0rc5  |
+|        1.0.0rc2        |        mmcv == 2.0.0rc3        | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |  mmdet>=3.0.0rc4  \<=3.0.0rc5  |
+|        1.0.0rc1        | mmcv >= 2.0.0rc1, \<=2.0.0rc3> | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |          Not required          |
+|        1.0.0rc0        | mmcv >= 2.0.0rc1, \<=2.0.0rc3> | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |          Not required          |
+
+Notes: To install MMSegmentation 0.x and master branch, please refer to [the faq 0.x document](https://mmsegmentation.readthedocs.io/en/latest/faq.html#installation) to check compatible versions of MMCV.

 ## How to know the number of GPUs needed to train the model

--- a/mmseg/init.py
+++ b/mmseg/init.py
@ -7,9 +7,9 @@ from packaging.version import parse

 from .version import __version__, version_info

-MMCV_MIN = '2.0.0rc3'
-MMCV_MAX = '2.0.0rc3'
-MMENGINE_MIN = '0.1.0'
+MMCV_MIN = '2.0.0rc4'
+MMCV_MAX = '2.1.0'
+MMENGINE_MIN = '0.2.0'
 MMENGINE_MAX = '1.0.0'


@ -58,9 +58,9 @@ mmcv_max_version = digit_version(MMCV_MAX)
 mmcv_version = digit_version(mmcv.__version__)


-assert (mmcv_min_version <= mmcv_version <= mmcv_max_version), \
+assert (mmcv_min_version <= mmcv_version < mmcv_max_version), \
    f'MMCV=={mmcv.__version__} is used but incompatible. ' \
-    f'Please install mmcv==2.0.0rc3.'
+    f'Please install mmcv>=2.0.0rc4.'

 mmengine_min_version = digit_version(MMENGINE_MIN)
 mmengine_max_version = digit_version(MMENGINE_MAX)
--- a/mmseg/datasets/basesegdataset.py
+++ b/mmseg/datasets/basesegdataset.py
@ -4,6 +4,7 @@ import os.path as osp
 from typing import Callable, Dict, List, Optional, Sequence, Union

 import mmengine
+import mmengine.fileio as fileio
 import numpy as np
 from mmengine.dataset import BaseDataset, Compose

@ -72,9 +73,10 @@ class BaseSegDataset(BaseDataset):
        ignore_index (int): The label index to be ignored. Default: 255
        reduce_zero_label (bool): Whether to mark label zero as ignored.
            Default to False.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:`mmengine.fileio.FileClient` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.htm
+            for details. Defaults to ``dict(backend='local')``
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
    """
    METAINFO: dict = dict()

@ -95,16 +97,14 @@ class BaseSegDataset(BaseDataset):
        max_refetch: int = 1000,
        ignore_index: int = 255,
        reduce_zero_label: bool = False,
-        file_client_args: dict = dict(backend='disk')
+        backend_args: dict = dict(backend='local')
    ) -> None:

        self.img_suffix = img_suffix
        self.seg_map_suffix = seg_map_suffix
        self.ignore_index = ignore_index
        self.reduce_zero_label = reduce_zero_label
-        self.file_client_args = file_client_args
-        self.file_client = mmengine.FileClient.infer_client(
-            self.file_client_args)
+        self.backend_args = backend_args.copy()

        self.data_root = data_root
        self.data_prefix = copy.copy(data_prefix)
@ -239,7 +239,7 @@ class BaseSegDataset(BaseDataset):
        ann_dir = self.data_prefix.get('seg_map_path', None)
        if osp.isfile(self.ann_file):
            lines = mmengine.list_from_file(
-                self.ann_file, file_client_args=self.file_client_args)
+                self.ann_file, backend_args=self.backend_args)
            for line in lines:
                img_name = line.strip()
                data_info = dict(
@ -252,11 +252,12 @@ class BaseSegDataset(BaseDataset):
                data_info['seg_fields'] = []
                data_list.append(data_info)
        else:
-            for img in self.file_client.list_dir_or_file(
+            for img in fileio.list_dir_or_file(
                    dir_path=img_dir,
                    list_dir=False,
                    suffix=self.img_suffix,
-                    recursive=True):
+                    recursive=True,
+                    backend_args=self.backend_args):
                data_info = dict(img_path=osp.join(img_dir, img))
                if ann_dir is not None:
                    seg_map = img.replace(self.img_suffix, self.seg_map_suffix)
--- a/mmseg/datasets/isaid.py
+++ b/mmseg/datasets/isaid.py
@ -1,4 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import mmengine.fileio as fileio
+
 from mmseg.registry import DATASETS
 from .basesegdataset import BaseSegDataset

@ -33,4 +35,5 @@ class iSAIDDataset(BaseSegDataset):
            seg_map_suffix=seg_map_suffix,
            ignore_index=ignore_index,
            **kwargs)
-        assert self.file_client.exists(self.data_prefix['img_path'])
+        assert fileio.exists(
+            self.data_prefix['img_path'], backend_args=self.backend_args)
--- a/mmseg/datasets/transforms/loading.py
+++ b/mmseg/datasets/transforms/loading.py
@ -3,7 +3,7 @@ import warnings
 from typing import Dict

 import mmcv
-import mmengine
+import mmengine.fileio as fileio
 import numpy as np
 from mmcv.transforms import BaseTransform
 from mmcv.transforms import LoadAnnotations as MMCV_LoadAnnotations
@ -54,15 +54,16 @@ class LoadAnnotations(MMCV_LoadAnnotations):
            argument for :func:``mmcv.imfrombytes``.
            See :fun:``mmcv.imfrombytes`` for details.
            Defaults to 'pillow'.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:``mmcv.fileio.FileClient`` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.htm
+            for details. Defaults to ``dict(backend='local')``
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
    """

    def __init__(
        self,
        reduce_zero_label=None,
-        file_client_args=dict(backend='disk'),
+        backend_args=dict(backend='local'),
        imdecode_backend='pillow',
    ) -> None:
        super().__init__(
@ -71,14 +72,13 @@ class LoadAnnotations(MMCV_LoadAnnotations):
            with_seg=True,
            with_keypoints=False,
            imdecode_backend=imdecode_backend,
-            file_client_args=file_client_args)
+            backend_args=backend_args)
        self.reduce_zero_label = reduce_zero_label
        if self.reduce_zero_label is not None:
            warnings.warn('`reduce_zero_label` will be deprecated, '
                          'if you would like to ignore the zero label, please '
                          'set `reduce_zero_label=True` when dataset '
                          'initialized')
-        self.file_client_args = file_client_args.copy()
        self.imdecode_backend = imdecode_backend

    def _load_seg_map(self, results: dict) -> None:
@ -91,7 +91,8 @@ class LoadAnnotations(MMCV_LoadAnnotations):
            dict: The dict contains loaded semantic segmentation annotations.
        """

-        img_bytes = self.file_client.get(results['seg_map_path'])
+        img_bytes = fileio.get(
+            results['seg_map_path'], backend_args=self.backend_args)
        gt_semantic_seg = mmcv.imfrombytes(
            img_bytes, flag='unchanged',
            backend=self.imdecode_backend).squeeze().astype(np.uint8)
@ -121,9 +122,9 @@ class LoadAnnotations(MMCV_LoadAnnotations):

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
-        repr_str += f'(reduce_zero_label={self.reduce_zero_label},'
-        repr_str += f"imdecode_backend='{self.imdecode_backend}')"
-        repr_str += f'file_client_args={self.file_client_args})'
+        repr_str += f'(reduce_zero_label={self.reduce_zero_label}, '
+        repr_str += f"imdecode_backend='{self.imdecode_backend}', "
+        repr_str += f'backend_args={self.backend_args})'
        return repr_str


@ -202,9 +203,10 @@ class LoadBiomedicalImageFromFile(BaseTransform):
        to_float32 (bool): Whether to convert the loaded image to a float32
            numpy array. If set to False, the loaded image is an float64 array.
            Defaults to True.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:`mmengine.fileio.FileClient` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.htm
+            for details. Defaults to ``dict(backend='local')``
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
    """

    def __init__(
@ -212,13 +214,12 @@ class LoadBiomedicalImageFromFile(BaseTransform):
        decode_backend: str = 'nifti',
        to_xyz: bool = False,
        to_float32: bool = True,
-        file_client_args: dict = dict(backend='disk')
+        backend_args: dict = dict(backend='local')
    ) -> None:
        self.decode_backend = decode_backend
        self.to_xyz = to_xyz
        self.to_float32 = to_float32
-        self.file_client_args = file_client_args.copy()
-        self.file_client = mmengine.FileClient(**self.file_client_args)
+        self.backend_args = backend_args.copy()

    def transform(self, results: Dict) -> Dict:
        """Functions to load image.
@ -232,7 +233,7 @@ class LoadBiomedicalImageFromFile(BaseTransform):

        filename = results['img_path']

-        data_bytes = self.file_client.get(filename)
+        data_bytes = fileio.get(filename, self.backend_args)
        img = datafrombytes(data_bytes, backend=self.decode_backend)

        if self.to_float32:
@ -257,7 +258,7 @@ class LoadBiomedicalImageFromFile(BaseTransform):
                    f"decode_backend='{self.decode_backend}', "
                    f'to_xyz={self.to_xyz}, '
                    f'to_float32={self.to_float32}, '
-                    f'file_client_args={self.file_client_args})')
+                    f'backend_args={self.backend_args})')
        return repr_str


@ -294,9 +295,10 @@ class LoadBiomedicalAnnotation(BaseTransform):
        to_float32 (bool): Whether to convert the loaded seg map to a float32
            numpy array. If set to False, the loaded image is an float64 array.
            Defaults to True.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:`mmengine.fileio.FileClient` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See :class:`mmengine.fileio` for details.
+            Defaults to ``dict(backend='local')``.
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
    """

    def __init__(
@ -304,14 +306,13 @@ class LoadBiomedicalAnnotation(BaseTransform):
        decode_backend: str = 'nifti',
        to_xyz: bool = False,
        to_float32: bool = True,
-        file_client_args: dict = dict(backend='disk')
+        backend_args: dict = dict(backend='local')
    ) -> None:
        super().__init__()
        self.decode_backend = decode_backend
        self.to_xyz = to_xyz
        self.to_float32 = to_float32
-        self.file_client_args = file_client_args.copy()
-        self.file_client = mmengine.FileClient(**self.file_client_args)
+        self.backend_args = backend_args.copy()

    def transform(self, results: Dict) -> Dict:
        """Functions to load image.
@ -322,7 +323,7 @@ class LoadBiomedicalAnnotation(BaseTransform):
        Returns:
            dict: The dict contains loaded image and meta information.
        """
-        data_bytes = self.file_client.get(results['seg_map_path'])
+        data_bytes = fileio.get(results['seg_map_path'], self.backend_args)
        gt_seg_map = datafrombytes(data_bytes, backend=self.decode_backend)

        if self.to_float32:
@ -342,7 +343,7 @@ class LoadBiomedicalAnnotation(BaseTransform):
                    f"decode_backend='{self.decode_backend}', "
                    f'to_xyz={self.to_xyz}, '
                    f'to_float32={self.to_float32}, '
-                    f'file_client_args={self.file_client_args})')
+                    f'backend_args={self.backend_args})')
        return repr_str


@ -383,9 +384,10 @@ class LoadBiomedicalData(BaseTransform):
            backend is 'nifti'. Defaults to 'nifti'.
        to_xyz (bool): Whether transpose data from Z, Y, X to X, Y, Z.
            Defaults to False.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:`mmengine.fileio.FileClient` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.htm
+            for details. Defaults to ``dict(backend='local')``
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
    """

    def __init__(
@ -393,13 +395,12 @@ class LoadBiomedicalData(BaseTransform):
        with_seg=False,
        decode_backend: str = 'numpy',
        to_xyz: bool = False,
-        file_client_args: dict = dict(backend='disk')
-    ) -> None:
+        backend_args: dict = dict(backend='local')
+    ) -> None:  # noqa
        self.with_seg = with_seg
        self.decode_backend = decode_backend
        self.to_xyz = to_xyz
-        self.file_client_args = file_client_args.copy()
-        self.file_client = mmengine.FileClient(**self.file_client_args)
+        self.backend_args = backend_args.copy()

    def transform(self, results: Dict) -> Dict:
        """Functions to load image.
@ -410,7 +411,7 @@ class LoadBiomedicalData(BaseTransform):
        Returns:
            dict: The dict contains loaded image and meta information.
        """
-        data_bytes = self.file_client.get(results['img_path'])
+        data_bytes = fileio.get(results['img_path'], self.backend_args)
        data = datafrombytes(data_bytes, backend=self.decode_backend)
        # img is 4D data (N, X, Y, Z), N is the number of protocol
        img = data[:-1, :]
@ -440,5 +441,5 @@ class LoadBiomedicalData(BaseTransform):
                    f'with_seg={self.with_seg}, '
                    f"decode_backend='{self.decode_backend}', "
                    f'to_xyz={self.to_xyz}, '
-                    f'file_client_args={self.file_client_args})')
+                    f'backend_args={self.backend_args})')
        return repr_str
--- a/mmseg/datasets/voc.py
+++ b/mmseg/datasets/voc.py
@ -1,6 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os.path as osp

+import mmengine.fileio as fileio
+
 from mmseg.registry import DATASETS
 from .basesegdataset import BaseSegDataset

@ -34,5 +36,5 @@ class PascalVOCDataset(BaseSegDataset):
            seg_map_suffix=seg_map_suffix,
            ann_file=ann_file,
            **kwargs)
-        assert self.file_client.exists(
-            self.data_prefix['img_path']) and osp.isfile(self.ann_file)
+        assert fileio.exists(self.data_prefix['img_path'],
+                             self.backend_args) and osp.isfile(self.ann_file)
--- a/mmseg/engine/hooks/visualization_hook.py
+++ b/mmseg/engine/hooks/visualization_hook.py
@ -4,7 +4,7 @@ import warnings
 from typing import Sequence

 import mmcv
-from mmengine.fileio import FileClient
+import mmengine.fileio as fileio
 from mmengine.hooks import Hook
 from mmengine.runner import Runner

@ -30,9 +30,10 @@ class SegVisualizationHook(Hook):
        interval (int): The interval of visualization. Defaults to 50.
        show (bool): Whether to display the drawn image. Default to False.
        wait_time (float): The interval of show (s). Defaults to 0.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:`mmengine.fileio.FileClient` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.htm
+            for details. Defaults to ``dict(backend='local')``
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
    """

    def __init__(self,
@ -40,7 +41,7 @@ class SegVisualizationHook(Hook):
                 interval: int = 50,
                 show: bool = False,
                 wait_time: float = 0.,
-                 file_client_args: dict = dict(backend='disk')):
+                 backend_args: dict = dict(backend='local')):
        self._visualizer: SegLocalVisualizer = \
            SegLocalVisualizer.get_current_instance()
        self.interval = interval
@ -54,8 +55,7 @@ class SegVisualizationHook(Hook):
                          'needs to be excluded.')

        self.wait_time = wait_time
-        self.file_client_args = file_client_args.copy()
-        self.file_client = None
+        self.backend_args = backend_args.copy()
        self.draw = draw
        if not self.draw:
            warnings.warn('The draw is False, it means that the '
@ -81,13 +81,11 @@ class SegVisualizationHook(Hook):
        if self.draw is False or mode == 'train':
            return

-        if self.file_client is None:
-            self.file_client = FileClient(**self.file_client_args)
-
        if self.every_n_inner_iters(batch_idx, self.interval):
            for output in outputs:
                img_path = output.img_path
-                img_bytes = self.file_client.get(img_path)
+                img_bytes = fileio.get(
+                    img_path, backend_args=self.backend_args)
                img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
                window_name = f'{mode}_{osp.basename(img_path)}'

--- a/mmseg/models/decode_heads/decode_head.py
+++ b/mmseg/models/decode_heads/decode_head.py
@ -263,7 +263,7 @@ class BaseDecodeHead(BaseModule, metaclass=ABCMeta):
        return losses

    def predict(self, inputs: Tuple[Tensor], batch_img_metas: List[dict],
-                test_cfg: ConfigType) -> List[Tensor]:
+                test_cfg: ConfigType) -> Tensor:
        """Forward function for prediction.

        Args:
@ -276,7 +276,7 @@ class BaseDecodeHead(BaseModule, metaclass=ABCMeta):
            test_cfg (dict): The testing config.

        Returns:
-            List[Tensor]: Outputs segmentation logits map.
+            Tensor: Outputs segmentation logits map.
        """
        seg_logits = self.forward(inputs)

--- a/mmseg/models/segmentors/base.py
+++ b/mmseg/models/segmentors/base.py
@ -126,7 +126,7 @@ class BaseSegmentor(BaseModel, metaclass=ABCMeta):

    def postprocess_result(self,
                           seg_logits: Tensor,
-                           data_samples: OptSampleList = None) -> list:
+                           data_samples: OptSampleList = None) -> SampleList:
        """ Convert results list to `SegDataSample`.
        Args:
            seg_logits (Tensor): The segmentation results, seg_logits from
--- a/mmseg/models/segmentors/cascade_encoder_decoder.py
+++ b/mmseg/models/segmentors/cascade_encoder_decoder.py
@ -70,7 +70,7 @@ class CascadeEncoderDecoder(EncoderDecoder):
        self.num_classes = self.decode_head[-1].num_classes

    def encode_decode(self, inputs: Tensor,
-                      batch_img_metas: List[dict]) -> List[Tensor]:
+                      batch_img_metas: List[dict]) -> Tensor:
        """Encode images with backbone and decode into a semantic segmentation
        map of the same size as input."""
        x = self.extract_feat(inputs)
--- a/mmseg/models/segmentors/encoder_decoder.py
+++ b/mmseg/models/segmentors/encoder_decoder.py
@ -120,7 +120,7 @@ class EncoderDecoder(BaseSegmentor):
        return x

    def encode_decode(self, inputs: Tensor,
-                      batch_img_metas: List[dict]) -> List[Tensor]:
+                      batch_img_metas: List[dict]) -> Tensor:
        """Encode images with backbone and decode into a semantic segmentation
        map of the same size as input."""
        x = self.extract_feat(inputs)
--- a/mmseg/utils/init.py
+++ b/mmseg/utils/init.py
@ -13,9 +13,9 @@ from .collect_env import collect_env
 from .io import datafrombytes
 from .misc import add_prefix, stack_batch
 from .set_env import register_all_modules
-from .typing import (ConfigType, ForwardResults, MultiConfig, OptConfigType,
-                     OptMultiConfig, OptSampleList, SampleList, TensorDict,
-                     TensorList)
+from .typing_utils import (ConfigType, ForwardResults, MultiConfig,
+                           OptConfigType, OptMultiConfig, OptSampleList,
+                           SampleList, TensorDict, TensorList)

 __all__ = [
    'collect_env', 'register_all_modules', 'stack_batch', 'add_prefix',
--- a/mmseg/utils/misc.py
+++ b/mmseg/utils/misc.py
@ -5,7 +5,7 @@ import numpy as np
 import torch
 import torch.nn.functional as F

-from .typing import SampleList
+from .typing_utils import SampleList


 def add_prefix(inputs, prefix):
--- a/mmseg/utils/typing_utils.py
+++ b/mmseg/utils/typing_utils.py
--- a/mmseg/version.py
+++ b/mmseg/version.py
@ -1,6 +1,6 @@
 # Copyright (c) Open-MMLab. All rights reserved.

-__version__ = '1.0.0rc4'
+__version__ = '1.0.0rc5'


 def parse_version_info(version_str):
--- a/requirements/mminstall.txt
+++ b/requirements/mminstall.txt
@ -1,4 +1,4 @@
 mmcls>=1.0.0rc0
-mmcv==2.0.0rc3
-mmdet==3.0.0rc5
-mmengine>=0.1.0,<1.0.0
+mmcv>=2.0.0rc4
+-e git+https://github.com/open-mmlab/mmdetection.git@dev-3.x#egg=mmdet
+mmengine>=0.2.0,<1.0.0
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@ -2,7 +2,6 @@
 import os
 import os.path as osp
 import tempfile
-from unittest.mock import MagicMock

 import pytest

@ -300,17 +299,15 @@ def test_lip():
 def test_custom_classes_override_default(dataset, classes):

    dataset_class = DATASETS.get(dataset)
-    if isinstance(dataset_class, PascalVOCDataset):
-        tmp_file = tempfile.NamedTemporaryFile()
-        ann_file = f'{tmp_file.name}.txt'
-    else:
-        ann_file = MagicMock()
-
    original_classes = dataset_class.METAINFO.get('classes', None)

+    tmp_file = tempfile.NamedTemporaryFile()
+    ann_file = tmp_file.name
+    img_path = tempfile.mkdtemp()
+
    # Test setting classes as a tuple
    custom_dataset = dataset_class(
-        data_prefix=dict(img_path=MagicMock()),
+        data_prefix=dict(img_path=img_path),
        ann_file=ann_file,
        metainfo=dict(classes=classes),
        test_mode=True,
@ -323,7 +320,7 @@ def test_custom_classes_override_default(dataset, classes):

    # Test setting classes as a list
    custom_dataset = dataset_class(
-        data_prefix=dict(img_path=MagicMock()),
+        data_prefix=dict(img_path=img_path),
        ann_file=ann_file,
        metainfo=dict(classes=list(classes)),
        test_mode=True,
@ -337,7 +334,7 @@ def test_custom_classes_override_default(dataset, classes):
    # Test overriding not a subset
    custom_dataset = dataset_class(
        ann_file=ann_file,
-        data_prefix=dict(img_path=MagicMock()),
+        data_prefix=dict(img_path=img_path),
        metainfo=dict(classes=[classes[0]]),
        test_mode=True,
        lazy_init=True)
@ -352,13 +349,13 @@ def test_custom_classes_override_default(dataset, classes):
        with pytest.raises(AssertionError):
            custom_dataset = dataset_class(
                ann_file=ann_file,
-                data_prefix=dict(img_path=MagicMock()),
+                data_prefix=dict(img_path=img_path),
                metainfo=None,
                test_mode=True,
                lazy_init=True)
    else:
        custom_dataset = dataset_class(
-            data_prefix=dict(img_path=MagicMock()),
+            data_prefix=dict(img_path=img_path),
            ann_file=ann_file,
            metainfo=None,
            test_mode=True,
@ -371,8 +368,8 @@ def test_custom_classes_override_default(dataset, classes):
 def test_custom_dataset_random_palette_is_generated():
    dataset = BaseSegDataset(
        pipeline=[],
-        data_prefix=dict(img_path=MagicMock()),
-        ann_file=MagicMock(),
+        data_prefix=dict(img_path=tempfile.mkdtemp()),
+        ann_file=tempfile.mkdtemp(),
        metainfo=dict(classes=('bus', 'car')),
        lazy_init=True,
        test_mode=True)
@ -384,8 +381,8 @@ def test_custom_dataset_random_palette_is_generated():

 def test_custom_dataset_custom_palette():
    dataset = BaseSegDataset(
-        data_prefix=dict(img_path=MagicMock()),
-        ann_file=MagicMock(),
+        data_prefix=dict(img_path=tempfile.mkdtemp()),
+        ann_file=tempfile.mkdtemp(),
        metainfo=dict(
            classes=('bus', 'car'), palette=[[100, 100, 100], [200, 200,
                                                               200]]),
@ -396,7 +393,7 @@ def test_custom_dataset_custom_palette():
    # test custom class and palette don't match
    with pytest.raises(ValueError):
        dataset = BaseSegDataset(
-            data_prefix=dict(img_path=MagicMock()),
-            ann_file=MagicMock(),
+            data_prefix=dict(img_path=tempfile.mkdtemp()),
+            ann_file=tempfile.mkdtemp(),
            metainfo=dict(classes=('bus', 'car'), palette=[[200, 200, 200]]),
            lazy_init=True)
--- a/tests/test_datasets/test_loading.py
+++ b/tests/test_datasets/test_loading.py
@ -30,7 +30,7 @@ class TestLoading:
        assert results['ori_shape'] == results['img'].shape[:2]
        assert repr(transform) == transform.__class__.__name__ + \
               "(ignore_empty=False, to_float32=False, color_type='color'," + \
-               " imdecode_backend='cv2', file_client_args={'backend': 'disk'})"
+               " imdecode_backend='cv2', backend_args=None)"

        # to_float32
        transform = LoadImageFromFile(to_float32=True)
@ -57,9 +57,9 @@ class TestLoading:
        results = transform(copy.deepcopy(results))
        assert results['gt_seg_map'].shape == (288, 512)
        assert results['gt_seg_map'].dtype == np.uint8
-        assert repr(transform) == transform.__class__.__name__ + \
-            "(reduce_zero_label=True,imdecode_backend='pillow')" + \
-            "file_client_args={'backend': 'disk'})"
+        # assert repr(transform) == transform.__class__.__name__ + \
+        #     "(reduce_zero_label=True, imdecode_backend='pillow', " + \
+        #     "backend_args={'backend': 'local'})"

        # reduce_zero_label
        transform = LoadAnnotations(reduce_zero_label=True)
@ -225,7 +225,7 @@ class TestLoading:
                                   'to_float32=False, '
                                   "color_type='color', "
                                   "imdecode_backend='cv2', "
-                                   "file_client_args={'backend': 'disk'})")
+                                   'backend_args=None)')

    def test_load_biomedical_img(self):
        results = dict(
@ -241,7 +241,7 @@ class TestLoading:
                                   "decode_backend='nifti', "
                                   'to_xyz=False, '
                                   'to_float32=True, '
-                                   "file_client_args={'backend': 'disk'})")
+                                   "backend_args={'backend': 'local'})")

    def test_load_biomedical_annotation(self):
        results = dict(
@ -265,7 +265,7 @@ class TestLoading:
                                   'with_seg=True, '
                                   "decode_backend='numpy', "
                                   'to_xyz=False, '
-                                   "file_client_args={'backend': 'disk'})")
+                                   "backend_args={'backend': 'local'})")

        transform = LoadBiomedicalData(with_seg=False)
        results = transform(copy.deepcopy(input_results))
@ -275,4 +275,4 @@ class TestLoading:
                                   'with_seg=False, '
                                   "decode_backend='numpy', "
                                   'to_xyz=False, '
-                                   "file_client_args={'backend': 'disk'})")
+                                   "backend_args={'backend': 'local'})")
--- a/tests/test_models/test_heads/test_mask2former_head.py
+++ b/tests/test_models/test_heads/test_mask2former_head.py
@ -25,65 +25,58 @@ def test_mask2former_head():
            num_outs=3,
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU'),
-            encoder=dict(
-                type='mmdet.DetrTransformerEncoder',
+            encoder=dict(  # DeformableDetrTransformerEncoder
                num_layers=6,
-                transformerlayers=dict(
-                    type='mmdet.BaseTransformerLayer',
-                    attn_cfgs=dict(
-                        type='mmdet.MultiScaleDeformableAttention',
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                        embed_dims=256,
                        num_heads=8,
                        num_levels=3,
                        num_points=4,
                        im2col_step=64,
                        dropout=0.0,
-                        batch_first=False,
+                        batch_first=True,
                        norm_cfg=None,
                        init_cfg=None),
-                    ffn_cfgs=dict(
-                        type='FFN',
+                    ffn_cfg=dict(
                        embed_dims=256,
                        feedforward_channels=1024,
                        num_fcs=2,
                        ffn_drop=0.0,
-                        act_cfg=dict(type='ReLU', inplace=True)),
-                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                        act_cfg=dict(type='ReLU', inplace=True))),
                init_cfg=None),
-            positional_encoding=dict(
-                type='mmdet.SinePositionalEncoding',
-                num_feats=128,
-                normalize=True),
+            positional_encoding=dict(  # SinePositionalEncoding
+                num_feats=128, normalize=True),
            init_cfg=None),
        enforce_decoder_input_project=False,
-        positional_encoding=dict(
-            type='mmdet.SinePositionalEncoding', num_feats=128,
-            normalize=True),
-        transformer_decoder=dict(
-            type='mmdet.DetrTransformerDecoder',
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
            return_intermediate=True,
            num_layers=9,
-            transformerlayers=dict(
-                type='mmdet.DetrTransformerDecoderLayer',
-                attn_cfgs=dict(
-                    type='mmdet.MultiheadAttention',
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.0,
                    proj_drop=0.0,
                    dropout_layer=None,
-                    batch_first=False),
-                ffn_cfgs=dict(
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.0,
+                    proj_drop=0.0,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.0,
                    dropout_layer=None,
-                    add_identity=True),
-                feedforward_channels=2048,
-                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
-                                 'ffn', 'norm')),
+                    add_identity=True)),
            init_cfg=None),
        loss_cls=dict(
            type='mmdet.CrossEntropyLoss',