Merge pull request #2385 from open-mmlab/dev-1.x

Merge the MMSegmentation 1.x development branch dev-1.x into the main branch 1.x for v1.0.0rc2
pull/2604/head v1.0.0rc2
Miao Zheng 2022-12-06 17:11:02 +08:00 committed by GitHub
commit 8a611e122d
78 changed files with 3088 additions and 184 deletions


@ -61,8 +61,9 @@ jobs:
command: |
pip install git+https://github.com/open-mmlab/mmengine.git@main
pip install -U openmim
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
pip install -r requirements/tests.txt -r requirements/optional.txt
- run:
name: Build and install
@ -96,18 +97,20 @@ jobs:
command: |
git clone -b main --depth 1 https://github.com/open-mmlab/mmengine.git /home/circleci/mmengine
git clone -b dev-1.x --depth 1 https://github.com/open-mmlab/mmclassification.git /home/circleci/mmclassification
git clone -b dev-3.x --depth 1 https://github.com/open-mmlab/mmdetection.git /home/circleci/mmdetection
- run:
name: Build Docker image
command: |
docker build .circleci/docker -t mmseg:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >>
docker run --gpus all -t -d -v /home/circleci/project:/mmseg -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmclassification:/mmclassification -w /mmseg --name mmseg mmseg:gpu
docker run --gpus all -t -d -v /home/circleci/project:/mmseg -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmclassification:/mmclassification -v /home/circleci/mmdetection:/mmdetection -w /mmseg --name mmseg mmseg:gpu
- run:
name: Install mmseg dependencies
command: |
docker exec mmseg pip install -e /mmengine
docker exec mmseg pip install -U openmim
docker exec mmseg mim install 'mmcv>=2.0.0rc1'
docker exec mmseg mim install 'mmcv>=2.0.0rc3'
docker exec mmseg pip install -e /mmclassification
docker exec mmseg pip install -e /mmdetection
docker exec mmseg pip install -r requirements/tests.txt -r requirements/optional.txt
- run:
name: Build and install


@ -20,11 +20,7 @@ jobs:
python -m pip install pre-commit
pre-commit install
- name: Linting
run: |
sudo apt-add-repository ppa:brightbox/ruby-ng -y
sudo apt-get update
sudo apt-get install -y ruby2.7
pre-commit run --all-files
run: pre-commit run --all-files
- name: Check docstring coverage
run: |
python -m pip install interrogate


@ -44,8 +44,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install
@ -92,8 +93,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install
@ -155,8 +157,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install
@ -187,8 +190,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install


@ -40,8 +40,9 @@ jobs:
run: |
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install
@ -92,8 +93,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install
@ -124,8 +126,9 @@ jobs:
python -V
pip install -U openmim
pip install git+https://github.com/open-mmlab/mmengine.git
mim install 'mmcv>=2.0.0rc1'
mim install 'mmcv>=2.0.0rc3'
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install unittest dependencies
run: pip install -r requirements/tests.txt -r requirements/optional.txt
- name: Build and install

.gitignore

@ -105,6 +105,7 @@ venv.bak/
# mypy
.mypy_cache/
data
.vscode
.idea


@ -1,5 +1,5 @@
repos:
- repo: https://gitlab.com/pycqa/flake8.git
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8


@ -62,11 +62,10 @@ The 1.x branch works with **PyTorch 1.6+**.
## What's New
v1.0.0rc1 was released on 2/11/2022.
v1.0.0rc2 was released on 6/12/2022.
Please refer to [changelog.md](docs/en/notes/changelog.md) for details and release history.
- Support PoolFormer ([#2191](https://github.com/open-mmlab/mmsegmentation/pull/2191))
- Add Decathlon dataset ([#2227](https://github.com/open-mmlab/mmsegmentation/pull/2227))
- Support MaskFormer and Mask2Former ([#2215](https://github.com/open-mmlab/mmsegmentation/pull/2215), [2255](https://github.com/open-mmlab/mmsegmentation/pull/2255))
## Installation
@ -139,6 +138,8 @@ Supported methods:
- [x] [Segmenter (ICCV'2021)](configs/segmenter)
- [x] [SegFormer (NeurIPS'2021)](configs/segformer)
- [x] [K-Net (NeurIPS'2021)](configs/knet)
- [x] [MaskFormer (NeurIPS'2021)](configs/maskformer)
- [x] [Mask2Former (CVPR'2022)](configs/mask2former)
Supported datasets:
@ -194,6 +195,7 @@ This project is released under the [Apache 2.0 license](LICENSE).
- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models
- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
- [MMEval](https://github.com/open-mmlab/mmeval): A unified evaluation library for multiple machine learning libraries.
- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.


@ -61,7 +61,7 @@ MMSegmentation is an open source semantic segmentation toolbox based on PyTorch. It is a part of the O
## Changelog
The latest version, v1.0.0rc1, was released on 2022.11.2.
The latest version, v1.0.0rc2, was released on 2022.12.6.
Please refer to the [changelog](docs/en/notes/changelog.md) for more details on version updates and release history.
## Installation
@ -134,6 +134,8 @@ MMSegmentation is an open source semantic segmentation toolbox based on PyTorch. It is a part of the O
- [x] [Segmenter (ICCV'2021)](configs/segmenter)
- [x] [SegFormer (NeurIPS'2021)](configs/segformer)
- [x] [K-Net (NeurIPS'2021)](configs/knet)
- [x] [MaskFormer (NeurIPS'2021)](configs/maskformer)
- [x] [Mask2Former (CVPR'2022)](configs/mask2former)
Supported datasets:
@ -186,6 +188,7 @@ MMSegmentation is an open source project jointly contributed to by researchers and engineers from various colleges and companies
- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models
- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision
- [MIM](https://github.com/open-mmlab/mim): MIM is the unified entry point for OpenMMLab projects, algorithms, and models
- [MMEval](https://github.com/open-mmlab/mmeval): A unified and open evaluation library across machine learning frameworks
- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab object detection toolbox
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection


@ -0,0 +1,72 @@
# Mask2Former
[Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)
## Introduction
<!-- [ALGORITHM] -->
<a href="https://github.com/facebookresearch/Mask2Former">Official Repo</a>
<a href="https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py">Code Snippet</a>
## Abstract
<!-- [ABSTRACT] -->
Image segmentation is about grouping pixels with different semantics, e.g., category or instance membership, where each choice of semantics defines a task. While only the semantics of each task differ, current research focuses on designing specialized architectures for each task. We present Masked-attention Mask Transformer (Mask2Former), a new architecture capable of addressing any image segmentation task (panoptic, instance or semantic). Its key components include masked attention, which extracts localized features by constraining cross-attention within predicted mask regions. In addition to reducing the research effort by at least three times, it outperforms the best specialized architectures by a significant margin on four popular datasets. Most notably, Mask2Former sets a new state-of-the-art for panoptic segmentation (57.8 PQ on COCO), instance segmentation (50.1 AP on COCO) and semantic segmentation (57.7 mIoU on ADE20K).
```bibtex
@inproceedings{cheng2021mask2former,
title={Masked-attention Mask Transformer for Universal Image Segmentation},
author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar},
journal={CVPR},
year={2022}
}
@inproceedings{cheng2021maskformer,
title={Per-Pixel Classification is Not All You Need for Semantic Segmentation},
author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov},
journal={NeurIPS},
year={2021}
}
```
### Usage
- The Mask2Former model requires [MMDetection](https://github.com/open-mmlab/mmdetection) to be installed first.
```shell
pip install "mmdet>=3.0.0rc4"
```
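For readers who want to double-check the setup, below is a minimal, hypothetical inference sketch rather than an official snippet from this PR. It assumes the `mmseg.apis.init_model`/`inference_model` helpers of the 1.x API, the `demo/demo.png` image shipped with the repository, and the R-50 Cityscapes config and checkpoint listed in the table below.

```python
# Minimal sketch: run a released Mask2Former checkpoint on the demo image.
from mmseg.apis import inference_model, init_model

config_file = 'configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py'
checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth'

# Building the model imports `mmdet.models` through the config's `custom_imports`,
# so mmdet components such as `mmdet.MSDeformAttnPixelDecoder` resolve from the registry.
model = init_model(config_file, checkpoint_file, device='cuda:0')
result = inference_model(model, 'demo/demo.png')
print(result.pred_sem_seg)  # predicted per-pixel class indices
```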
## Results and models
### Cityscapes
| Method | Backbone | Crop Size | Lr schd | Mem (MB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
| ----------- | -------------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -----------------------------------------------------------------------------------------------------------------------------------------------------------: | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Mask2Former | R-50-D32 | 512x1024 | 90000 | 5806 | 9.17 | 80.44 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802.json) |
| Mask2Former | R-101-D32 | 512x1024 | 90000 | 6971 | 7.11 | 80.80 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-8ad528ea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628.json)) |
| Mask2Former | Swin-T | 512x1024 | 90000 | 6511 | 7.18 | 81.71 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-290b34af.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501.json)) |
| Mask2Former | Swin-S | 512x1024 | 90000 | 8282 | 5.57 | 82.57 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-7c98854a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802.json)) |
| Mask2Former | Swin-B (in22k) | 512x1024 | 90000 | 11152 | 4.32 | 83.52 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-59a4379a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030.json)) |
| Mask2Former | Swin-L (in22k) | 512x1024 | 90000 | 16207 | 2.86 | 83.65 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-dc2c2ddd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901.json)) |
### ADE20K
| Method | Backbone | Crop Size | Lr schd | Mem (MB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
| ----------- | -------------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Mask2Former | R-50-D32 | 512x512 | 160000 | 3385 | 26.59 | 47.87 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-4c62652d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055.json)) |
| Mask2Former | R-101-D32 | 512x512 | 160000 | 4190 | 22.97 | 48.60 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b1169bc0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905.json)) |
| Mask2Former | Swin-T | 512x512 | 160000 | 3826 | 23.82 | 48.66 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-4341520b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230.json)) |
| Mask2Former | Swin-S | 512x512 | 160000 | 5034 | 19.69 | 51.24 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-ab263c11.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905.json)) |
| Mask2Former | Swin-B | 640x640 | 160000 | 5795 | 12.48 | 52.44 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-35e3a2c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118.json)) |
| Mask2Former | Swin-B (in22k) | 640x640 | 160000 | 5795 | 12.43 | 53.90 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-622e093b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230.json)) |
| Mask2Former | Swin-L (in22k) | 640x640 | 160000 | 9077 | 8.81 | 56.01 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-5cc76a78.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933.json)) |
Note:
- All Mask2Former experiments were trained on 8 A100 GPUs with 2 samples per GPU.
- As mentioned in [the official repo](https://github.com/facebookresearch/Mask2Former/issues/5), the results of Mask2Former are relatively unstable; the Mask2Former (Swin-S) result on the ADE20K dataset in the table is the median of 5 training runs, following the authors' suggestion.
- The ResNet backbones used in these Mask2Former models are the standard `ResNet` rather than `ResNetV1c`; see the sketch after this list.
- Test-time augmentation is not supported in MMSegmentation 1.x yet; we will add the "ms+flip" results as soon as possible.
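The sketch below illustrates the backbone note above. The first dict mirrors the backbone section of the configs added in this PR; the `ResNetV1c` dict is an illustrative counterpart with assumed values, not code taken from this PR.

```python
# Backbone used by the Mask2Former configs in this PR: a plain torchvision-style
# ResNet stem (a single 7x7 conv), matching the `torchvision://resnet50` weights.
standard_resnet = dict(
    type='ResNet',
    depth=50,
    deep_stem=False,
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'))

# Most other MMSegmentation baselines use ResNetV1c instead, whose stem replaces
# the 7x7 conv with three stacked 3x3 convs (illustrative values only).
resnet_v1c = dict(
    type='ResNetV1c',
    depth=50,
    init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnet50_v1c'))
```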


@ -0,0 +1,290 @@
Collections:
- Name: Mask2Former
Metadata:
Training Data:
- Cityscapes
- ADE20K
Paper:
URL: https://arxiv.org/abs/2112.01527
Title: Masked-attention Mask Transformer for Universal Image Segmentation
README: configs/mask2former/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py
Version: 3.x
Converted From:
Code: https://github.com/facebookresearch/Mask2Former
Models:
- Name: mask2former_r50_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: R-50-D32
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 109.05
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 5806.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 80.44
Config: configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth
- Name: mask2former_r101_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: R-101-D32
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 140.65
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 6971.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 80.8
Config: configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-8ad528ea.pth
- Name: mask2former_swin-t_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: Swin-T
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 139.28
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 6511.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 81.71
Config: configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-290b34af.pth
- Name: mask2former_swin-s_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: Swin-S
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 179.53
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 8282.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 82.57
Config: configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-7c98854a.pth
- Name: mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: Swin-B (in22k)
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 231.48
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 11152.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 83.52
Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-59a4379a.pth
- Name: mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024
In Collection: Mask2Former
Metadata:
backbone: Swin-L (in22k)
crop size: (512,1024)
lr schd: 90000
inference time (ms/im):
- value: 349.65
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,1024)
Training Memory (GB): 16207.0
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 83.65
Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-dc2c2ddd.pth
- Name: mask2former_r50_8xb2-160k_ade20k-512x512
In Collection: Mask2Former
Metadata:
backbone: R-50-D32
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 37.61
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 3385.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 47.87
Config: configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-4c62652d.pth
- Name: mask2former_r101_8xb2-160k_ade20k-512x512
In Collection: Mask2Former
Metadata:
backbone: R-101-D32
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 43.54
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 4190.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 48.6
Config: configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b1169bc0.pth
- Name: mask2former_swin-t_8xb2-160k_ade20k-512x512
In Collection: Mask2Former
Metadata:
backbone: Swin-T
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 41.98
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 3826.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 48.66
Config: configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-4341520b.pth
- Name: mask2former_swin-s_8xb2-160k_ade20k-512x512
In Collection: Mask2Former
Metadata:
backbone: Swin-S
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 50.79
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 5034.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 51.24
Config: configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-ab263c11.pth
- Name: mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640
In Collection: Mask2Former
Metadata:
backbone: Swin-B
crop size: (640,640)
lr schd: 160000
inference time (ms/im):
- value: 80.13
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (640,640)
Training Memory (GB): 5795.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 52.44
Config: configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-35e3a2c7.pth
- Name: mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640
In Collection: Mask2Former
Metadata:
backbone: Swin-B (in22k)
crop size: (640,640)
lr schd: 160000
inference time (ms/im):
- value: 80.45
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (640,640)
Training Memory (GB): 5795.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 53.9
Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-622e093b.pth
- Name: mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640
In Collection: Mask2Former
Metadata:
backbone: Swin-L (in22k)
crop size: (640,640)
lr schd: 160000
inference time (ms/im):
- value: 113.51
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (640,640)
Training Memory (GB): 9077.0
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 56.01
Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-5cc76a78.pth
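Since the metafile above is plain YAML, it can also be consumed programmatically. The snippet below is a hypothetical helper, not part of this PR; it assumes PyYAML is installed and that the file lives at `configs/mask2former/mask2former.yml`.

```python
# List every Mask2Former entry in the metafile with its dataset, mIoU and weights URL.
import yaml

with open('configs/mask2former/mask2former.yml') as f:
    metafile = yaml.safe_load(f)

for model in metafile['Models']:
    result = model['Results'][0]
    print(f"{model['Name']}: {result['Dataset']} mIoU={result['Metrics']['mIoU']}")
    print(f"  weights: {model['Weights']}")
```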


@ -0,0 +1,7 @@
_base_ = ['./mask2former_r50_8xb2-160k_ade20k-512x512.py']
model = dict(
backbone=dict(
depth=101,
init_cfg=dict(type='Pretrained',
checkpoint='torchvision://resnet101')))


@ -0,0 +1,7 @@
_base_ = ['./mask2former_r50_8xb2-90k_cityscapes-512x1024.py']
model = dict(
backbone=dict(
depth=101,
init_cfg=dict(type='Pretrained',
checkpoint='torchvision://resnet101')))


@ -0,0 +1,207 @@
_base_ = ['../_base_/default_runtime.py', '../_base_/datasets/ade20k.py']
custom_imports = dict(imports='mmdet.models', allow_failed_imports=False)
crop_size = (512, 512)
data_preprocessor = dict(
type='SegDataPreProcessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_val=0,
seg_pad_val=255,
size=crop_size,
test_cfg=dict(size_divisor=32))
num_classes = 150
model = dict(
type='EncoderDecoder',
data_preprocessor=data_preprocessor,
backbone=dict(
type='ResNet',
depth=50,
deep_stem=False,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
norm_cfg=dict(type='SyncBN', requires_grad=False),
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
decode_head=dict(
type='Mask2FormerHead',
in_channels=[256, 512, 1024, 2048],
strides=[4, 8, 16, 32],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
num_transformer_feat_level=3,
align_corners=False,
pixel_decoder=dict(
type='mmdet.MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='mmdet.DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='mmdet.BaseTransformerLayer',
attn_cfgs=dict(
type='mmdet.MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='mmdet.SinePositionalEncoding',
num_feats=128,
normalize=True),
init_cfg=None),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='mmdet.SinePositionalEncoding', num_feats=128,
normalize=True),
transformer_decoder=dict(
type='mmdet.DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='mmdet.DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='mmdet.MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='mean',
loss_weight=5.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=5.0),
train_cfg=dict(
num_points=12544,
oversample_ratio=3.0,
importance_sample_ratio=0.75,
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=2.0),
dict(
type='mmdet.CrossEntropyLossCost',
weight=5.0,
use_sigmoid=True),
dict(
type='mmdet.DiceCost',
weight=5.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler'))),
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset config
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', reduce_zero_label=True),
dict(
type='RandomChoiceResize',
scales=[int(512 * x * 0.1) for x in range(5, 21)],
resize_type='ResizeShortestEdge',
max_size=2048),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='PackSegInputs')
]
train_dataloader = dict(batch_size=2, dataset=dict(pipeline=train_pipeline))
# optimizer
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
optimizer = dict(
type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999))
optim_wrapper = dict(
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(
custom_keys={
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi,
},
norm_decay_mult=0.0))
# learning policy
param_scheduler = [
dict(
type='PolyLR',
eta_min=0,
power=0.9,
begin=0,
end=160000,
by_epoch=False)
]
# training schedule for 160k
train_cfg = dict(
type='IterBasedTrainLoop', max_iters=160000, val_interval=5000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
timer=dict(type='IterTimerHook'),
logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
param_scheduler=dict(type='ParamSchedulerHook'),
checkpoint=dict(
type='CheckpointHook', by_epoch=False, interval=5000,
save_best='mIoU'),
sampler_seed=dict(type='DistSamplerSeedHook'),
visualization=dict(type='SegVisualizationHook'))
# Default setting for scaling LR automatically
# - `enable` controls whether the LR is scaled automatically
#   (disabled by default).
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)
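The `auto_scale_lr` setting at the end of this config follows the linear scaling rule used by the OpenMMLab 1.x runners: when enabled, the learning rate is multiplied by the ratio of the actual batch size to `base_batch_size`. The sketch below only illustrates that arithmetic under this assumption; it is not part of the config.

```python
# Linear LR scaling relative to base_batch_size = (8 GPUs) x (2 samples per GPU) = 16.
def scale_lr(base_lr: float, num_gpus: int, samples_per_gpu: int,
             base_batch_size: int = 16) -> float:
    """Scale the learning rate linearly with the effective batch size."""
    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size

print(scale_lr(0.0001, num_gpus=8, samples_per_gpu=2))  # 0.0001 (the base setup)
print(scale_lr(0.0001, num_gpus=4, samples_per_gpu=2))  # 5e-05  (half the GPUs, half the LR)
```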


@ -0,0 +1,206 @@
_base_ = ['../_base_/default_runtime.py', '../_base_/datasets/cityscapes.py']
custom_imports = dict(imports='mmdet.models', allow_failed_imports=False)
crop_size = (512, 1024)
data_preprocessor = dict(
type='SegDataPreProcessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_val=0,
seg_pad_val=255,
size=crop_size,
test_cfg=dict(size_divisor=32))
num_classes = 19
model = dict(
type='EncoderDecoder',
data_preprocessor=data_preprocessor,
backbone=dict(
type='ResNet',
depth=50,
deep_stem=False,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
norm_cfg=dict(type='SyncBN', requires_grad=False),
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
decode_head=dict(
type='Mask2FormerHead',
in_channels=[256, 512, 1024, 2048],
strides=[4, 8, 16, 32],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
num_transformer_feat_level=3,
align_corners=False,
pixel_decoder=dict(
type='mmdet.MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='mmdet.DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='mmdet.BaseTransformerLayer',
attn_cfgs=dict(
type='mmdet.MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='mmdet.SinePositionalEncoding',
num_feats=128,
normalize=True),
init_cfg=None),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='mmdet.SinePositionalEncoding', num_feats=128,
normalize=True),
transformer_decoder=dict(
type='mmdet.DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='mmdet.DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='mmdet.MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='mean',
loss_weight=5.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=5.0),
train_cfg=dict(
num_points=12544,
oversample_ratio=3.0,
importance_sample_ratio=0.75,
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=2.0),
dict(
type='mmdet.CrossEntropyLossCost',
weight=5.0,
use_sigmoid=True),
dict(
type='mmdet.DiceCost',
weight=5.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler'))),
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset config
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(
type='RandomChoiceResize',
scales=[int(1024 * x * 0.1) for x in range(5, 21)],
resize_type='ResizeShortestEdge',
max_size=4096),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='PackSegInputs')
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
# optimizer
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
optimizer = dict(
type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999))
optim_wrapper = dict(
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(
custom_keys={
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi,
},
norm_decay_mult=0.0))
# learning policy
param_scheduler = [
dict(
type='PolyLR',
eta_min=0,
power=0.9,
begin=0,
end=90000,
by_epoch=False)
]
# training schedule for 90k
train_cfg = dict(type='IterBasedTrainLoop', max_iters=90000, val_interval=5000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
timer=dict(type='IterTimerHook'),
logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
param_scheduler=dict(type='ParamSchedulerHook'),
checkpoint=dict(
type='CheckpointHook', by_epoch=False, interval=5000,
save_best='mIoU'),
sampler_seed=dict(type='DistSamplerSeedHook'),
visualization=dict(type='SegVisualizationHook'))
# Default setting for scaling LR automatically
# - `enable` controls whether the LR is scaled automatically
#   (disabled by default).
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)


@ -0,0 +1,237 @@
_base_ = [
'../_base_/default_runtime.py', '../_base_/datasets/ade20k_640x640.py'
]
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_20220317-55b0104a.pth' # noqa
custom_imports = dict(imports='mmdet.models', allow_failed_imports=False)
crop_size = (640, 640)
data_preprocessor = dict(
type='SegDataPreProcessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_val=0,
seg_pad_val=255,
size=crop_size)
num_classes = 150
depths = [2, 2, 18, 2]
model = dict(
type='EncoderDecoder',
data_preprocessor=data_preprocessor,
backbone=dict(
type='SwinTransformer',
pretrain_img_size=384,
embed_dims=128,
depths=depths,
num_heads=[4, 8, 16, 32],
window_size=12,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.3,
patch_norm=True,
out_indices=(0, 1, 2, 3),
with_cp=False,
frozen_stages=-1,
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(
type='Mask2FormerHead',
in_channels=[128, 256, 512, 1024],
strides=[4, 8, 16, 32],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
num_transformer_feat_level=3,
align_corners=False,
pixel_decoder=dict(
type='mmdet.MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='mmdet.DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='mmdet.BaseTransformerLayer',
attn_cfgs=dict(
type='mmdet.MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='mmdet.SinePositionalEncoding',
num_feats=128,
normalize=True),
init_cfg=None),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='mmdet.SinePositionalEncoding', num_feats=128,
normalize=True),
transformer_decoder=dict(
type='mmdet.DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='mmdet.DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='mmdet.MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='mean',
loss_weight=5.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=5.0),
train_cfg=dict(
num_points=12544,
oversample_ratio=3.0,
importance_sample_ratio=0.75,
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=2.0),
dict(
type='mmdet.CrossEntropyLossCost',
weight=5.0,
use_sigmoid=True),
dict(
type='mmdet.DiceCost',
weight=5.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler'))),
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset config
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', reduce_zero_label=True),
dict(
type='RandomChoiceResize',
scales=[int(x * 0.1 * 640) for x in range(5, 21)],
resize_type='ResizeShortestEdge',
max_size=2560),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='PackSegInputs')
]
train_dataloader = dict(batch_size=2, dataset=dict(pipeline=train_pipeline))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optimizer = dict(
type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999))
optim_wrapper = dict(
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))
# learning policy
param_scheduler = [
dict(
type='PolyLR',
eta_min=0,
power=0.9,
begin=0,
end=160000,
by_epoch=False)
]
# training schedule for 160k
train_cfg = dict(
type='IterBasedTrainLoop', max_iters=160000, val_interval=5000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
default_hooks = dict(
timer=dict(type='IterTimerHook'),
logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
param_scheduler=dict(type='ParamSchedulerHook'),
checkpoint=dict(
type='CheckpointHook', by_epoch=False, interval=5000,
save_best='mIoU'),
sampler_seed=dict(type='DistSamplerSeedHook'),
visualization=dict(type='SegVisualizationHook'))
# Default setting for scaling LR automatically
# - `enable` controls whether the LR is scaled automatically
#   (disabled by default).
# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)


@ -0,0 +1,5 @@
_base_ = ['./mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_22k_20220317-e5c09f74.pth' # noqa
model = dict(
backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained)))


@ -0,0 +1,42 @@
_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_22k_20220317-e5c09f74.pth' # noqa
depths = [2, 2, 18, 2]
model = dict(
backbone=dict(
pretrain_img_size=384,
embed_dims=128,
depths=depths,
num_heads=[4, 8, 16, 32],
window_size=12,
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(in_channels=[128, 256, 512, 1024]))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))


@ -0,0 +1,9 @@
_base_ = ['./mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window12_384_22k_20220412-6580f57d.pth' # noqa
model = dict(
backbone=dict(
embed_dims=192,
num_heads=[6, 12, 24, 48],
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(num_queries=100, in_channels=[192, 384, 768, 1536]))


@ -0,0 +1,42 @@
_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window12_384_22k_20220412-6580f57d.pth' # noqa
depths = [2, 2, 18, 2]
model = dict(
backbone=dict(
pretrain_img_size=384,
embed_dims=192,
depths=depths,
num_heads=[6, 12, 24, 48],
window_size=12,
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(in_channels=[192, 384, 768, 1536]))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))


@ -0,0 +1,37 @@
_base_ = ['./mask2former_swin-t_8xb2-160k_ade20k-512x512.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa
depths = [2, 2, 18, 2]
model = dict(
backbone=dict(
depths=depths, init_cfg=dict(type='Pretrained',
checkpoint=pretrained)))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))


@ -0,0 +1,37 @@
_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa
depths = [2, 2, 18, 2]
model = dict(
backbone=dict(
depths=depths, init_cfg=dict(type='Pretrained',
checkpoint=pretrained)))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))


@ -0,0 +1,52 @@
_base_ = ['./mask2former_r50_8xb2-160k_ade20k-512x512.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa
depths = [2, 2, 6, 2]
model = dict(
backbone=dict(
_delete_=True,
type='SwinTransformer',
embed_dims=96,
depths=depths,
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.3,
patch_norm=True,
out_indices=(0, 1, 2, 3),
with_cp=False,
frozen_stages=-1,
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(in_channels=[96, 192, 384, 768]))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))


@ -0,0 +1,52 @@
_base_ = ['./mask2former_r50_8xb2-90k_cityscapes-512x1024.py']
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa
depths = [2, 2, 6, 2]
model = dict(
backbone=dict(
_delete_=True,
type='SwinTransformer',
embed_dims=96,
depths=depths,
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.3,
patch_norm=True,
out_indices=(0, 1, 2, 3),
with_cp=False,
frozen_stages=-1,
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
decode_head=dict(in_channels=[96, 192, 384, 768]))
# set all layers in backbone to lr_mult=0.1
# set all norm layers, position_embedding,
# query_embedding, level_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0)
embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=0.1, decay_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'absolute_pos_embed': backbone_embed_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
'query_feat': embed_multi,
'level_embed': embed_multi
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))

View File

@ -0,0 +1,60 @@
# MaskFormer
[MaskFormer: Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278)
## Introduction
<!-- [ALGORITHM] -->
<a href="https://github.com/facebookresearch/MaskFormer/">Official Repo</a>
<a href="https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/dense_heads/maskformer_head.py#L21">Code Snippet</a>
## Abstract
<!-- [ABSTRACT] -->
Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models.
<!-- [IMAGE] -->
<div align=center>
<img src="https://user-images.githubusercontent.com/24582831/199215459-ea507126-aafe-4823-8eb1-ae6487509d5c.png" width="90%"/>
</div>
```bibtex
@article{cheng2021per,
title={Per-pixel classification is not all you need for semantic segmentation},
author={Cheng, Bowen and Schwing, Alex and Kirillov, Alexander},
journal={Advances in Neural Information Processing Systems},
volume={34},
pages={17864--17875},
year={2021}
}
```
### Usage
- The MaskFormer model requires [MMDetection](https://github.com/open-mmlab/mmdetection) to be installed first.
```shell
pip install "mmdet>=3.0.0rc4"
```
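A quick optional sanity check (a suggested snippet, not something the configs require) is to confirm that a compatible MMDetection is importable:
```python
# Verify that MMDetection is installed and new enough for the MaskFormer head.
import mmdet
print(mmdet.__version__)  # expect a version >= 3.0.0rc4
```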
## Results and models
### ADE20K
| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
| ---------- | --------- | --------- | ------- | -------- | -------------- | ----- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| MaskFormer | R-50-D32 | 512x512 | 160000 | 3.29 | 42.20 | 44.29 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-cbd39cc1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724.json) |
| MaskFormer | R-101-D32 | 512x512 | 160000 | 4.12 | 34.90 | 45.11 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-c8e0931d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053.json) |
| MaskFormer | Swin-T | 512x512 | 160000 | 3.73 | 40.53 | 46.69 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-03550716.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813.json) |
| MaskFormer | Swin-S | 512x512 | 160000 | 5.33 | 26.98 | 49.36 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-5ab67e58.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710.json) |
Note:
- All experiments of MaskFormer are implemented with 8 V100 (32G) GPUs with 2 samples per GPU.
- The results of MaskFormer are not very stable: the accuracy (mIoU) of the model with `R-101-D32` ranges from 44.7 to 46.0, and with `Swin-S` from 49.0 to 49.8.
- The ResNet backbones utilized in MaskFormer models are standard `ResNet` rather than `ResNetV1c`.
- Test-time augmentation is not supported in MMSegmentation 1.x yet; we will add "ms+flip" results as soon as possible.

View File

@ -0,0 +1,101 @@
Collections:
- Name: MaskFormer
Metadata:
Training Data:
- Usage
- ADE20K
Paper:
URL: https://arxiv.org/abs/2107.06278
Title: 'MaskFormer: Per-Pixel Classification is Not All You Need for Semantic
Segmentation'
README: configs/maskformer/README.md
Code:
URL: https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/dense_heads/maskformer_head.py#L21
Version: dev-3.x
Converted From:
Code: https://github.com/facebookresearch/MaskFormer/
Models:
- Name: maskformer_r50-d32_8xb2-160k_ade20k-512x512
In Collection: MaskFormer
Metadata:
backbone: R-50-D32
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 23.7
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 3.29
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 44.29
Config: configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-cbd39cc1.pth
- Name: maskformer_r101-d32_8xb2-160k_ade20k-512x512
In Collection: MaskFormer
Metadata:
backbone: R-101-D32
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 28.65
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 4.12
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 45.11
Config: configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-c8e0931d.pth
- Name: maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512
In Collection: MaskFormer
Metadata:
backbone: Swin-T
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 24.67
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 3.73
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 46.69
Config: configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-03550716.pth
- Name: maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512
In Collection: MaskFormer
Metadata:
backbone: Swin-S
crop size: (512,512)
lr schd: 160000
inference time (ms/im):
- value: 37.06
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (512,512)
Training Memory (GB): 5.33
Results:
- Task: Semantic Segmentation
Dataset: ADE20K
Metrics:
mIoU: 49.36
Config: configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-5ab67e58.pth

View File

@ -0,0 +1,7 @@
_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'
model = dict(
backbone=dict(
depth=101,
init_cfg=dict(type='Pretrained',
checkpoint='torchvision://resnet101')))

View File

@ -0,0 +1,143 @@
_base_ = [
'../_base_/datasets/ade20k.py', '../_base_/default_runtime.py',
'../_base_/schedules/schedule_160k.py'
]
norm_cfg = dict(type='SyncBN', requires_grad=True)
crop_size = (512, 512)
data_preprocessor = dict(
type='SegDataPreProcessor',
size=crop_size,
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_val=0,
seg_pad_val=255)
# model_cfg
num_classes = 150
model = dict(
type='EncoderDecoder',
data_preprocessor=data_preprocessor,
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
dilations=(1, 1, 1, 1),
strides=(1, 2, 2, 2),
norm_cfg=norm_cfg,
norm_eval=True,
style='pytorch',
contract_dilation=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
decode_head=dict(
type='MaskFormerHead',
in_channels=[256, 512, 1024,
2048], # input channels of pixel_decoder modules
feat_channels=256,
in_index=[0, 1, 2, 3],
num_classes=150,
out_channels=256,
num_queries=100,
pixel_decoder=dict(
type='mmdet.PixelDecoder',
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU')),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='mmdet.SinePositionalEncoding', num_feats=128,
normalize=True),
transformer_decoder=dict(
type='mmdet.DetrTransformerDecoder',
return_intermediate=True,
num_layers=6,
transformerlayers=dict(
type='mmdet.DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='mmdet.MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.1,
proj_drop=0.1,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.1,
dropout_layer=None,
add_identity=True),
# the following parameter was not used,
# just make current api happy
feedforward_channels=2048,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
reduction='mean',
loss_weight=20.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=1.0),
train_cfg=dict(
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=1.0),
dict(
type='mmdet.FocalLossCost',
weight=20.0,
binary_input=True),
dict(
type='mmdet.DiceCost',
weight=1.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler'))),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'),
)
# optimizer
optimizer = dict(
type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.0001)
optim_wrapper = dict(
_delete_=True,
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(custom_keys={
'backbone': dict(lr_mult=0.1),
}))
# learning policy
param_scheduler = [
dict(
type='PolyLR',
eta_min=0,
power=0.9,
begin=0,
end=160000,
by_epoch=False)
]
# In MaskFormer implementation we use batch size 2 per GPU as default
train_dataloader = dict(batch_size=2, num_workers=2)
val_dataloader = dict(batch_size=1, num_workers=4)
test_dataloader = val_dataloader

View File

@ -0,0 +1,79 @@
checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa
_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'
backbone_norm_cfg = dict(type='LN', requires_grad=True)
depths = [2, 2, 18, 2]
model = dict(
backbone=dict(
_delete_=True,
type='SwinTransformer',
pretrain_img_size=224,
embed_dims=96,
patch_size=4,
window_size=7,
mlp_ratio=4,
depths=depths,
num_heads=[3, 6, 12, 24],
strides=(4, 2, 2, 2),
out_indices=(0, 1, 2, 3),
qkv_bias=True,
qk_scale=None,
patch_norm=True,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.3,
use_abs_pos_embed=False,
act_cfg=dict(type='GELU'),
norm_cfg=backbone_norm_cfg,
init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)),
decode_head=dict(
type='MaskFormerHead',
in_channels=[96, 192, 384,
768], # input channels of pixel_decoder modules
))
# optimizer
optimizer = dict(
type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01)
# set all layers in backbone to lr_mult=1.0
# set all norm layers, position_embedding,
# query_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=1.0, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
embed_multi = dict(decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
_delete_=True,
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(custom_keys=custom_keys))
# learning policy
param_scheduler = [
dict(
type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
dict(
type='PolyLR',
eta_min=0.0,
power=1.0,
begin=1500,
end=160000,
by_epoch=False,
)
]

View File

@ -0,0 +1,81 @@
_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'
checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa
backbone_norm_cfg = dict(type='LN', requires_grad=True)
depths = [2, 2, 6, 2]
model = dict(
backbone=dict(
_delete_=True,
type='SwinTransformer',
pretrain_img_size=224,
embed_dims=96,
patch_size=4,
window_size=7,
mlp_ratio=4,
depths=depths,
num_heads=[3, 6, 12, 24],
strides=(4, 2, 2, 2),
out_indices=(0, 1, 2, 3),
qkv_bias=True,
qk_scale=None,
patch_norm=True,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.3,
use_abs_pos_embed=False,
act_cfg=dict(type='GELU'),
norm_cfg=backbone_norm_cfg,
init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)),
decode_head=dict(
type='MaskFormerHead',
in_channels=[96, 192, 384,
768], # input channels of pixel_decoder modules
))
# optimizer
optimizer = dict(
type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01)
# set all layers in backbone to lr_mult=1.0
# set all norm layers, position_embedding,
# query_embedding to decay_mult=0.0
backbone_norm_multi = dict(lr_mult=1.0, decay_mult=0.0)
backbone_embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
embed_multi = dict(decay_mult=0.0)
custom_keys = {
'backbone': dict(lr_mult=1.0),
'backbone.patch_embed.norm': backbone_norm_multi,
'backbone.norm': backbone_norm_multi,
'relative_position_bias_table': backbone_embed_multi,
'query_embed': embed_multi,
}
custom_keys.update({
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
for stage_id, num_blocks in enumerate(depths)
for block_id in range(num_blocks)
})
custom_keys.update({
f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
for stage_id in range(len(depths) - 1)
})
# optimizer
optim_wrapper = dict(
_delete_=True,
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(custom_keys=custom_keys))
# learning policy
param_scheduler = [
dict(
type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
dict(
type='PolyLR',
eta_min=0.0,
power=1.0,
begin=1500,
end=160000,
by_epoch=False,
)
]

View File

@ -33,7 +33,7 @@
"## Install MMSegmentation\n",
"This step may take several minutes. \n",
"\n",
"We use PyTorch 1.10 and CUDA 11.1 for this tutorial. You may install other versions by change the version number in pip install command. "
"We use PyTorch 1.12 and CUDA 11.3 for this tutorial. You may install other versions by change the version number in pip install command. "
]
},
{
@ -67,13 +67,13 @@
"outputs": [],
"source": [
"# Install PyTorch\n",
"!conda install pytorch=1.10.0 torchvision cudatoolkit=11.1 -c pytorch\n",
"!conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch\n",
"# Install mim\n",
"!pip install -U openmim\n",
"# Install mmengine\n",
"!mim install mmengine\n",
"# Install MMCV\n",
"!mim install 'mmcv >= 2.0.0rc1'"
"!mim install 'mmcv >= 2.0.0rc1'\n"
]
},
{
@ -500,16 +500,17 @@
},
"outputs": [],
"source": [
"from mmseg.apis import inference_model, show_result_pyplot\n",
"from mmseg.apis import init_model, inference_model, show_result_pyplot\n",
"\n",
"model=runner.model\n",
"model.cfg=cfg\n",
"# Init the model from the config and the checkpoint\n",
"checkpoint_path = './work_dirs/tutorial/iter_200.pth'\n",
"model = init_model(cfg, checkpoint_path, 'cuda:0')\n",
"\n",
"img = mmcv.imread('iccv09Data/images/6000124.jpg')\n",
"result = inference_model(model, img)\n",
"plt.figure(figsize=(8, 6))\n",
"vis_result = show_result_pyplot(model, img, result, palette)\n",
"plt.imshow(mmcv.bgr2rgb(vis_result))"
"vis_result = show_result_pyplot(model, img, result)\n",
"plt.imshow(mmcv.bgr2rgb(vis_result))\n"
]
}
],
@ -522,7 +523,7 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3.7.13 ('pt1.12')",
"display_name": "Python 3.8.5 ('tensorflow')",
"language": "python",
"name": "python3"
},
@ -536,7 +537,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
"version": "3.8.5"
},
"pycharm": {
"stem_cell": {
@ -549,7 +550,7 @@
},
"vscode": {
"interpreter": {
"hash": "ffdb7915c29738c259ec7ee5d0d1b9253c264f1fd267d45dd77f1a420396c120"
"hash": "20d4b83e0c8b3730b580c42434163d64f4b735d580303a8fade7c849d4d29eba"
}
}
},

View File

@ -1,7 +1,7 @@
ARG PYTORCH="1.11.0"
ARG CUDA="11.3"
ARG CUDNN="8"
ARG MMCV="2.0.0rc1"
ARG MMCV="2.0.0rc3"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

View File

@ -3,8 +3,8 @@ ARG CUDA="11.3"
ARG CUDNN="8"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
ARG MMCV="2.0.0rc1"
ARG MMSEG="1.0.0rc1"
ARG MMCV="2.0.0rc3"
ARG MMSEG="1.0.0rc2"
ENV PYTHONUNBUFFERED TRUE

View File

@ -1,5 +1,13 @@
# Data Transforms
In this tutorial, we introduce the design of transforms pipeline in MMSegmentation.
The structure of this guide is as follows:
- [Data Transforms](#data-transforms)
- [Design of Data pipelines](#design-of-data-pipelines)
- [Customization data transformation](#customization-data-transformation)
## Design of Data pipelines
Following typical conventions, we use `Dataset` and `DataLoader` for data loading
@ -10,13 +18,31 @@ we introduce a new `DataContainer` type in MMCV to help collect and distribute
data of different size.
See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details.
The data preparation pipeline and the dataset is decomposed. Usually a dataset
In the 1.x version of MMSegmentation, all data transformations inherit from [`BaseTransform`](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/transforms/base.py#L6).
The input and output types of transformations are both dict. A simple example is as follows:
```python
>>> from mmseg.datasets.transforms import LoadAnnotations
>>> transforms = LoadAnnotations()
>>> img_path = './data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png'
>>> gt_path = './data/cityscapes/gtFine/train/aachen/aachen_000015_000019_gtFine_instanceTrainIds.png'
>>> results = dict(
>>> img_path=img_path,
>>> seg_map_path=gt_path,
>>> reduce_zero_label=False,
>>> seg_fields=[])
>>> data_dict = transforms(results)
>>> print(data_dict.keys())
dict_keys(['img_path', 'seg_map_path', 'reduce_zero_label', 'seg_fields', 'gt_seg_map'])
```
The data preparation pipeline and the dataset are decomposed. Usually a dataset
defines how to process the annotations and a data pipeline defines all the steps to prepare a data dict.
A pipeline consists of a sequence of operations. Each operation takes a dict as input and also output a dict for the next transform.
A pipeline consists of a sequence of operations. Each operation takes a dict as input and also outputs a dict for the next transform.
The operations are categorized into data loading, pre-processing, formatting and test-time augmentation.
Here is an pipeline example for PSPNet.
Here is a pipeline example for PSPNet.
```python
crop_size = (512, 1024)
@ -37,53 +63,110 @@ test_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='Resize', scale=(2048, 1024), keep_ratio=True),
# add loading annotation after ``Resize`` because ground truth
# does not need to do resize data transform
# does not need to resize data transform
dict(type='LoadAnnotations'),
dict(type='PackSegInputs')
]
```
For each operation, we list the related dict fields that are added/updated/removed.
Before pipelines, the information we can directly obtain from the datasets are img_path, seg_map_path.
For each operation, we list the related dict fields that are `added`/`updated`/`removed`.
Before pipelines, the information we can directly obtain from the datasets are `img_path` and `seg_map_path`.
### Data loading
`LoadImageFromFile`
`LoadImageFromFile`: Load an image from file.
- add: img, img_shape, ori_shape
- add: `img`, `img_shape`, `ori_shape`
`LoadAnnotations`
`LoadAnnotations`: Load semantic segmentation maps provided by dataset.
- add: seg_fields, gt_seg_map
- add: `seg_fields`, `gt_seg_map`
### Pre-processing
`RandomResize`
`RandomResize`: Random resize image & segmentation map.
- add: scale, scale_factor, keep_ratio
- update: img, img_shape, gt_seg_map
- add: `scale`, `scale_factor`, `keep_ratio`
- update: `img`, `img_shape`, `gt_seg_map`
`Resize`
`Resize`: Resize image & segmentation map.
- add: scale, scale_factor, keep_ratio
- update: img, gt_seg_map, img_shape
- add: `scale`, `scale_factor`, `keep_ratio`
- update: `img`, `gt_seg_map`, `img_shape`
`RandomCrop`
`RandomCrop`: Random crop image & segmentation map.
- update: img, pad_shape, gt_seg_map
- update: `img`, `gt_seg_map`, `img_shape`.
`RandomFlip`
`RandomFlip`: Flip the image & segmentation map.
- add: flip, flip_direction
- update: img, gt_seg_map
- add: `flip`, `flip_direction`
- update: `img`, `gt_seg_map`
`PhotoMetricDistortion`
`PhotoMetricDistortion`: Apply photometric distortion to image sequentially,
every transformation is applied with a probability of 0.5.
The position of random contrast is in the second or second-to-last position (mode 0 or 1 below, respectively).
- update: img
```
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
```
- update: `img`
### Formatting
`PackSegInputs`
`PackSegInputs`: Pack the inputs data for the semantic segmentation.
- add: inputs, data_sample
- add: `inputs`, `data_sample`
- remove: keys specified by `meta_keys` (merged into the metainfo of data_sample), all other keys
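To see these key changes end to end, here is a minimal sketch (the Cityscapes paths are illustrative) that chains the transforms above by hand and inspects what `PackSegInputs` finally produces:
```python
from mmcv.transforms import LoadImageFromFile
from mmseg.datasets.transforms import LoadAnnotations, PackSegInputs

results = dict(
    img_path='data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png',
    seg_map_path='data/cityscapes/gtFine/train/aachen/aachen_000000_000019_gtFine_labelTrainIds.png',
    reduce_zero_label=False,
    seg_fields=[])
# apply the transforms one after another, as a pipeline would
for transform in [LoadImageFromFile(), LoadAnnotations(), PackSegInputs()]:
    results = transform(results)
print(results.keys())  # dict_keys(['inputs', 'data_samples'])
```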
## Customization data transformation
The customized data transformation must inherit from `BaseTransform` and implement the `transform` function.
Here we use a simple flipping transformation as an example:
```python
import random
import mmcv
from mmcv.transforms import BaseTransform, TRANSFORMS
@TRANSFORMS.register_module()
class MyFlip(BaseTransform):
def __init__(self, direction: str):
super().__init__()
self.direction = direction
def transform(self, results: dict) -> dict:
img = results['img']
results['img'] = mmcv.imflip(img, direction=self.direction)
return results
```
Thus, we can instantiate a `MyFlip` object and use it to process the data dict.
```python
import numpy as np
transform = MyFlip(direction='horizontal')
data_dict = {'img': np.random.rand(224, 224, 3)}
data_dict = transform(data_dict)
processed_img = data_dict['img']
```
Or, we can use the `MyFlip` transformation in the data pipeline in our config file.
```python
pipeline = [
...
dict(type='MyFlip', direction='horizontal'),
...
]
```
Note that if you want to use `MyFlip` in a config file, you must ensure the file containing `MyFlip` is imported at runtime, for example via `custom_imports` as in the sketch below.
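A minimal sketch of that import hook, assuming `MyFlip` lives in a hypothetical module `my_project.my_flip` (the module path is an assumption, not part of the original doc):
```python
# Add to the config file so MMEngine imports the module (and thereby registers
# MyFlip in the TRANSFORMS registry) before the pipeline is built.
custom_imports = dict(imports=['my_project.my_flip'], allow_failed_imports=False)
```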

View File

@ -1,5 +1,47 @@
# Changelog of v1.x
## v1.0.0rc2(6/12/2022)
### Highlights
- Support MaskFormer ([#2215](https://github.com/open-mmlab/mmsegmentation/pull/2215))
- Support Mask2Former ([#2255](https://github.com/open-mmlab/mmsegmentation/pull/2255))
### Features
- Add ResizeShortestEdge transform ([#2339](https://github.com/open-mmlab/mmsegmentation/pull/2339))
- Support padding in data pre-processor for model testing ([#2290](https://github.com/open-mmlab/mmsegmentation/pull/2290))
- Fix the problem of post-processing not removing padding ([#2367](https://github.com/open-mmlab/mmsegmentation/pull/2367))
### Bug fix
- Fix links in README ([#2024](https://github.com/open-mmlab/mmsegmentation/pull/2024))
- Fix swin load state_dict ([#2304](https://github.com/open-mmlab/mmsegmentation/pull/2304))
- Fix typo of BaseSegDataset docstring ([#2322](https://github.com/open-mmlab/mmsegmentation/pull/2322))
- Fix the bug in the visualization step ([#2326](https://github.com/open-mmlab/mmsegmentation/pull/2326))
- Fix ignore class id from -1 to 255 in BaseSegDataset ([#2332](https://github.com/open-mmlab/mmsegmentation/pull/2332))
- Fix KNet IterativeDecodeHead bug ([#2334](https://github.com/open-mmlab/mmsegmentation/pull/2334))
- Add input argument for datasets ([#2379](https://github.com/open-mmlab/mmsegmentation/pull/2379))
- Fix typo in warning on binary classification ([#2382](https://github.com/open-mmlab/mmsegmentation/pull/2382))
### Enhancement
- Fix ci for 1.x ([#2011](https://github.com/open-mmlab/mmsegmentation/pull/2011), [#2019](https://github.com/open-mmlab/mmsegmentation/pull/2019))
- Fix lint and pre-commit hook ([#2308](https://github.com/open-mmlab/mmsegmentation/pull/2308))
- Add `data` string in .gitignore file in dev-1.x branch ([#2336](https://github.com/open-mmlab/mmsegmentation/pull/2336))
- Make scipy as a default dependency in runtime ([#2362](https://github.com/open-mmlab/mmsegmentation/pull/2362))
- Delete mmcls in runtime.txt ([#2368](https://github.com/open-mmlab/mmsegmentation/pull/2368))
### Documentation
- Update configuration documentation ([#2048](https://github.com/open-mmlab/mmsegmentation/pull/2048))
- Update inference documentation ([#2052](https://github.com/open-mmlab/mmsegmentation/pull/2052))
- Update train test documentation ([#2061](https://github.com/open-mmlab/mmsegmentation/pull/2061))
- Update get started documentation ([#2148](https://github.com/open-mmlab/mmsegmentation/pull/2148))
- Update transforms documentation ([#2088](https://github.com/open-mmlab/mmsegmentation/pull/2088))
- Add MMEval projects like in README ([#2259](https://github.com/open-mmlab/mmsegmentation/pull/2259))
- Translate the visualization.md ([#2298](https://github.com/open-mmlab/mmsegmentation/pull/2298))
## v1.0.0rc1 (2/11/2022)
### Highlights

View File

@ -6,31 +6,32 @@ We list some common troubles faced by many users and their corresponding solutio
The compatible MMSegmentation and MMCV versions are as below. Please install the correct version of MMCV to avoid installation issues.
| MMSegmentation version | MMCV version | MMClassification version |
| :--------------------: | :-------------------------: | :----------------------: |
| 1.0.0rc1 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 |
| 1.0.0rc0 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 |
| master | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 |
| 0.24.1 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 |
| 0.23.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 |
| 0.22.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 |
| 0.21.1 | mmcv-full>=1.4.4, \<=1.6.0 | Not required |
| 0.20.2 | mmcv-full>=1.3.13, \<=1.6.0 | Not required |
| 0.19.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required |
| 0.18.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required |
| 0.17.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required |
| 0.16.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required |
| 0.15.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required |
| 0.14.1 | mmcv-full>=1.3.7, \<1.3.17 | Not required |
| 0.14.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required |
| 0.13.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required |
| 0.12.0 | mmcv-full>=1.1.4, \<1.3.2 | Not required |
| 0.11.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required |
| 0.10.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required |
| 0.9.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required |
| 0.8.0 | mmcv-full>=1.1.4, \<1.2.0 | Not required |
| 0.7.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required |
| 0.6.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required |
| MMSegmentation version | MMCV version | MMClassification (optional) version | MMDetection (optional) version |
| :--------------------: | :-------------------------: | :---------------------------------: | :----------------------------: |
| 1.0.0rc2 | mmcv >= 2.0.0rc3 | mmcls>=1.0.0rc0 | mmdet>=3.0.0rc4 |
| 1.0.0rc1 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 | Not required |
| 1.0.0rc0 | mmcv >= 2.0.0rc1 | mmcls>=1.0.0rc0 | Not required |
| master | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required |
| 0.24.1 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required |
| 0.23.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required |
| 0.22.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | Not required |
| 0.21.1 | mmcv-full>=1.4.4, \<=1.6.0 | Not required | Not required |
| 0.20.2 | mmcv-full>=1.3.13, \<=1.6.0 | Not required | Not required |
| 0.19.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | Not required |
| 0.18.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | Not required |
| 0.17.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required |
| 0.16.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required |
| 0.15.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required |
| 0.14.1 | mmcv-full>=1.3.7, \<1.3.17 | Not required | Not required |
| 0.14.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | Not required |
| 0.13.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | Not required |
| 0.12.0 | mmcv-full>=1.1.4, \<1.3.2 | Not required | Not required |
| 0.11.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | Not required |
| 0.10.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | Not required |
| 0.9.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | Not required |
| 0.8.0 | mmcv-full>=1.1.4, \<1.2.0 | Not required | Not required |
| 0.7.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | Not required |
| 0.6.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | Not required |
## How to know the number of GPUs needed to train the model

View File

@ -1 +1,173 @@
# Visualization
MMSegmentation 1.x provides a convenient way to monitor the training status and to visualize data during model prediction.
## Training status monitoring
MMSegmentation 1.x uses TensorBoard to monitor the training status.
### TensorBoard configuration
Install TensorBoard following the [official installation guide](https://www.tensorflow.org/install); the concrete steps are as follows:
```shell
pip install tensorboardX
pip install future tensorboard
```
Add `TensorboardVisBackend` to `vis_backends` in the config file `default_runtime.py`:
```python
vis_backends = [dict(type='LocalVisBackend'),
                dict(type='TensorboardVisBackend')]
visualizer = dict(
    type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer')
```
### Inspecting scalars in TensorBoard
Launch a training experiment with the following command:
```shell
python tools/train.py configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py --work-dir work_dir/test_visual
```
After training starts, find the `vis_data` path inside `work_dir`; for example, the vis_data path of this particular run is as follows:
```shell
work_dirs/test_visual/20220810_115248/vis_data
```
The scalar files in the vis_data path include the learning rate, losses, data_time and so on, and also record the metric results. You can refer to the [logging tutorial](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/logging.html) in MMEngine for help with logging your own custom data. The TensorBoard visualization is launched with the following command:
```shell
tensorboard --logdir work_dirs/test_visual/20220810_115248/vis_data
```
## Data and results visualization
### Visualizing data samples during model testing or validation
MMSegmentation provides `SegVisualizationHook`, a [hook](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/hook.html) that can visualize the ground truth and the predicted segmentation results during model testing and validation. Its configuration lives in `default_hooks`; please see the [Runner tutorial](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/runner.html) for more details.
For example, in `_base_/schedules/schedule_20k.py`, modify the `SegVisualizationHook` configuration and set `draw` to `True` to enable storing the network inference results. `interval` indicates the sampling interval of the prediction results; when it is set to 1, every inference result of the network will be saved. `interval` defaults to 50:
```python
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook', draw=True, interval=1))
```
After launching a training experiment, the visualization results are stored in a local folder during the validation loop, and when evaluating a model on a dataset, the prediction results are stored locally as well. The locally stored visualization results are kept in `vis_image` under `$WORK_DIRS/vis_data`, e.g.:
```shell
work_dirs/test_visual/20220810_115248/vis_data/vis_image
```
In addition, if `TensorboardVisBackend` is added to `vis_backends`, as in [TensorBoard configuration](#tensorboard-configuration), we can also run the following command to view the results in TensorBoard:
```shell
tensorboard --logdir work_dirs/test_visual/20220810_115248/vis_data
```
### Visualizing a single data sample
If you want to visualize a single data sample, we recommend using `SegLocalVisualizer`.
`SegLocalVisualizer` is a subclass of the `Visualizer` class in MMEngine and is designed for MMSegmentation visualization. For details about `Visualizer`, please refer to the [visualization tutorial](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html) in MMEngine.
Here is an example of `SegLocalVisualizer`. First, you can download the data used in this example with the following commands:
<div align=center>
<img src="https://user-images.githubusercontent.com/24582831/189833109-eddad58f-f777-4fc0-b98a-6bd429143b06.png" width="70%"/>
</div>
```shell
wget https://user-images.githubusercontent.com/24582831/189833109-eddad58f-f777-4fc0-b98a-6bd429143b06.png --output-document aachen_000000_000019_leftImg8bit.png
wget https://user-images.githubusercontent.com/24582831/189833143-15f60f8a-4d1e-4cbb-a6e7-5e2233869fac.png --output-document aachen_000000_000019_gtFine_labelTrainIds.png
```
Then you can find their local paths and visualize them with the script below:
```python
import mmcv
import os.path as osp
import torch
# `PixelData` is the data structure in MMEngine for pixel-level annotations or predictions.
# Please refer to the MMEngine data structure tutorial:
# https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/data_element.html#pixeldata
from mmengine.structures import PixelData
# `SegDataSample` is the data structure interface between different components in MMSegmentation.
# It includes the ground truth, the semantic segmentation prediction and the predicted logits.
# For details, please refer to the `SegDataSample` tutorial:
# https://github.com/open-mmlab/mmsegmentation/blob/1.x/docs/en/advanced_guides/structures.md
from mmseg.structures import SegDataSample
from mmseg.visualization import SegLocalVisualizer

out_file = 'out_file_cityscapes'
save_dir = './work_dirs'

image = mmcv.imread(
    osp.join(
        osp.dirname(__file__),
        './aachen_000000_000019_leftImg8bit.png'
    ),
    'color')
sem_seg = mmcv.imread(
    osp.join(
        osp.dirname(__file__),
        './aachen_000000_000019_gtFine_labelTrainIds.png'  # noqa
    ),
    'unchanged')
sem_seg = torch.from_numpy(sem_seg)
gt_sem_seg_data = dict(data=sem_seg)
gt_sem_seg = PixelData(**gt_sem_seg_data)
data_sample = SegDataSample()
data_sample.gt_sem_seg = gt_sem_seg

seg_local_visualizer = SegLocalVisualizer(
    vis_backends=[dict(type='LocalVisBackend')],
    save_dir=save_dir)

# The meta information of a dataset usually includes `classes` for class names and
# `palette` for the color of each foreground class.
# All class names and palettes are defined in this file:
# https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/utils/class_names.py
seg_local_visualizer.dataset_meta = dict(
    classes=('road', 'sidewalk', 'building', 'wall', 'fence',
             'pole', 'traffic light', 'traffic sign',
             'vegetation', 'terrain', 'sky', 'person', 'rider',
             'car', 'truck', 'bus', 'train', 'motorcycle',
             'bicycle'),
    palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70],
             [102, 102, 156], [190, 153, 153], [153, 153, 153],
             [250, 170, 30], [220, 220, 0], [107, 142, 35],
             [152, 251, 152], [70, 130, 180], [220, 20, 60],
             [255, 0, 0], [0, 0, 142], [0, 0, 70],
             [0, 60, 100], [0, 80, 100], [0, 0, 230],
             [119, 11, 32]])

# When `show=True`, the results are displayed directly;
# when `show=False`, the results are saved to a local folder.
seg_local_visualizer.add_datasample(out_file, image,
                                    data_sample, show=False)
```
The visualized image and its corresponding ground truth image can be found at `./work_dirs/vis_data/vis_image/`; the file name is `out_file_cityscapes_0.png`:
<div align=center>
<img src="https://user-images.githubusercontent.com/24582831/189835713-c0534054-4bfa-4b75-9254-0afbeb5ff02e.png" width="70%"/>
</div>
If you would like more guidance on how to use visualization, you can refer to the [visualization tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/visualization.html) in MMEngine.

View File

@ -7,7 +7,7 @@ from packaging.version import parse
from .version import __version__, version_info
MMCV_MIN = '2.0.0rc1'
MMCV_MIN = '2.0.0rc3'
MMCV_MAX = '2.1.0'
MMENGINE_MIN = '0.1.0'
MMENGINE_MAX = '1.0.0'

View File

@ -22,7 +22,8 @@ from .transforms import (CLAHE, AdjustGamma, GenerateEdge, LoadAnnotations,
LoadBiomedicalImageFromFile, LoadImageFromNDArray,
PackSegInputs, PhotoMetricDistortion, RandomCrop,
RandomCutOut, RandomMosaic, RandomRotate, Rerange,
ResizeToMultiple, RGB2Gray, SegRescale)
ResizeShortestEdge, ResizeToMultiple, RGB2Gray,
SegRescale)
from .voc import PascalVOCDataset
__all__ = [
@ -36,5 +37,5 @@ __all__ = [
'RandomCutOut', 'RandomMosaic', 'PackSegInputs', 'ResizeToMultiple',
'LoadImageFromNDArray', 'LoadBiomedicalImageFromFile',
'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge',
'DecathlonDataset', 'LIPDataset'
'DecathlonDataset', 'LIPDataset', 'ResizeShortestEdge'
]

View File

@ -80,9 +80,13 @@ class ADE20KDataset(BaseSegDataset):
[184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
[102, 255, 0], [92, 0, 255]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.jpg',
seg_map_suffix='.png',
reduce_zero_label=True,
**kwargs) -> None:
super().__init__(
img_suffix='.jpg',
seg_map_suffix='.png',
reduce_zero_label=True,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)
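Since `img_suffix`, `seg_map_suffix` and `reduce_zero_label` are now exposed as constructor arguments, a dataset whose files use non-default suffixes can be configured directly. A hedged sketch (the `data_root`, prefixes and pipeline name below are illustrative, not taken from the change itself):
```python
train_dataloader = dict(
    batch_size=4,
    dataset=dict(
        type='ADE20KDataset',
        data_root='data/my_ade20k_like_dataset',   # hypothetical dataset layout
        data_prefix=dict(
            img_path='images/training', seg_map_path='annotations/training'),
        img_suffix='.png',          # override the former hard-coded '.jpg'
        seg_map_suffix='.png',
        reduce_zero_label=True,
        pipeline=train_pipeline))   # train_pipeline defined elsewhere in the config
```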

View File

@ -65,7 +65,7 @@ class BaseSegDataset(BaseDataset):
instantiation. In some cases, such as visualization, only the meta
information of the dataset is needed, which is not necessary to
load annotation file. ``Basedataset`` can skip load annotations to
save time by set ``lazy_init=False``. Defaults to False.
save time by set ``lazy_init=True``. Defaults to False.
max_refetch (int, optional): If ``Basedataset.prepare_data`` get a
None img. The maximum extra number of cycles to get a valid
image. Defaults to 1000.
@ -179,7 +179,7 @@ class BaseSegDataset(BaseDataset):
f'subset of classes {old_classes} in METAINFO.')
for i, c in enumerate(old_classes):
if c not in new_classes:
label_map[i] = -1
label_map[i] = 255
else:
label_map[i] = new_classes.index(c)
return label_map

View File

@ -17,10 +17,14 @@ class ChaseDB1Dataset(BaseSegDataset):
classes=('background', 'vessel'),
palette=[[120, 120, 120], [6, 230, 230]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='_1stHO.png',
reduce_zero_label=False,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='_1stHO.png',
reduce_zero_label=False,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)
assert self.file_client.exists(self.data_prefix['img_path'])

View File

@ -91,6 +91,9 @@ class COCOStuffDataset(BaseSegDataset):
[192, 192, 0], [128, 64, 96], [192, 32, 64], [192, 64, 128],
[64, 192, 96], [64, 160, 64], [64, 64, 0]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.jpg',
seg_map_suffix='_labelTrainIds.png',
**kwargs) -> None:
super().__init__(
img_suffix='.jpg', seg_map_suffix='_labelTrainIds.png', **kwargs)
img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)

View File

@ -7,8 +7,9 @@ from .cityscapes import CityscapesDataset
class DarkZurichDataset(CityscapesDataset):
"""DarkZurichDataset dataset."""
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='_rgb_anon.png',
seg_map_suffix='_gt_labelTrainIds.png',
**kwargs) -> None:
super().__init__(
img_suffix='_rgb_anon.png',
seg_map_suffix='_gt_labelTrainIds.png',
**kwargs)
img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)

View File

@ -17,10 +17,14 @@ class DRIVEDataset(BaseSegDataset):
classes=('background', 'vessel'),
palette=[[120, 120, 120], [6, 230, 230]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='_manual1.png',
reduce_zero_label=False,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='_manual1.png',
reduce_zero_label=False,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)
assert self.file_client.exists(self.data_prefix['img_path'])

View File

@ -17,10 +17,14 @@ class HRFDataset(BaseSegDataset):
classes=('background', 'vessel'),
palette=[[120, 120, 120], [6, 230, 230]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=False,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=False,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)
assert self.file_client.exists(self.data_prefix['img_path'])

View File

@ -23,10 +23,14 @@ class iSAIDDataset(BaseSegDataset):
[0, 0, 127], [0, 0, 191], [0, 0, 255], [0, 191, 127],
[0, 127, 191], [0, 127, 255], [0, 100, 155]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='_instance_color_RGB.png',
ignore_index=255,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='_instance_color_RGB.png',
ignore_index=255,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
ignore_index=ignore_index,
**kwargs)
assert self.file_client.exists(self.data_prefix['img_path'])

View File

@ -17,9 +17,13 @@ class ISPRSDataset(BaseSegDataset):
palette=[[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0],
[255, 255, 0], [255, 0, 0]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)

View File

@ -39,5 +39,9 @@ class LIPDataset(BaseSegDataset):
[255, 170, 0],
))
def __init__(self, **kwargs) -> None:
super().__init__(img_suffix='.jpg', seg_map_suffix='.png', **kwargs)
def __init__(self,
img_suffix='.jpg',
seg_map_suffix='.png',
**kwargs) -> None:
super().__init__(
img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)

View File

@ -17,9 +17,13 @@ class LoveDADataset(BaseSegDataset):
palette=[[255, 255, 255], [255, 0, 0], [255, 255, 0], [0, 0, 255],
[159, 129, 183], [0, 255, 0], [255, 195, 128]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)

View File

@ -7,8 +7,9 @@ from .cityscapes import CityscapesDataset
class NightDrivingDataset(CityscapesDataset):
"""NightDrivingDataset dataset."""
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='_leftImg8bit.png',
seg_map_suffix='_gtCoarse_labelTrainIds.png',
**kwargs) -> None:
super().__init__(
img_suffix='_leftImg8bit.png',
seg_map_suffix='_gtCoarse_labelTrainIds.png',
**kwargs)
img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)

View File

@ -45,10 +45,14 @@ class PascalContextDataset(BaseSegDataset):
[255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
[255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]])
def __init__(self, ann_file: str, **kwargs) -> None:
def __init__(self,
ann_file: str,
img_suffix='.jpg',
seg_map_suffix='.png',
**kwargs) -> None:
super().__init__(
img_suffix='.jpg',
seg_map_suffix='.png',
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
ann_file=ann_file,
reduce_zero_label=False,
**kwargs)
@ -95,12 +99,17 @@ class PascalContextDataset59(BaseSegDataset):
[255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
[255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]])
def __init__(self, ann_file: str, **kwargs):
def __init__(self,
ann_file: str,
img_suffix='.jpg',
seg_map_suffix='.png',
reduce_zero_label=True,
**kwargs):
super().__init__(
img_suffix='.jpg',
seg_map_suffix='.png',
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
ann_file=ann_file,
reduce_zero_label=True,
reduce_zero_label=reduce_zero_label,
**kwargs)
assert self.file_client.exists(
self.data_prefix['img_path']) and osp.isfile(self.ann_file)

View File

@ -17,9 +17,13 @@ class PotsdamDataset(BaseSegDataset):
palette=[[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0],
[255, 255, 0], [255, 0, 0]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='.png',
reduce_zero_label=True,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)

View File

@ -16,10 +16,14 @@ class STAREDataset(BaseSegDataset):
classes=('background', 'vessel'),
palette=[[120, 120, 120], [6, 230, 230]])
def __init__(self, **kwargs) -> None:
def __init__(self,
img_suffix='.png',
seg_map_suffix='.ah.png',
reduce_zero_label=False,
**kwargs) -> None:
super().__init__(
img_suffix='.png',
seg_map_suffix='.ah.png',
reduce_zero_label=False,
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
reduce_zero_label=reduce_zero_label,
**kwargs)
assert self.file_client.exists(self.data_prefix['img_path'])

View File

@ -5,13 +5,15 @@ from .loading import (LoadAnnotations, LoadBiomedicalAnnotation,
LoadImageFromNDArray)
from .transforms import (CLAHE, AdjustGamma, GenerateEdge,
PhotoMetricDistortion, RandomCrop, RandomCutOut,
RandomMosaic, RandomRotate, Rerange, ResizeToMultiple,
RGB2Gray, SegRescale)
RandomMosaic, RandomRotate, Rerange,
ResizeShortestEdge, ResizeToMultiple, RGB2Gray,
SegRescale)
__all__ = [
'LoadAnnotations', 'RandomCrop', 'SegRescale', 'PhotoMetricDistortion',
'RandomRotate', 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray',
'RandomCutOut', 'RandomMosaic', 'PackSegInputs', 'ResizeToMultiple',
'LoadImageFromNDArray', 'LoadBiomedicalImageFromFile',
'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge'
'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge',
'ResizeShortestEdge'
]

View File

@ -1226,3 +1226,87 @@ class GenerateEdge(BaseTransform):
repr_str += f'edge_width={self.edge_width}, '
repr_str += f'ignore_index={self.ignore_index})'
return repr_str
@TRANSFORMS.register_module()
class ResizeShortestEdge(BaseTransform):
"""Resize the image and mask while keeping the aspect ratio unchanged.
Modified from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/augmentation_impl.py#L130 # noqa:E501
Copyright (c) Facebook, Inc. and its affiliates.
Licensed under the Apache-2.0 License
This transform attempts to scale the shorter edge to the given
`scale`, as long as the longer edge does not exceed `max_size`.
If `max_size` is reached, then downscale so that the longer
edge does not exceed `max_size`.
Required Keys:
- img
- gt_seg_map (optional)
Modified Keys:
- img
- img_shape
- gt_seg_map (optional))
Added Keys:
- scale
- scale_factor
- keep_ratio
Args:
scale (Union[int, Tuple[int, int]]): The target short edge length.
If it's tuple, will select the min value as the short edge length.
max_size (int): The maximum allowed longest edge length.
"""
def __init__(self, scale: Union[int, Tuple[int, int]],
max_size: int) -> None:
super().__init__()
self.scale = scale
self.max_size = max_size
# Create an empty Resize object
self.resize = TRANSFORMS.build({
'type': 'Resize',
'scale': 0,
'keep_ratio': True
})
def _get_output_shape(self, img, short_edge_length) -> Tuple[int, int]:
"""Compute the target image shape with the given `short_edge_length`.
Args:
img (np.ndarray): The input image.
short_edge_length (Union[int, Tuple[int, int]]): The target short
edge length. If it's tuple, will select the min value as the
short edge length.
"""
h, w = img.shape[:2]
if isinstance(short_edge_length, int):
size = short_edge_length * 1.0
elif isinstance(short_edge_length, tuple):
size = min(short_edge_length) * 1.0
scale = size / min(h, w)
if h < w:
new_h, new_w = size, scale * w
else:
new_h, new_w = scale * h, size
if max(new_h, new_w) > self.max_size:
scale = self.max_size * 1.0 / max(new_h, new_w)
new_h *= scale
new_w *= scale
new_h = int(new_h + 0.5)
new_w = int(new_w + 0.5)
return (new_w, new_h)
def transform(self, results: Dict) -> Dict:
self.resize.scale = self._get_output_shape(results['img'], self.scale)
return self.resize(results)
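For reference, a hedged sketch of how `ResizeShortestEdge` could be placed in a test pipeline; the `scale` and `max_size` values here are illustrative rather than taken from a shipped config:
```python
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # scale the short edge to 1024 pixels, but cap the long edge at 2560 pixels
    dict(type='ResizeShortestEdge', scale=1024, max_size=2560),
    dict(type='LoadAnnotations'),
    dict(type='PackSegInputs')
]
```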

View File

@ -24,10 +24,14 @@ class PascalVOCDataset(BaseSegDataset):
[0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
[0, 64, 128]])
def __init__(self, ann_file, **kwargs) -> None:
def __init__(self,
ann_file,
img_suffix='.jpg',
seg_map_suffix='.png',
**kwargs) -> None:
super().__init__(
img_suffix='.jpg',
seg_map_suffix='.png',
img_suffix=img_suffix,
seg_map_suffix=seg_map_suffix,
ann_file=ann_file,
**kwargs)
assert self.file_client.exists(

View File

@ -11,6 +11,7 @@ from mmengine.model import BaseModule, ModuleList
from mmengine.model.weight_init import (constant_init, kaiming_init,
trunc_normal_)
from mmengine.runner.checkpoint import _load_checkpoint
from scipy import interpolate
from torch.nn.modules.batchnorm import _BatchNorm
from torch.nn.modules.utils import _pair as to_2tuple
@ -18,11 +19,6 @@ from mmseg.registry import MODELS
from ..utils import PatchEmbed
from .vit import TransformerEncoderLayer as VisionTransformerEncoderLayer
try:
from scipy import interpolate
except ImportError:
interpolate = None
class BEiTAttention(BaseModule):
"""Window based multi-head self-attention (W-MSA) module with relative

View File

@ -13,7 +13,7 @@ from mmengine.logging import print_log
from mmengine.model import BaseModule, ModuleList
from mmengine.model.weight_init import (constant_init, trunc_normal_,
trunc_normal_init)
from mmengine.runner import CheckpointLoader, load_state_dict
from mmengine.runner import CheckpointLoader
from mmengine.utils import to_2tuple
from mmseg.registry import MODELS
@ -732,7 +732,7 @@ class SwinTransformer(BaseModule):
nH2, L2).permute(1, 0).contiguous()
# load state_dict
load_state_dict(self, state_dict, strict=False, logger=None)
self.load_state_dict(state_dict, strict=False)
def forward(self, x):
x, hw_shape = self.patch_embed(x)

View File

@ -48,18 +48,24 @@ class SegDataPreProcessor(BaseDataPreprocessor):
rgb_to_bgr (bool): whether to convert image from RGB to BGR.
Defaults to False.
batch_augments (list[dict], optional): Batch-level augmentations
test_cfg (dict, optional): The padding size config in testing, if not
specify, will use `size` and `size_divisor` params as default.
Defaults to None, only supports keys `size` or `size_divisor`.
"""
def __init__(self,
mean: Sequence[Number] = None,
std: Sequence[Number] = None,
size: Optional[tuple] = None,
size_divisor: Optional[int] = None,
pad_val: Number = 0,
seg_pad_val: Number = 255,
bgr_to_rgb: bool = False,
rgb_to_bgr: bool = False,
batch_augments: Optional[List[dict]] = None):
def __init__(
self,
mean: Sequence[Number] = None,
std: Sequence[Number] = None,
size: Optional[tuple] = None,
size_divisor: Optional[int] = None,
pad_val: Number = 0,
seg_pad_val: Number = 255,
bgr_to_rgb: bool = False,
rgb_to_bgr: bool = False,
batch_augments: Optional[List[dict]] = None,
test_cfg: dict = None,
):
super().__init__()
self.size = size
self.size_divisor = size_divisor
@ -86,6 +92,9 @@ class SegDataPreProcessor(BaseDataPreprocessor):
# TODO: support batch augmentations.
self.batch_augments = batch_augments
# Support different padding methods in testing
self.test_cfg = test_cfg
def forward(self, data: dict, training: bool = False) -> Dict[str, Any]:
"""Perform normalization、padding and bgr2rgb conversion based on
``BaseDataPreprocessor``.
@ -122,10 +131,21 @@ class SegDataPreProcessor(BaseDataPreprocessor):
if self.batch_augments is not None:
inputs, data_samples = self.batch_augments(
inputs, data_samples)
return dict(inputs=inputs, data_samples=data_samples)
else:
assert len(inputs) == 1, (
'Batch inference is not support currently, '
'as the image size might be different in a batch')
return dict(
inputs=torch.stack(inputs, dim=0), data_samples=data_samples)
# pad images when testing
if self.test_cfg:
inputs, padded_samples = stack_batch(
inputs=inputs,
size=self.test_cfg.get('size', None),
size_divisor=self.test_cfg.get('size_divisor', None),
pad_val=self.pad_val,
seg_pad_val=self.seg_pad_val)
for data_sample, pad_info in zip(data_samples, padded_samples):
data_sample.set_metainfo({**pad_info})
else:
inputs = torch.stack(inputs, dim=0)
return dict(inputs=inputs, data_samples=data_samples)
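A hedged sketch of how the new `test_cfg` padding option might be used in a config; the size values and divisor are illustrative assumptions, not taken from a specific config in this change:
```python
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255,
    size=(512, 512),                 # training-time padding size
    test_cfg=dict(size_divisor=32))  # pad test inputs to a multiple of 32
```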

View File

@ -15,6 +15,8 @@ from .gc_head import GCHead
from .isa_head import ISAHead
from .knet_head import IterativeDecodeHead, KernelUpdateHead, KernelUpdator
from .lraspp_head import LRASPPHead
from .mask2former_head import Mask2FormerHead
from .maskformer_head import MaskFormerHead
from .nl_head import NLHead
from .ocr_head import OCRHead
from .point_head import PointHead
@ -36,5 +38,5 @@ __all__ = [
'PointHead', 'APCHead', 'DMHead', 'LRASPPHead', 'SETRUPHead',
'SETRMLAHead', 'DPTHead', 'SETRMLAHead', 'SegmenterMaskTransformerHead',
'SegformerHead', 'ISAHead', 'STDCHead', 'IterativeDecodeHead',
'KernelUpdateHead', 'KernelUpdator'
'KernelUpdateHead', 'KernelUpdator', 'MaskFormerHead', 'Mask2FormerHead'
]

View File

@ -120,7 +120,7 @@ class BaseDecodeHead(BaseModule, metaclass=ABCMeta):
warnings.warn('For binary segmentation, we suggest using'
'`out_channels = 1` to define the output'
'channels of segmentor, and use `threshold`'
'to convert seg_logist into a prediction'
'to convert `seg_logits` into a prediction'
'applying a threshold')
out_channels = num_classes

View File

@ -413,6 +413,9 @@ class IterativeDecodeHead(BaseDecodeHead):
def __init__(self, num_stages, kernel_generate_head, kernel_update_head,
**kwargs):
# ``IterativeDecodeHead`` would skip initialization of
# ``BaseDecodeHead`` which would be called when building
# ``self.kernel_generate_head``.
super(BaseDecodeHead, self).__init__(**kwargs)
assert num_stages == len(kernel_update_head)
self.num_stages = num_stages
@ -422,6 +425,7 @@ class IterativeDecodeHead(BaseDecodeHead):
self.num_classes = self.kernel_generate_head.num_classes
self.input_transform = self.kernel_generate_head.input_transform
self.ignore_index = self.kernel_generate_head.ignore_index
self.out_channels = self.num_classes
for head_cfg in kernel_update_head:
self.kernel_update_head.append(MODELS.build(head_cfg))

View File

@ -0,0 +1,162 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
try:
from mmdet.models.dense_heads import \
Mask2FormerHead as MMDET_Mask2FormerHead
except ModuleNotFoundError:
MMDET_Mask2FormerHead = None
from mmengine.structures import InstanceData
from torch import Tensor
from mmseg.registry import MODELS
from mmseg.structures.seg_data_sample import SegDataSample
from mmseg.utils import ConfigType, SampleList
@MODELS.register_module()
class Mask2FormerHead(MMDET_Mask2FormerHead):
"""Implements the Mask2Former head.
See `Mask2Former: Masked-attention Mask Transformer for Universal Image
Segmentation <https://arxiv.org/abs/2112.01527>`_ for details.
Args:
num_classes (int): Number of classes. Default: 150.
align_corners (bool): align_corners argument of F.interpolate.
Default: False.
ignore_index (int): The label index to be ignored. Default: 255.
"""
def __init__(self,
num_classes,
align_corners=False,
ignore_index=255,
**kwargs):
super().__init__(**kwargs)
self.num_classes = num_classes
self.align_corners = align_corners
self.out_channels = num_classes
self.ignore_index = ignore_index
feat_channels = kwargs['feat_channels']
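# Rebuild the classification layer so its output size is ``num_classes + 1``;
# the extra channel acts as the "no object" class dropped at inference.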
self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
def _seg_data_to_instance_data(self, batch_data_samples: SampleList):
"""Perform forward propagation to convert paradigm from MMSegmentation
to MMDetection to ensure ``MMDET_Mask2FormerHead`` could be called
normally. Specifically, ``batch_gt_instances`` would be added.
Args:
batch_data_samples (List[:obj:`SegDataSample`]): The Data
Samples. It usually includes information such as
`gt_sem_seg`.
Returns:
tuple: A tuple containing two lists.
- batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``labels``, the unique
ground truth label ids of the image with shape (num_gt, ),
and ``masks``, the ground truth mask of each instance with
shape (num_gt, h, w).
- batch_img_metas (list[dict]): List of image meta information.
"""
batch_img_metas = []
batch_gt_instances = []
for data_sample in batch_data_samples:
batch_img_metas.append(data_sample.metainfo)
gt_sem_seg = data_sample.gt_sem_seg.data
classes = torch.unique(
gt_sem_seg,
sorted=False,
return_inverse=False,
return_counts=False)
# remove ignored region
gt_labels = classes[classes != self.ignore_index]
masks = []
for class_id in gt_labels:
masks.append(gt_sem_seg == class_id)
if len(masks) == 0:
gt_masks = torch.zeros(
(0, gt_sem_seg.shape[-2],
gt_sem_seg.shape[-1])).to(gt_sem_seg).long()
else:
gt_masks = torch.stack(masks).squeeze(1).long()
instance_data = InstanceData(labels=gt_labels, masks=gt_masks)
batch_gt_instances.append(instance_data)
return batch_gt_instances, batch_img_metas
def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
train_cfg: ConfigType) -> dict:
"""Perform forward propagation and loss calculation of the decoder head
on the features of the upstream network.
Args:
x (tuple[Tensor]): Multi-level features from the upstream
network, each is a 4D-tensor.
batch_data_samples (List[:obj:`SegDataSample`]): The Data
Samples. It usually includes information such as
`gt_sem_seg`.
train_cfg (ConfigType): Training config.
Returns:
dict[str, Tensor]: a dictionary of loss components.
"""
# batch SegDataSample to InstanceData
batch_gt_instances, batch_img_metas = self._seg_data_to_instance_data(
batch_data_samples)
# forward
all_cls_scores, all_mask_preds = self(x, batch_data_samples)
# loss
losses = self.loss_by_feat(all_cls_scores, all_mask_preds,
batch_gt_instances, batch_img_metas)
return losses
def predict(self, x: Tuple[Tensor], batch_img_metas: List[dict],
test_cfg: ConfigType) -> Tuple[Tensor]:
"""Test without augmentaton.
Args:
x (tuple[Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
batch_img_metas (List[dict]): List of image meta information,
such as `img_shape` and `pad_shape`.
test_cfg (ConfigType): Test config.
Returns:
Tensor: A tensor of segmentation mask.
"""
batch_data_samples = [
SegDataSample(metainfo=metainfo) for metainfo in batch_img_metas
]
all_cls_scores, all_mask_preds = self(x, batch_data_samples)
mask_cls_results = all_cls_scores[-1]
mask_pred_results = all_mask_preds[-1]
if 'pad_shape' in batch_img_metas[0]:
size = batch_img_metas[0]['pad_shape']
else:
size = batch_img_metas[0]['img_shape']
# upsample mask
mask_pred_results = F.interpolate(
mask_pred_results, size=size, mode='bilinear', align_corners=False)
cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1]
mask_pred = mask_pred_results.sigmoid()
seg_logits = torch.einsum('bqc, bqhw->bchw', cls_score, mask_pred)
return seg_logits
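The last two lines of ``predict`` fuse per-query class probabilities with per-query mask probabilities into dense per-class logits. A shape-only sketch of that einsum (sizes below are illustrative):

import torch
import torch.nn.functional as F

batch, queries, classes, h, w = 2, 100, 150, 64, 64
mask_cls_results = torch.randn(batch, queries, classes + 1)  # incl. "no object"
mask_pred_results = torch.randn(batch, queries, h, w)

cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1]  # drop "no object"
mask_pred = mask_pred_results.sigmoid()
seg_logits = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred)
assert seg_logits.shape == (batch, classes, h, w)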

View File

@ -0,0 +1,173 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
try:
from mmdet.models.dense_heads import MaskFormerHead as MMDET_MaskFormerHead
except ModuleNotFoundError:
MMDET_MaskFormerHead = None
from mmengine.structures import InstanceData
from torch import Tensor
from mmseg.registry import MODELS
from mmseg.structures.seg_data_sample import SegDataSample
from mmseg.utils import ConfigType, SampleList
@MODELS.register_module()
class MaskFormerHead(MMDET_MaskFormerHead):
"""Implements the MaskFormer head.
See `Per-Pixel Classification is Not All You Need for Semantic Segmentation
<https://arxiv.org/pdf/2107.06278>`_ for details.
Args:
num_classes (int): Number of classes. Default: 150.
align_corners (bool): align_corners argument of F.interpolate.
Default: False.
ignore_index (int): The label index to be ignored. Default: 255.
"""
def __init__(self,
num_classes: int = 150,
align_corners: bool = False,
ignore_index: int = 255,
**kwargs) -> None:
super().__init__(**kwargs)
self.num_classes = num_classes
self.align_corners = align_corners
self.out_channels = num_classes
self.ignore_index = ignore_index
feat_channels = kwargs['feat_channels']
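# Rebuild the classification layer so its output size is ``num_classes + 1``;
# the extra channel acts as the "no object" class dropped at inference.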
self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1)
def _seg_data_to_instance_data(self, batch_data_samples: SampleList):
"""Perform forward propagation to convert paradigm from MMSegmentation
to MMDetection to ensure ``MMDET_MaskFormerHead`` could be called
normally. Specifically, ``batch_gt_instances`` would be added.
Args:
batch_data_samples (List[:obj:`SegDataSample`]): The Data
Samples. It usually includes information such as
`gt_sem_seg`.
Returns:
tuple: A tuple containing two lists.
- batch_gt_instances (list[:obj:`InstanceData`]): Batch of
gt_instance. It usually includes ``labels``, the unique
ground truth label ids of the image with shape (num_gt, ),
and ``masks``, the ground truth mask of each instance with
shape (num_gt, h, w).
- batch_img_metas (list[dict]): List of image meta information.
"""
batch_img_metas = []
batch_gt_instances = []
for data_sample in batch_data_samples:
# Add `batch_input_shape` in metainfo of data_sample, which would
# be used in MaskFormerHead of MMDetection.
metainfo = data_sample.metainfo
metainfo['batch_input_shape'] = metainfo['img_shape']
data_sample.set_metainfo(metainfo)
batch_img_metas.append(data_sample.metainfo)
gt_sem_seg = data_sample.gt_sem_seg.data
classes = torch.unique(
gt_sem_seg,
sorted=False,
return_inverse=False,
return_counts=False)
# remove ignored region
gt_labels = classes[classes != self.ignore_index]
masks = []
for class_id in gt_labels:
masks.append(gt_sem_seg == class_id)
if len(masks) == 0:
gt_masks = torch.zeros((0, gt_sem_seg.shape[-2],
gt_sem_seg.shape[-1])).to(gt_sem_seg)
else:
gt_masks = torch.stack(masks).squeeze(1)
instance_data = InstanceData(
labels=gt_labels, masks=gt_masks.long())
batch_gt_instances.append(instance_data)
return batch_gt_instances, batch_img_metas
def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
train_cfg: ConfigType) -> dict:
"""Perform forward propagation and loss calculation of the decoder head
on the features of the upstream network.
Args:
x (tuple[Tensor]): Multi-level features from the upstream
network, each is a 4D-tensor.
batch_data_samples (List[:obj:`SegDataSample`]): The Data
Samples. It usually includes information such as
`gt_sem_seg`.
train_cfg (ConfigType): Training config.
Returns:
dict[str, Tensor]: a dictionary of loss components.
"""
# batch SegDataSample to InstanceData
batch_gt_instances, batch_img_metas = self._seg_data_to_instance_data(
batch_data_samples)
# forward
all_cls_scores, all_mask_preds = self(x, batch_data_samples)
# loss
losses = self.loss_by_feat(all_cls_scores, all_mask_preds,
batch_gt_instances, batch_img_metas)
return losses
def predict(self, x: Tuple[Tensor], batch_img_metas: List[dict],
test_cfg: ConfigType) -> Tuple[Tensor]:
"""Test without augmentaton.
Args:
x (tuple[Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
batch_img_metas (List[dict]): List of image meta information,
such as `img_shape` and `pad_shape`.
test_cfg (ConfigType): Test config.
Returns:
Tensor: A tensor of segmentation mask.
"""
batch_data_samples = []
for metainfo in batch_img_metas:
metainfo['batch_input_shape'] = metainfo['img_shape']
batch_data_samples.append(SegDataSample(metainfo=metainfo))
# The forward function of MaskFormerHead from MMDetection needs
# 'batch_data_samples' as inputs, which here only carries the image shape.
all_cls_scores, all_mask_preds = self(x, batch_data_samples)
mask_cls_results = all_cls_scores[-1]
mask_pred_results = all_mask_preds[-1]
# upsample masks
img_shape = batch_img_metas[0]['batch_input_shape']
mask_pred_results = F.interpolate(
mask_pred_results,
size=img_shape,
mode='bilinear',
align_corners=False)
# semantic inference
cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1]
mask_pred = mask_pred_results.sigmoid()
seg_logits = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred)
return seg_logits
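Both heads build their ground-truth instances with the same semantic-to-instance conversion shown above. A tiny self-contained sketch of that step (label values are made up; ``ignore_index`` assumed to be 255):

import torch

ignore_index = 255
gt_sem_seg = torch.tensor([[[0, 0, 255],
                            [2, 2, 255],
                            [2, 0, 0]]])  # (1, H, W) semantic map
classes = torch.unique(gt_sem_seg, sorted=False)
gt_labels = classes[classes != ignore_index]  # unique labels, ignore removed
gt_masks = torch.stack([gt_sem_seg == c for c in gt_labels]).squeeze(1).long()
# one binary mask per remaining label, shape (num_gt, H, W)
assert gt_masks.shape == (gt_labels.numel(), 3, 3)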

View File

@ -159,12 +159,17 @@ class BaseSegmentor(BaseModel, metaclass=ABCMeta):
if not only_prediction:
img_meta = data_samples[i].metainfo
# remove padding area
padding_left, padding_right, padding_top, padding_bottom = \
img_meta.get('padding_size', [0]*4)
if 'img_padding_size' not in img_meta:
padding_size = img_meta.get('padding_size', [0] * 4)
else:
padding_size = img_meta['img_padding_size']
padding_left, padding_right, padding_top, padding_bottom =\
padding_size
# i_seg_logits shape is 1, C, H, W after remove padding
i_seg_logits = seg_logits[i:i + 1, :,
padding_top:H - padding_bottom,
padding_left:W - padding_right]
# resize as original shape
i_seg_logits = resize(
i_seg_logits,

View File

@ -105,6 +105,9 @@ def stack_batch(inputs: List[torch.Tensor],
})
padded_samples.append(data_sample)
else:
padded_samples = None
padded_samples.append(
dict(
img_padding_size=padding_size,
pad_shape=pad_img.shape[-2:]))
return torch.stack(padded_inputs, dim=0), padded_samples
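When no data samples are passed, the padding info is now returned as plain dicts. A minimal sketch, assuming ``stack_batch`` is importable from ``mmseg.utils`` as in the data preprocessor above:

import torch
from mmseg.utils import stack_batch

inputs = [torch.rand(3, 11, 10), torch.rand(3, 15, 18)]
batched, padded_samples = stack_batch(
    inputs=inputs, size_divisor=16, pad_val=0, seg_pad_val=255)
# Every image is padded to the same size, rounded up to the divisor.
assert batched.shape[-2] % 16 == 0 and batched.shape[-1] % 16 == 0
# Without data samples, padding info comes back as dicts (the new branch).
assert 'img_padding_size' in padded_samples[0]
assert padded_samples[0]['pad_shape'] == batched.shape[-2:]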

View File

@ -1,6 +1,6 @@
# Copyright (c) Open-MMLab. All rights reserved.
__version__ = '1.0.0rc1'
__version__ = '1.0.0rc2'
def parse_version_info(version_str):

View File

@ -25,6 +25,8 @@ Import:
- configs/isanet/isanet.yml
- configs/knet/knet.yml
- configs/mae/mae.yml
- configs/mask2former/mask2former.yml
- configs/maskformer/maskformer.yml
- configs/mobilenet_v2/mobilenet_v2.yml
- configs/mobilenet_v3/mobilenet_v3.yml
- configs/nonlocal_net/nonlocal_net.yml

View File

@ -1,3 +1,4 @@
mmcls>=1.0.0rc0
mmcv>=2.0.0rc1,<2.1.0
mmcv>=2.0.0rc3,<2.1.0
mmdet>=3.0.0rc4
mmengine>=0.1.0,<1.0.0
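The raised version floors can be checked at runtime with mmengine's version helper (a sketch under the assumption that ``digit_version`` is used the usual way; the PR itself only updates the requirement files):

import mmcv
import mmdet
from mmengine.utils import digit_version

assert digit_version(mmcv.__version__) >= digit_version('2.0.0rc3')
assert digit_version(mmdet.__version__) >= digit_version('3.0.0rc4')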

View File

@ -1,5 +1,5 @@
matplotlib
mmcls>=1.0.0rc0
numpy
packaging
prettytable
scipy

View File

@ -192,7 +192,6 @@ if __name__ == '__main__':
extras_require={
'all': parse_requirements('requirements.txt'),
'tests': parse_requirements('requirements/tests.txt'),
'build': parse_requirements('requirements/build.txt'),
'optional': parse_requirements('requirements/optional.txt'),
'mim': parse_requirements('requirements/mminstall.txt'),
},

View File

@ -10,6 +10,9 @@ from PIL import Image
from mmseg.datasets.transforms import * # noqa
from mmseg.datasets.transforms import PhotoMetricDistortion, RandomCrop
from mmseg.registry import TRANSFORMS
from mmseg.utils import register_all_modules
register_all_modules()
def test_resize():
@ -71,6 +74,34 @@ def test_resize():
resized_results = resize_module(results.copy())
assert max(resized_results['img_shape'][:2]) <= 1333 * 1.1
# test RandomChoiceResize whose `resize_type` is `ResizeShortestEdge`
transform = dict(
type='RandomChoiceResize',
scales=[128, 256, 512],
resize_type='ResizeShortestEdge',
max_size=1333)
resize_module = TRANSFORMS.build(transform)
resized_results = resize_module(results.copy())
assert resized_results['img_shape'][0] in [128, 256, 512]
transform = dict(
type='RandomChoiceResize',
scales=[512],
resize_type='ResizeShortestEdge',
max_size=512)
resize_module = TRANSFORMS.build(transform)
resized_results = resize_module(results.copy())
assert resized_results['img_shape'][1] == 512
transform = dict(
type='RandomChoiceResize',
scales=[(128, 256), (256, 512), (512, 1024)],
resize_type='ResizeShortestEdge',
max_size=1333)
resize_module = TRANSFORMS.build(transform)
resized_results = resize_module(results.copy())
assert resized_results['img_shape'][0] in [128, 256, 512]
# test scale=None and scale_factor is tuple.
# img shape: (288, 512, 3)
transform = dict(

View File

@ -140,8 +140,11 @@ def test_beit_init():
}
}
model = BEiT(img_size=(512, 512))
with pytest.raises(AttributeError):
model.resize_rel_pos_embed(ckpt)
# If scipy is installed, this AttributeError would not be raised.
from mmengine.utils import is_installed
if not is_installed('scipy'):
with pytest.raises(AttributeError):
model.resize_rel_pos_embed(ckpt)
# pretrained=None
# init_cfg=123, whose type is unsupported

View File

@ -138,8 +138,11 @@ def test_mae_init():
}
}
model = MAE(img_size=(512, 512))
with pytest.raises(AttributeError):
model.resize_rel_pos_embed(ckpt)
# If scipy is installed, this AttributeError would not be raised.
from mmengine.utils import is_installed
if not is_installed('scipy'):
with pytest.raises(AttributeError):
model.resize_rel_pos_embed(ckpt)
# test resize abs pos embed
ckpt = model.resize_abs_pos_embed(ckpt['state_dict'])

View File

@ -46,3 +46,19 @@ class TestSegDataPreProcessor(TestCase):
out = processor(data, training=True)
self.assertEqual(out['inputs'].shape, (2, 3, 20, 20))
self.assertEqual(len(out['data_samples']), 2)
# test predict with padding
processor = SegDataPreProcessor(
mean=[0, 0, 0],
std=[1, 1, 1],
size=(20, 20),
test_cfg=dict(size_divisor=15))
data = {
'inputs': [
torch.randint(0, 256, (3, 11, 10)),
],
'data_samples': [data_sample]
}
out = processor(data, training=False)
self.assertEqual(out['inputs'].shape[2] % 15, 0)
self.assertEqual(out['inputs'].shape[3] % 15, 0)

View File

@ -0,0 +1,160 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmengine import Config
from mmengine.structures import PixelData
from mmseg.models.decode_heads import Mask2FormerHead
from mmseg.structures import SegDataSample
from mmseg.utils import SampleList
from .utils import to_cuda
def test_mask2former_head():
num_classes = 19
cfg = dict(
in_channels=[96, 192, 384, 768],
strides=[4, 8, 16, 32],
feat_channels=256,
out_channels=256,
num_classes=num_classes,
num_queries=100,
num_transformer_feat_level=3,
align_corners=False,
pixel_decoder=dict(
type='mmdet.MSDeformAttnPixelDecoder',
num_outs=3,
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU'),
encoder=dict(
type='mmdet.DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='mmdet.BaseTransformerLayer',
attn_cfgs=dict(
type='mmdet.MultiScaleDeformableAttention',
embed_dims=256,
num_heads=8,
num_levels=3,
num_points=4,
im2col_step=64,
dropout=0.0,
batch_first=False,
norm_cfg=None,
init_cfg=None),
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.0,
act_cfg=dict(type='ReLU', inplace=True)),
operation_order=('self_attn', 'norm', 'ffn', 'norm')),
init_cfg=None),
positional_encoding=dict(
type='mmdet.SinePositionalEncoding',
num_feats=128,
normalize=True),
init_cfg=None),
enforce_decoder_input_project=False,
positional_encoding=dict(
type='mmdet.SinePositionalEncoding', num_feats=128,
normalize=True),
transformer_decoder=dict(
type='mmdet.DetrTransformerDecoder',
return_intermediate=True,
num_layers=9,
transformerlayers=dict(
type='mmdet.DetrTransformerDecoderLayer',
attn_cfgs=dict(
type='mmdet.MultiheadAttention',
embed_dims=256,
num_heads=8,
attn_drop=0.0,
proj_drop=0.0,
dropout_layer=None,
batch_first=False),
ffn_cfgs=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.0,
dropout_layer=None,
add_identity=True),
feedforward_channels=2048,
operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
'ffn', 'norm')),
init_cfg=None),
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=2.0,
reduction='mean',
class_weight=[1.0] * num_classes + [0.1]),
loss_mask=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=True,
reduction='mean',
loss_weight=5.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=5.0),
train_cfg=dict(
num_points=12544,
oversample_ratio=3.0,
importance_sample_ratio=0.75,
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=2.0),
dict(
type='mmdet.CrossEntropyLossCost',
weight=5.0,
use_sigmoid=True),
dict(
type='mmdet.DiceCost',
weight=5.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler')))
cfg = Config(cfg)
head = Mask2FormerHead(**cfg)
inputs = [
torch.rand((2, 96, 8, 8)),
torch.rand((2, 192, 4, 4)),
torch.rand((2, 384, 2, 2)),
torch.rand((2, 768, 1, 1))
]
data_samples: SampleList = []
for i in range(2):
data_sample = SegDataSample()
img_meta = {}
img_meta['img_shape'] = (32, 32)
img_meta['ori_shape'] = (32, 32)
data_sample.gt_sem_seg = PixelData(
data=torch.randint(0, num_classes, (1, 32, 32)))
data_sample.set_metainfo(img_meta)
data_samples.append(data_sample)
if torch.cuda.is_available():
head, inputs = to_cuda(head, inputs)
for data_sample in data_samples:
data_sample.gt_sem_seg.data = data_sample.gt_sem_seg.data.cuda()
loss_dict = head.loss(inputs, data_samples, None)
assert isinstance(loss_dict, dict)
batch_img_metas = []
for data_sample in data_samples:
batch_img_metas.append(data_sample.metainfo)
seg_logits = head.predict(inputs, batch_img_metas, None)
assert seg_logits.shape == torch.Size((2, num_classes, 32, 32))

View File

@ -0,0 +1,54 @@
# Copyright (c) OpenMMLab. All rights reserved.
from os.path import dirname, join
import torch
from mmengine import Config
from mmengine.structures import PixelData
from mmseg.registry import MODELS
from mmseg.structures import SegDataSample
from mmseg.utils import register_all_modules
def test_maskformer_head():
register_all_modules()
repo_dpath = dirname(dirname(__file__))
cfg = Config.fromfile(
join(
repo_dpath,
'../../configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py' # noqa
))
cfg.model.train_cfg = None
decode_head = MODELS.build(cfg.model.decode_head)
inputs = (torch.randn(1, 256, 32, 32), torch.randn(1, 512, 16, 16),
torch.randn(1, 1024, 8, 8), torch.randn(1, 2048, 4, 4))
# test inference
batch_img_metas = [
dict(
scale_factor=(1.0, 1.0),
img_shape=(512, 683),
ori_shape=(512, 683))
]
test_cfg = dict(mode='whole')
output = decode_head.predict(inputs, batch_img_metas, test_cfg)
assert output.shape == (1, 150, 512, 683)
# test training
inputs = (torch.randn(2, 256, 32, 32), torch.randn(2, 512, 16, 16),
torch.randn(2, 1024, 8, 8), torch.randn(2, 2048, 4, 4))
batch_data_samples = []
img_meta = {
'img_shape': (512, 512),
'ori_shape': (480, 640),
'pad_shape': (512, 512),
'scale_factor': (1.425, 1.425),
}
for _ in range(2):
data_sample = SegDataSample(
gt_sem_seg=PixelData(data=torch.ones(512, 512).long()))
data_sample.set_metainfo(img_meta)
batch_data_samples.append(data_sample)
train_cfg = {}
losses = decode_head.loss(inputs, batch_data_samples, train_cfg)
assert all(loss in losses.keys()
for loss in ('loss_cls', 'loss_mask', 'loss_dice'))

View File

@ -1,8 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmengine import ConfigDict
from mmengine.structures import PixelData
from mmseg.models import build_segmentor
from mmseg.structures import SegDataSample
from .utils import _segmentor_forward_train_test
@ -57,3 +59,42 @@ def test_encoder_decoder():
cfg.test_cfg = ConfigDict(mode='whole')
segmentor = build_segmentor(cfg)
_segmentor_forward_train_test(segmentor)
def test_postprocess_result():
cfg = ConfigDict(
type='EncoderDecoder',
backbone=dict(type='ExampleBackbone'),
decode_head=dict(type='ExampleDecodeHead'),
train_cfg=None,
test_cfg=dict(mode='whole'))
model = build_segmentor(cfg)
# test postprocess
data_sample = SegDataSample()
data_sample.gt_sem_seg = PixelData(
**{'data': torch.randint(0, 10, (1, 8, 8))})
data_sample.set_metainfo({
'padding_size': (0, 2, 0, 2),
'ori_shape': (8, 8)
})
seg_logits = torch.zeros((1, 2, 10, 10))
seg_logits[:, :, :8, :8] = 1
data_samples = [data_sample]
outputs = model.postprocess_result(seg_logits, data_samples)
assert outputs[0].seg_logits.data.shape == torch.Size((2, 8, 8))
assert torch.allclose(outputs[0].seg_logits.data, torch.ones((2, 8, 8)))
data_sample = SegDataSample()
data_sample.gt_sem_seg = PixelData(
**{'data': torch.randint(0, 10, (1, 8, 8))})
data_sample.set_metainfo({
'img_padding_size': (0, 2, 0, 2),
'ori_shape': (8, 8)
})
data_samples = [data_sample]
outputs = model.postprocess_result(seg_logits, data_samples)
assert outputs[0].seg_logits.data.shape == torch.Size((2, 8, 8))
assert torch.allclose(outputs[0].seg_logits.data, torch.ones((2, 8, 8)))