mirror of
https://github.com/open-mmlab/mmsegmentation.git
synced 2025-06-03 22:03:48 +08:00
[Feature] Support BiSeNetV1 (#851)
* First Commit * fix typos * fix typos * Fix assertion bug * Adding Assert * Adding Unittest * Fixing typo * Uploading models & logs * Fixing unittest error * changing README.md * changing README.md
This commit is contained in:
parent
2aa632ebe7
commit
ab12009414
@ -75,6 +75,7 @@ Supported methods:
|
|||||||
- [x] [PSPNet (CVPR'2017)](configs/pspnet)
|
- [x] [PSPNet (CVPR'2017)](configs/pspnet)
|
||||||
- [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3)
|
- [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3)
|
||||||
- [x] [Mixed Precision (FP16) Training (ArXiv'2017)](configs/fp16)
|
- [x] [Mixed Precision (FP16) Training (ArXiv'2017)](configs/fp16)
|
||||||
|
- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1)
|
||||||
- [x] [PSANet (ECCV'2018)](configs/psanet)
|
- [x] [PSANet (ECCV'2018)](configs/psanet)
|
||||||
- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus)
|
- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus)
|
||||||
- [x] [UPerNet (ECCV'2018)](configs/upernet)
|
- [x] [UPerNet (ECCV'2018)](configs/upernet)
|
||||||
|
@ -74,6 +74,7 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O
|
|||||||
- [x] [PSPNet (CVPR'2017)](configs/pspnet)
|
- [x] [PSPNet (CVPR'2017)](configs/pspnet)
|
||||||
- [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3)
|
- [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3)
|
||||||
- [x] [Mixed Precision (FP16) Training (ArXiv'2017)](configs/fp16)
|
- [x] [Mixed Precision (FP16) Training (ArXiv'2017)](configs/fp16)
|
||||||
|
- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1)
|
||||||
- [x] [PSANet (ECCV'2018)](configs/psanet)
|
- [x] [PSANet (ECCV'2018)](configs/psanet)
|
||||||
- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus)
|
- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus)
|
||||||
- [x] [UPerNet (ECCV'2018)](configs/upernet)
|
- [x] [UPerNet (ECCV'2018)](configs/upernet)
|
||||||
|
68
configs/_base_/models/bisenetv1_r18-d32.py
Normal file
68
configs/_base_/models/bisenetv1_r18-d32.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
# Model settings: BiSeNetV1 whose Context Path is a ResNet-18.
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='BiSeNetV1',
        in_channels=3,
        context_channels=(128, 256, 512),
        spatial_channels=(64, 64, 64, 128),
        out_indices=(0, 1, 2),
        out_channels=256,
        backbone_cfg=dict(
            type='ResNet',
            in_channels=3,
            depth=18,
            num_stages=4,
            out_indices=(0, 1, 2, 3),
            dilations=(1, 1, 1, 1),
            strides=(1, 2, 2, 2),
            norm_cfg=norm_cfg,
            norm_eval=False,
            style='pytorch',
            contract_dilation=True),
        norm_cfg=norm_cfg,
        align_corners=False,
        init_cfg=None),
    # The decode head reads backbone output 0 (`out_channels` wide).
    decode_head=dict(
        type='FCNHead',
        in_channels=256,
        in_index=0,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    # Two auxiliary heads that differ only in the backbone output they read
    # (outputs 1 and 2); each iteration builds a fresh, independent dict.
    auxiliary_head=[
        dict(
            type='FCNHead',
            in_channels=128,
            channels=64,
            num_convs=1,
            num_classes=19,
            in_index=aux_index,
            norm_cfg=norm_cfg,
            concat_input=False,
            align_corners=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
        for aux_index in (1, 2)
    ],
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
|
42
configs/bisenetv1/README.md
Normal file
42
configs/bisenetv1/README.md
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
# BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
<!-- [ALGORITHM] -->
|
||||||
|
|
||||||
|
<a href="https://github.com/ycszen/TorchSeg/tree/master/model/bisenet">Official Repo</a>
|
||||||
|
|
||||||
|
<a href="https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266">Code Snippet</a>
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary align="right"><a href="https://arxiv.org/abs/1808.00897">BiSeNetV1 (ECCV'2018)</a></summary>
|
||||||
|
|
||||||
|
```latex
|
||||||
|
@inproceedings{yu2018bisenet,
|
||||||
|
title={Bisenet: Bilateral segmentation network for real-time semantic segmentation},
|
||||||
|
author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong},
|
||||||
|
booktitle={Proceedings of the European conference on computer vision (ECCV)},
|
||||||
|
pages={325--341},
|
||||||
|
year={2018}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
## Results and models
|
||||||
|
|
||||||
|
### Cityscapes
|
||||||
|
|
||||||
|
| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
|
||||||
|
| --------- | --------- | --------- | ------: | -------- | -------------- | ----: | ------------- | --------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| BiSeNetV1 (No Pretrain) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) |
|
||||||
|
| BiSeNetV1 | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) |
|
||||||
|
| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) |
|
||||||
|
| BiSeNetV1 (No Pretrain) | R-50-D32 | 1024x1024 | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) |
|
||||||
|
| BiSeNetV1 | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) |
|
||||||
|
|
||||||
|
Note:
|
||||||
|
|
||||||
|
- `4x8`: Using 4 GPUs with 8 samples per GPU in training.
|
||||||
|
- Default setting is 4 GPUs with 4 samples per GPU in training.
|
||||||
|
- `No Pretrain` means the model is trained from scratch.
|
125
configs/bisenetv1/bisenetv1.yml
Normal file
125
configs/bisenetv1/bisenetv1.yml
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
Collections:
|
||||||
|
- Name: bisenetv1
|
||||||
|
Metadata:
|
||||||
|
Training Data:
|
||||||
|
- Cityscapes
|
||||||
|
Paper:
|
||||||
|
URL: https://arxiv.org/abs/1808.00897
|
||||||
|
Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation'
|
||||||
|
README: configs/bisenetv1/README.md
|
||||||
|
Code:
|
||||||
|
URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266
|
||||||
|
Version: v0.18.0
|
||||||
|
Converted From:
|
||||||
|
Code: https://github.com/ycszen/TorchSeg/tree/master/model/bisenet
|
||||||
|
Models:
|
||||||
|
- Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes
|
||||||
|
In Collection: bisenetv1
|
||||||
|
Metadata:
|
||||||
|
backbone: R-18-D32
|
||||||
|
crop size: (1024,1024)
|
||||||
|
lr schd: 160000
|
||||||
|
inference time (ms/im):
|
||||||
|
- value: 31.48
|
||||||
|
hardware: V100
|
||||||
|
backend: PyTorch
|
||||||
|
batch size: 1
|
||||||
|
mode: FP32
|
||||||
|
resolution: (1024,1024)
|
||||||
|
memory (GB): 5.69
|
||||||
|
Results:
|
||||||
|
- Task: Semantic Segmentation
|
||||||
|
Dataset: Cityscapes
|
||||||
|
Metrics:
|
||||||
|
mIoU: 74.44
|
||||||
|
mIoU(ms+flip): 77.05
|
||||||
|
Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py
|
||||||
|
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth
|
||||||
|
- Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
|
||||||
|
In Collection: bisenetv1
|
||||||
|
Metadata:
|
||||||
|
backbone: R-18-D32
|
||||||
|
crop size: (1024,1024)
|
||||||
|
lr schd: 160000
|
||||||
|
inference time (ms/im):
|
||||||
|
- value: 31.48
|
||||||
|
hardware: V100
|
||||||
|
backend: PyTorch
|
||||||
|
batch size: 1
|
||||||
|
mode: FP32
|
||||||
|
resolution: (1024,1024)
|
||||||
|
memory (GB): 5.69
|
||||||
|
Results:
|
||||||
|
- Task: Semantic Segmentation
|
||||||
|
Dataset: Cityscapes
|
||||||
|
Metrics:
|
||||||
|
mIoU: 74.37
|
||||||
|
mIoU(ms+flip): 76.91
|
||||||
|
Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
|
||||||
|
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth
|
||||||
|
- Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes
|
||||||
|
In Collection: bisenetv1
|
||||||
|
Metadata:
|
||||||
|
backbone: R-18-D32
|
||||||
|
crop size: (1024,1024)
|
||||||
|
lr schd: 160000
|
||||||
|
inference time (ms/im):
|
||||||
|
- value: 31.48
|
||||||
|
hardware: V100
|
||||||
|
backend: PyTorch
|
||||||
|
batch size: 1
|
||||||
|
mode: FP32
|
||||||
|
resolution: (1024,1024)
|
||||||
|
memory (GB): 11.17
|
||||||
|
Results:
|
||||||
|
- Task: Semantic Segmentation
|
||||||
|
Dataset: Cityscapes
|
||||||
|
Metrics:
|
||||||
|
mIoU: 75.16
|
||||||
|
mIoU(ms+flip): 77.24
|
||||||
|
Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
|
||||||
|
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth
|
||||||
|
- Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes
|
||||||
|
In Collection: bisenetv1
|
||||||
|
Metadata:
|
||||||
|
backbone: R-50-D32
|
||||||
|
crop size: (1024,1024)
|
||||||
|
lr schd: 160000
|
||||||
|
inference time (ms/im):
|
||||||
|
- value: 129.7
|
||||||
|
hardware: V100
|
||||||
|
backend: PyTorch
|
||||||
|
batch size: 1
|
||||||
|
mode: FP32
|
||||||
|
resolution: (1024,1024)
|
||||||
|
memory (GB): 3.3
|
||||||
|
Results:
|
||||||
|
- Task: Semantic Segmentation
|
||||||
|
Dataset: Cityscapes
|
||||||
|
Metrics:
|
||||||
|
mIoU: 76.92
|
||||||
|
mIoU(ms+flip): 78.87
|
||||||
|
Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
|
||||||
|
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth
|
||||||
|
- Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
|
||||||
|
In Collection: bisenetv1
|
||||||
|
Metadata:
|
||||||
|
backbone: R-50-D32
|
||||||
|
crop size: (1024,1024)
|
||||||
|
lr schd: 160000
|
||||||
|
inference time (ms/im):
|
||||||
|
- value: 129.7
|
||||||
|
hardware: V100
|
||||||
|
backend: PyTorch
|
||||||
|
batch size: 1
|
||||||
|
mode: FP32
|
||||||
|
resolution: (1024,1024)
|
||||||
|
memory (GB): 15.39
|
||||||
|
Results:
|
||||||
|
- Task: Semantic Segmentation
|
||||||
|
Dataset: Cityscapes
|
||||||
|
Metrics:
|
||||||
|
mIoU: 77.68
|
||||||
|
mIoU(ms+flip): 79.57
|
||||||
|
Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
|
||||||
|
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth
|
@ -0,0 +1,11 @@
|
|||||||
|
# Base files merged into this config: model, dataset, runtime, schedule.
_base_ = [
    '../_base_/models/bisenetv1_r18-d32.py',
    '../_base_/datasets/cityscapes_1024x1024.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_160k.py',
]
# Overrides applied on top of the base files.
lr_config = dict(warmup='linear', warmup_iters=1000)
optimizer = dict(lr=0.025)
# 4 samples and 4 dataloader workers per GPU.
data = dict(samples_per_gpu=4, workers_per_gpu=4)
|
@ -0,0 +1,16 @@
|
|||||||
|
# Base files merged into this config: model, dataset, runtime, schedule.
_base_ = [
    '../_base_/models/bisenetv1_r18-d32.py',
    '../_base_/datasets/cityscapes_1024x1024.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_160k.py',
]
# Only the Context Path backbone gets an initialisation checkpoint; every
# other model key comes from the base model file.
model = dict(
    backbone=dict(
        backbone_cfg=dict(
            init_cfg=dict(
                type='Pretrained',
                checkpoint='open-mmlab://resnet18_v1c'))))
lr_config = dict(warmup='linear', warmup_iters=1000)
optimizer = dict(lr=0.025)
# 4 samples and 4 dataloader workers per GPU.
data = dict(samples_per_gpu=4, workers_per_gpu=4)
|
@ -0,0 +1,5 @@
|
|||||||
|
# Same as the pretrained R-18 4x4 config, but with 8 samples (and 8
# dataloader workers) per GPU.
_base_ = './bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py'
data = dict(samples_per_gpu=8, workers_per_gpu=8)
|
@ -0,0 +1,46 @@
|
|||||||
|
# BiSeNetV1 with a ResNet-50 Context Path, trained from scratch.
_base_ = [
    '../_base_/models/bisenetv1_r18-d32.py',
    '../_base_/datasets/cityscapes_1024x1024.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
]
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='BiSeNetV1',
        # Channel widths are widened relative to the R-18 base model.
        context_channels=(512, 1024, 2048),
        spatial_channels=(256, 256, 256, 512),
        out_channels=1024,
        # No `init_cfg` here: this is the "No Pretrain" variant. The
        # `in1k-pre` config that inherits from this file is the one that
        # adds the pretrained ResNet-50 checkpoint.
        backbone_cfg=dict(type='ResNet', depth=50)),
    decode_head=dict(
        type='FCNHead', in_channels=1024, in_index=0, channels=1024),
    # A list value replaces the base list as a whole, so both auxiliary
    # heads are spelled out again with the wider channel counts.
    auxiliary_head=[
        dict(
            type='FCNHead',
            in_channels=512,
            channels=256,
            num_convs=1,
            num_classes=19,
            in_index=1,
            norm_cfg=norm_cfg,
            concat_input=False),
        dict(
            type='FCNHead',
            in_channels=512,
            channels=256,
            num_convs=1,
            num_classes=19,
            in_index=2,
            norm_cfg=norm_cfg,
            concat_input=False),
    ])
lr_config = dict(warmup='linear', warmup_iters=1000)
optimizer = dict(lr=0.05)
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
)
|
@ -0,0 +1,7 @@
|
|||||||
|
# R-50 variant whose Context Path backbone is initialised from a checkpoint;
# everything else is inherited from the R-50 4x4 config.
_base_ = './bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py'
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        backbone_cfg=dict(
            init_cfg=dict(
                type='Pretrained',
                checkpoint='open-mmlab://resnet50_v1c'))))
|
@ -1,4 +1,5 @@
|
|||||||
# Copyright (c) OpenMMLab. All rights reserved.
|
# Copyright (c) OpenMMLab. All rights reserved.
|
||||||
|
from .bisenetv1 import BiSeNetV1
|
||||||
from .bisenetv2 import BiSeNetV2
|
from .bisenetv2 import BiSeNetV2
|
||||||
from .cgnet import CGNet
|
from .cgnet import CGNet
|
||||||
from .fast_scnn import FastSCNN
|
from .fast_scnn import FastSCNN
|
||||||
@ -16,5 +17,6 @@ from .vit import VisionTransformer
|
|||||||
__all__ = [
|
__all__ = [
|
||||||
'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN',
|
'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN',
|
||||||
'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3',
|
'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3',
|
||||||
'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer', 'BiSeNetV2'
|
'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer',
|
||||||
|
'BiSeNetV1', 'BiSeNetV2'
|
||||||
]
|
]
|
||||||
|
332
mmseg/models/backbones/bisenetv1.py
Normal file
332
mmseg/models/backbones/bisenetv1.py
Normal file
@ -0,0 +1,332 @@
|
|||||||
|
# Copyright (c) OpenMMLab. All rights reserved.
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from mmcv.cnn import ConvModule
|
||||||
|
from mmcv.runner import BaseModule
|
||||||
|
|
||||||
|
from mmseg.ops import resize
|
||||||
|
from ..builder import BACKBONES, build_backbone
|
||||||
|
|
||||||
|
|
||||||
|
class SpatialPath(BaseModule):
    """Spatial Path to preserve the spatial size of the original input image
    and encode rich spatial information.

    The path is a chain of four ``ConvModule`` layers registered as
    ``layer1`` .. ``layer4``: a 7x7 stride-2 conv, two 3x3 stride-2 convs
    and a final 1x1 stride-1 conv.

    Args:
        in_channels (int): The number of channels of input
            image. Default: 3.
        num_channels (Tuple[int]): The number of channels of
            each layers in Spatial Path.
            Default: (64, 64, 64, 128).
        conv_cfg (dict | None): Conv config passed to every layer.
            Default: None.
        norm_cfg (dict): Norm config passed to every layer.
            Default: dict(type='BN').
        act_cfg (dict): Activation config passed to every layer.
            Default: dict(type='ReLU').
        init_cfg (dict | None): Initialization config passed to
            ``BaseModule``. Default: None.
    Returns:
        x (torch.Tensor): Feature map for Feature Fusion Module.
    """

    def __init__(self,
                 in_channels=3,
                 num_channels=(64, 64, 64, 128),
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):
        super(SpatialPath, self).__init__(init_cfg=init_cfg)
        # Adjacent literals are concatenated; a backslash continuation
        # inside the string would leak source indentation into the message.
        assert len(num_channels) == 4, ('Length of input channels '
                                        'of Spatial Path must be 4!')

        self.layers = []
        last_index = len(num_channels) - 1
        prev_channels = in_channels
        for i, out_channels in enumerate(num_channels):
            if i == 0:
                kernel_size, stride, padding = 7, 2, 3
            elif i == last_index:
                kernel_size, stride, padding = 1, 1, 0
            else:
                kernel_size, stride, padding = 3, 2, 1

            layer_name = f'layer{i + 1}'
            self.layers.append(layer_name)
            self.add_module(
                layer_name,
                ConvModule(
                    in_channels=prev_channels,
                    out_channels=out_channels,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            # Each layer consumes the previous layer's output width.
            prev_channels = out_channels

    def forward(self, x):
        """Run ``x`` through the four layers in registration order."""
        for layer_name in self.layers:
            x = getattr(self, layer_name)(x)
        return x
|
||||||
|
|
||||||
|
|
||||||
|
class AttentionRefinementModule(BaseModule):
    """Attention Refinement Module (ARM) to refine the features of each stage.

    A 3x3 conv produces the feature map; a global-average-pool, 1x1 conv
    (no activation) and sigmoid branch produces per-channel weights that
    rescale that feature map.

    Args:
        in_channels (int): The number of input channels.
        out_channel (int): The number of output channels.
        conv_cfg (dict | None): Conv config of both ConvModules.
            Default: None.
        norm_cfg (dict): Norm config of both ConvModules.
            Default: dict(type='BN').
        act_cfg (dict): Activation config of the 3x3 ConvModule.
            Default: dict(type='ReLU').
        init_cfg (dict | None): Initialization config passed to
            ``BaseModule``. Default: None.
    Returns:
        x_out (torch.Tensor): Feature map of Attention Refinement Module.
    """

    def __init__(self,
                 in_channels,
                 out_channel,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):
        super(AttentionRefinementModule, self).__init__(init_cfg=init_cfg)
        shared_cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg)
        self.conv_layer = ConvModule(
            in_channels=in_channels,
            out_channels=out_channel,
            kernel_size=3,
            stride=1,
            padding=1,
            act_cfg=act_cfg,
            **shared_cfg)
        # The sigmoid supplies the non-linearity, so this conv has none.
        channel_conv = ConvModule(
            in_channels=out_channel,
            out_channels=out_channel,
            kernel_size=1,
            bias=False,
            act_cfg=None,
            **shared_cfg)
        self.atten_conv_layer = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)), channel_conv, nn.Sigmoid())

    def forward(self, x):
        """Return the conv feature scaled by its channel attention."""
        feat = self.conv_layer(x)
        weights = self.atten_conv_layer(feat)
        return feat * weights
|
||||||
|
|
||||||
|
|
||||||
|
class ContextPath(BaseModule):
    """Context Path to provide sufficient receptive field.

    Args:
        backbone_cfg (dict): Config of backbone of
            Context Path. The backbone must return four feature maps.
        context_channels (Tuple[int]): The number of channel numbers
            of various modules in Context Path.
            Default: (128, 256, 512).
        align_corners (bool, optional): The align_corners argument of
            resize operation. It is stored on the module; the resizes in
            ``forward`` use ``mode='nearest'`` and do not pass it.
            Default: False.
        conv_cfg (dict | None): Conv config of the ARMs, the conv heads
            and the global-pooling conv. Default: None.
        norm_cfg (dict): Norm config of the same modules.
            Default: dict(type='BN').
        act_cfg (dict): Activation config of the same modules.
            Default: dict(type='ReLU').
        init_cfg (dict | None): Initialization config passed to
            ``BaseModule``. Default: None.
    Returns:
        x_16_up, x_32_up (torch.Tensor, torch.Tensor): Two feature maps
            undergoing upsampling from 1/16 and 1/32 downsampling
            feature maps. These two feature maps are used for Feature
            Fusion Module and Auxiliary Head.
    """

    def __init__(self,
                 backbone_cfg,
                 context_channels=(128, 256, 512),
                 align_corners=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):
        super(ContextPath, self).__init__(init_cfg=init_cfg)
        # Adjacent literals are concatenated; a backslash continuation
        # inside the string would leak source indentation into the message.
        assert len(context_channels) == 3, ('Length of input channels '
                                            'of Context Path must be 3!')

        self.backbone = build_backbone(backbone_cfg)

        self.align_corners = align_corners
        # Forward the layer configs so the ARMs use the same conv/norm/act
        # settings as the conv heads below instead of their own defaults.
        self.arm16 = AttentionRefinementModule(
            context_channels[1],
            context_channels[0],
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.arm32 = AttentionRefinementModule(
            context_channels[2],
            context_channels[0],
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv_head32 = ConvModule(
            in_channels=context_channels[0],
            out_channels=context_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv_head16 = ConvModule(
            in_channels=context_channels[0],
            out_channels=context_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.gap_conv = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            ConvModule(
                in_channels=context_channels[2],
                out_channels=context_channels[0],
                kernel_size=1,
                stride=1,
                padding=0,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg))

    def forward(self, x):
        """Refine the two deepest backbone features and upsample them."""
        x_4, x_8, x_16, x_32 = self.backbone(x)
        # Global context vector, broadcast-added to the refined 1/32 map.
        x_gap = self.gap_conv(x_32)

        x_32_arm = self.arm32(x_32)
        x_32_sum = x_32_arm + x_gap
        # Bring the 1/32 branch to the spatial size of x_16.
        x_32_up = resize(input=x_32_sum, size=x_16.shape[2:], mode='nearest')
        x_32_up = self.conv_head32(x_32_up)

        x_16_arm = self.arm16(x_16)
        x_16_sum = x_16_arm + x_32_up
        # Bring the merged branch to the spatial size of x_8.
        x_16_up = resize(input=x_16_sum, size=x_8.shape[2:], mode='nearest')
        x_16_up = self.conv_head16(x_16_up)

        return x_16_up, x_32_up
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureFusionModule(BaseModule):
    """Feature Fusion Module to fuse low level output feature of Spatial Path
    and high level output feature of Context Path.

    The two inputs are concatenated along the channel axis and projected by
    a 1x1 conv; a pooled 1x1-conv + sigmoid branch then reweights the
    projected feature, which is added back to itself as a residual.

    Args:
        in_channels (int): The number of input channels, i.e. the channel
            count of the concatenated inputs.
        out_channels (int): The number of output channels.
        conv_cfg (dict | None): Conv config of both ConvModules.
            Default: None.
        norm_cfg (dict): Norm config of both ConvModules.
            Default: dict(type='BN').
        act_cfg (dict): Activation config of both ConvModules.
            Default: dict(type='ReLU').
        init_cfg (dict | None): Initialization config passed to
            ``BaseModule``. Default: None.
    Returns:
        x_out (torch.Tensor): Feature map of Feature Fusion Module.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):
        super(FeatureFusionModule, self).__init__(init_cfg=init_cfg)
        # Both ConvModules are 1x1, stride 1, no padding, same layer configs.
        pointwise = dict(
            kernel_size=1,
            stride=1,
            padding=0,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv1 = ConvModule(
            in_channels=in_channels, out_channels=out_channels, **pointwise)
        self.gap = nn.AdaptiveAvgPool2d((1, 1))
        atten_conv = ConvModule(
            in_channels=out_channels,
            out_channels=out_channels,
            bias=False,
            **pointwise)
        self.conv_atten = nn.Sequential(atten_conv, nn.Sigmoid())

    def forward(self, x_sp, x_cp):
        """Fuse the Spatial Path and Context Path feature maps."""
        fused = self.conv1(torch.cat([x_sp, x_cp], dim=1))
        # Note: No BN and more 1x1 conv in paper.
        weights = self.conv_atten(self.gap(fused))
        reweighted = fused * weights
        return reweighted + fused
|
||||||
|
|
||||||
|
|
||||||
|
@BACKBONES.register_module()
class BiSeNetV1(BaseModule):
    """BiSeNetV1 backbone.

    This backbone is the implementation of `BiSeNet: Bilateral
    Segmentation Network for Real-time Semantic
    Segmentation <https://arxiv.org/abs/1808.00897>`_.

    Args:
        backbone_cfg (dict): Config of backbone of
            Context Path.
        in_channels (int): The number of channels of input
            image. Default: 3.
        spatial_channels (Tuple[int]): Size of channel numbers of
            various layers in Spatial Path.
            Default: (64, 64, 64, 128).
        context_channels (Tuple[int]): Size of channel numbers of
            various modules in Context Path.
            Default: (128, 256, 512).
        out_indices (Tuple[int] | int, optional): Output from which stages.
            Index 0 is the fused feature, indices 1 and 2 are the two
            Context Path features. Default: (0, 1, 2).
        align_corners (bool, optional): The align_corners argument
            forwarded to the Context Path. Default: False.
        out_channels (int): The number of channels of output.
            It must be the same with `in_channels` of decode_head.
            Default: 256.
        conv_cfg (dict | None): Conv config forwarded to the Spatial Path,
            Context Path and Feature Fusion Module. Default: None.
        norm_cfg (dict): Norm config forwarded to the same modules.
            Default: dict(type='BN', requires_grad=True).
        act_cfg (dict): Activation config forwarded to the same modules.
            Default: dict(type='ReLU').
        init_cfg (dict | None): Initialization config passed to
            ``BaseModule``. Default: None.
    """

    def __init__(self,
                 backbone_cfg,
                 in_channels=3,
                 spatial_channels=(64, 64, 64, 128),
                 context_channels=(128, 256, 512),
                 out_indices=(0, 1, 2),
                 align_corners=False,
                 out_channels=256,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', requires_grad=True),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):

        super(BiSeNetV1, self).__init__(init_cfg=init_cfg)
        # Adjacent literals are concatenated; a backslash continuation
        # inside the string would leak source indentation into the message.
        assert len(spatial_channels) == 4, ('Length of input channels '
                                            'of Spatial Path must be 4!')

        assert len(context_channels) == 3, ('Length of input channels '
                                            'of Context Path must be 3!')

        # The docstring allows a bare int; normalise it so that `forward`
        # can always iterate over the indices.
        if isinstance(out_indices, int):
            out_indices = (out_indices, )
        self.out_indices = out_indices
        self.align_corners = align_corners
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg

        # Forward the layer configs to every submodule; otherwise a
        # configured norm layer (e.g. SyncBN) would only reach the Context
        # Path backbone and the rest would fall back to plain BN.
        layer_cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.context_path = ContextPath(backbone_cfg, context_channels,
                                        self.align_corners, **layer_cfg)
        self.spatial_path = SpatialPath(in_channels, spatial_channels,
                                        **layer_cfg)
        # The FFM concatenates the Spatial Path output (spatial_channels[-1]
        # channels) with the 1/8 Context Path output (context_channels[0]
        # channels). For the shipped settings this sum equals
        # context_channels[1], the value previously hard-coded here.
        ffm_in_channels = spatial_channels[-1] + context_channels[0]
        self.ffm = FeatureFusionModule(ffm_in_channels, out_channels,
                                       **layer_cfg)

    def forward(self, x):
        """Return the selected subset of (fused, context 1/8, context 1/16).

        The names follow the Context Path return order: its first output is
        resized to the size of the backbone's second feature map, its second
        output to the size of the third.
        """
        # stole refactoring code from Coin Cheung, thanks
        x_context8, x_context16 = self.context_path(x)
        x_spatial = self.spatial_path(x)
        x_fuse = self.ffm(x_spatial, x_context8)

        outs = [x_fuse, x_context8, x_context16]
        return tuple(outs[i] for i in self.out_indices)
|
@ -1,6 +1,7 @@
|
|||||||
Import:
|
Import:
|
||||||
- configs/ann/ann.yml
|
- configs/ann/ann.yml
|
||||||
- configs/apcnet/apcnet.yml
|
- configs/apcnet/apcnet.yml
|
||||||
|
- configs/bisenetv1/bisenetv1.yml
|
||||||
- configs/bisenetv2/bisenetv2.yml
|
- configs/bisenetv2/bisenetv2.yml
|
||||||
- configs/ccnet/ccnet.yml
|
- configs/ccnet/ccnet.yml
|
||||||
- configs/cgnet/cgnet.yml
|
- configs/cgnet/cgnet.yml
|
||||||
|
109
tests/test_models/test_backbones/test_bisenetv1.py
Normal file
109
tests/test_models/test_backbones/test_bisenetv1.py
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
# Copyright (c) OpenMMLab. All rights reserved.
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from mmseg.models.backbones import BiSeNetV1
|
||||||
|
from mmseg.models.backbones.bisenetv1 import (AttentionRefinementModule,
|
||||||
|
ContextPath, FeatureFusionModule,
|
||||||
|
SpatialPath)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bisenetv1_backbone():
    """Check BiSeNetV1 output shapes and its channel-length assertions."""
    # ResNet-18 Context Path backbone that exposes all four stages.
    resnet18_cfg = dict(
        type='ResNet',
        in_channels=3,
        depth=18,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 1, 1),
        strides=(1, 2, 2, 2),
        norm_eval=False,
        style='pytorch',
        contract_dilation=True)

    # Test BiSeNetV1 Standard Forward
    net = BiSeNetV1(in_channels=3, backbone_cfg=resnet18_cfg)
    net.init_weights()
    net.train()
    num_imgs = 2
    outputs = net(torch.randn(num_imgs, 3, 256, 512))

    assert len(outputs) == 3
    # In order: segment head input, auxiliary head 1, auxiliary head 2.
    expected_shapes = (
        [num_imgs, 256, 32, 64],
        [num_imgs, 128, 32, 64],
        [num_imgs, 128, 16, 32],
    )
    for out, shape in zip(outputs, expected_shapes):
        assert out.shape == torch.Size(shape)

    # Test input with rare shape
    outputs = net(torch.randn(num_imgs, 3, 527, 279))
    assert len(outputs) == 3

    # BiSeNetV1 spatial path channel constraints.
    with pytest.raises(AssertionError):
        BiSeNetV1(
            backbone_cfg=resnet18_cfg,
            in_channels=3,
            spatial_channels=(64, 64, 64))

    # BiSeNetV1 context path constraints.
    with pytest.raises(AssertionError):
        BiSeNetV1(
            backbone_cfg=resnet18_cfg,
            in_channels=3,
            context_channels=(128, 256, 512, 1024))
|
||||||
|
|
||||||
|
|
||||||
|
def test_bisenetv1_spatial_path():
    """SpatialPath must reject a channel tuple whose length is not 4."""
    too_few_channels = (64, 64, 64)
    # BiSeNetV1 spatial path channel constraints.
    with pytest.raises(AssertionError):
        SpatialPath(in_channels=3, num_channels=too_few_channels)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bisenetv1_context_path():
    """ContextPath must reject a channel tuple whose length is not 3."""
    resnet50_cfg = dict(
        type='ResNet',
        in_channels=3,
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 1, 1),
        strides=(1, 2, 2, 2),
        norm_eval=False,
        style='pytorch',
        contract_dilation=True)
    too_many_channels = (128, 256, 512, 1024)

    # BiSeNetV1 context path constraints.
    with pytest.raises(AssertionError):
        ContextPath(
            backbone_cfg=resnet50_cfg, context_channels=too_many_channels)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bisenetv1_attention_refinement_module():
    """Check the ARM's 3x3 conv settings and its output shape."""
    arm = AttentionRefinementModule(256, 64)
    conv = arm.conv_layer
    assert conv.in_channels == 256
    assert conv.out_channels == 64
    assert conv.kernel_size == (3, 3)

    # Only the channel count changes; batch and spatial size are kept.
    refined = arm(torch.randn(2, 256, 32, 64))
    assert refined.shape == torch.Size([2, 64, 32, 64])
|
||||||
|
|
||||||
|
|
||||||
|
def test_bisenetv1_feature_fusion_module():
    """Check the FFM's layer settings and its output shape."""
    fusion = FeatureFusionModule(128, 256)
    assert fusion.conv1.in_channels == 128
    assert fusion.conv1.out_channels == 256
    assert fusion.conv1.kernel_size == (1, 1)
    assert fusion.gap.output_size == (1, 1)
    atten_conv = fusion.conv_atten[0]
    assert atten_conv.in_channels == 256
    assert atten_conv.out_channels == 256
    assert atten_conv.kernel_size == (1, 1)

    # Two 64-channel inputs are concatenated into the 128 input channels.
    fusion = FeatureFusionModule(128, 128)
    spatial_feat = torch.randn(2, 64, 64, 128)
    context_feat = torch.randn(2, 64, 64, 128)
    fused = fusion(spatial_feat, context_feat)
    assert fused.shape == torch.Size([2, 128, 64, 128])
|
Loading…
x
Reference in New Issue
Block a user