diff --git a/README.md b/README.md
index affcab7fa..2c5b5b94c 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,7 @@ Supported methods:
 - [x] [PointRend](configs/point_rend)
 - [x] [EMANet](configs/emanet)
 - [x] [DNLNet](configs/dnlnet)
+- [x] [CGNet](configs/cgnet)
 - [x] [Mixed Precision (FP16) Training](configs/fp16/README.md)
 
 ## Installation
diff --git a/configs/_base_/models/cgnet.py b/configs/_base_/models/cgnet.py
new file mode 100644
index 000000000..e598abca2
--- /dev/null
+++ b/configs/_base_/models/cgnet.py
@@ -0,0 +1,35 @@
+# model settings
+norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True)
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='CGNet',
+        norm_cfg=norm_cfg,
+        in_channels=3,
+        num_channels=(32, 64, 128),
+        num_blocks=(3, 21),
+        dilations=(2, 4),
+        reductions=(8, 16)),
+    decode_head=dict(
+        type='FCNHead',
+        in_channels=256,
+        in_index=2,
+        channels=256,
+        num_convs=0,
+        concat_input=False,
+        dropout_ratio=0,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        loss_decode=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            loss_weight=1.0,
+            class_weight=[
+                2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352,
+                10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905,
+                10.347791, 6.3927646, 10.226669, 10.241062, 10.280587,
+                10.396974, 10.055647
+            ])))
+# model training and testing settings
+train_cfg = dict(sampler=None)
+test_cfg = dict(mode='whole')
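As a quick sanity check of the new base config, the snippet below is a minimal sketch of building the segmentor from it. It assumes an mmseg v0.x checkout with this patch applied; note that `SyncBN` needs a distributed launch at run time, so purely local experiments may want to swap it for plain `BN`.

```python
# Minimal sketch: build the segmentor from the new base config.
from mmcv import Config

from mmseg.models import build_segmentor

cfg = Config.fromfile('configs/_base_/models/cgnet.py')
# train_cfg / test_cfg are top-level keys in this config, as shown above.
model = build_segmentor(
    cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
print(model.__class__.__name__)  # EncoderDecoder
```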
diff --git a/configs/cgnet/README.md b/configs/cgnet/README.md
new file mode 100644
index 000000000..f260a47d0
--- /dev/null
+++ b/configs/cgnet/README.md
@@ -0,0 +1,21 @@
+# CGNet: A Light-weight Context Guided Network for Semantic Segmentation
+
+## Introduction
+
+```latex
+@article{wu2018cgnet,
+  title={CGNet: A Light-weight Context Guided Network for Semantic Segmentation},
+  author={Wu, Tianyi and Tang, Sheng and Zhang, Rui and Zhang, Yongdong},
+  journal={arXiv preprint arXiv:1811.08201},
+  year={2018}
+}
+```
+
+## Results and models
+
+### Cityscapes
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | download |
+|--------|----------|-----------|--------:|----------|----------------|------:|--------------:|----------|
+| CGNet | M3N21 | 680x680 | 60000 | 7.5 | 30.51 | 65.63 | 68.04 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_680x680_60k_cityscapes/cgnet_680x680_60k_cityscapes_20201101_110253-4c0b2f2d.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_680x680_60k_cityscapes/cgnet_680x680_60k_cityscapes-20201101_110253.log.json) |
+| CGNet | M3N21 | 512x1024 | 60000 | 8.3 | 31.14 | 68.27 | 70.33 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_512x1024_60k_cityscapes/cgnet_512x1024_60k_cityscapes_20201101_110254-124ea03b.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_512x1024_60k_cityscapes/cgnet_512x1024_60k_cityscapes-20201101_110254.log.json) |
diff --git a/configs/cgnet/cgnet_512x1024_60k_cityscapes.py b/configs/cgnet/cgnet_512x1024_60k_cityscapes.py
new file mode 100644
index 000000000..11421ef9d
--- /dev/null
+++ b/configs/cgnet/cgnet_512x1024_60k_cityscapes.py
@@ -0,0 +1,66 @@
+_base_ = ['../_base_/models/cgnet.py', '../_base_/default_runtime.py']
+
+# optimizer
+optimizer = dict(type='Adam', lr=0.001, eps=1e-08, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+total_iters = 60000
+checkpoint_config = dict(by_epoch=False, interval=4000)
+evaluation = dict(interval=4000, metric='mIoU')
+
+# dataset settings
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+img_norm_cfg = dict(
+    mean=[72.39239876, 82.90891754, 73.15835921], std=[1, 1, 1], to_rgb=True)
+crop_size = (512, 1024)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations'),
+    dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2048, 1024),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='leftImg8bit/train',
+        ann_dir='gtFine/train',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='leftImg8bit/val',
+        ann_dir='gtFine/val',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='leftImg8bit/val',
+        ann_dir='gtFine/val',
+        pipeline=test_pipeline))
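The `img_ratios` line is left commented out, so the config tests single-scale by default; the README's "mIoU(ms+flip)" numbers come from enabling multi-scale and flipping at test time. A hypothetical override (not part of this diff, reusing `img_norm_cfg` from the config above) would look like:

```python
# Hypothetical test-time override: enable the multi-scale + flip protocol
# behind the README's "mIoU(ms+flip)" column.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 1024),
        img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=True,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
```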
diff --git a/configs/cgnet/cgnet_680x680_60k_cityscapes.py b/configs/cgnet/cgnet_680x680_60k_cityscapes.py
new file mode 100644
index 000000000..2b2f8eefb
--- /dev/null
+++ b/configs/cgnet/cgnet_680x680_60k_cityscapes.py
@@ -0,0 +1,50 @@
+_base_ = [
+    '../_base_/models/cgnet.py', '../_base_/datasets/cityscapes.py',
+    '../_base_/default_runtime.py'
+]
+
+# optimizer
+optimizer = dict(type='Adam', lr=0.001, eps=1e-08, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+total_iters = 60000
+checkpoint_config = dict(by_epoch=False, interval=4000)
+evaluation = dict(interval=4000, metric='mIoU')
+
+img_norm_cfg = dict(
+    mean=[72.39239876, 82.90891754, 73.15835921], std=[1, 1, 1], to_rgb=True)
+crop_size = (680, 680)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations'),
+    dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
+    dict(type='RandomCrop', crop_size=crop_size),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2048, 1024),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))
diff --git a/docs/model_zoo.md b/docs/model_zoo.md
index fb86c5e37..0dd1b410b 100644
--- a/docs/model_zoo.md
+++ b/docs/model_zoo.md
@@ -111,6 +111,10 @@ Please refer to [EMANet](https://github.com/open-mmlab/mmsegmentation/blob/maste
 
 Please refer to [DNLNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet) for details.
 
+### CGNet
+
+Please refer to [CGNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/cgnet) for details.
+
 ### Mixed Precision (FP16) Training
 
 Please refer [Mixed Precision (FP16) Training](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fp16/README.md) for details.
diff --git a/mmseg/models/backbones/__init__.py b/mmseg/models/backbones/__init__.py
index db5eb1c5c..86174ac7a 100644
--- a/mmseg/models/backbones/__init__.py
+++ b/mmseg/models/backbones/__init__.py
@@ -1,3 +1,4 @@
+from .cgnet import CGNet
 from .fast_scnn import FastSCNN
 from .hrnet import HRNet
 from .mobilenet_v2 import MobileNetV2
@@ -8,5 +9,5 @@ from .unet import UNet
 
 __all__ = [
     'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN',
-    'ResNeSt', 'MobileNetV2', 'UNet'
+    'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet'
 ]
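Before diving into the implementation that follows, a quick smoke test of the newly registered backbone. This is a sketch; the shapes are inferred from the default settings and are consistent with the unit tests added at the end of this patch.

```python
# Quick smoke test of the new CGNet backbone.
import torch

from mmseg.models.backbones import CGNet

model = CGNet()  # defaults: num_channels=(32, 64, 128), num_blocks=(3, 21)
model.init_weights()
outs = model(torch.randn(1, 3, 512, 1024))
for i, out in enumerate(outs):
    print(f'stage {i}: {tuple(out.shape)}')
# stage 0: (1, 35, 256, 512)   -> 32 stem channels + 3 injected image channels
# stage 1: (1, 131, 128, 256)  -> 2 * 64 block channels + 3 image channels
# stage 2: (1, 256, 64, 128)   -> 2 * 128 block channels
```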
diff --git a/mmseg/models/backbones/cgnet.py b/mmseg/models/backbones/cgnet.py
new file mode 100644
index 000000000..968d171cd
--- /dev/null
+++ b/mmseg/models/backbones/cgnet.py
@@ -0,0 +1,367 @@
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer,
+                      constant_init, kaiming_init)
+from mmcv.runner import load_checkpoint
+from mmcv.utils.parrots_wrapper import _BatchNorm
+
+from mmseg.utils import get_root_logger
+from ..builder import BACKBONES
+
+
+class GlobalContextExtractor(nn.Module):
+    """Global Context Extractor for CGNet.
+
+    This class is employed to refine the joint feature of both local feature
+    and surrounding context.
+
+    Args:
+        channel (int): Number of input feature channels.
+        reduction (int): Reduction for global context extractor. Default: 16.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+    """
+
+    def __init__(self, channel, reduction=16, with_cp=False):
+        super(GlobalContextExtractor, self).__init__()
+        self.channel = channel
+        self.reduction = reduction
+        assert reduction >= 1 and channel >= reduction
+        self.with_cp = with_cp
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Sequential(
+            nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True),
+            nn.Linear(channel // reduction, channel), nn.Sigmoid())
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            num_batch, num_channel = x.size()[:2]
+            y = self.avg_pool(x).view(num_batch, num_channel)
+            y = self.fc(y).view(num_batch, num_channel, 1, 1)
+            return x * y
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        return out
+
+
+class ContextGuidedBlock(nn.Module):
+    """Context Guided Block for CGNet.
+
+    This class consists of four components: local feature extractor,
+    surrounding feature extractor, joint feature extractor and global
+    context extractor.
+
+    Args:
+        in_channels (int): Number of input feature channels.
+        out_channels (int): Number of output feature channels.
+        dilation (int): Dilation rate for surrounding context extractor.
+            Default: 2.
+        reduction (int): Reduction for global context extractor. Default: 16.
+        skip_connect (bool): Add input to output or not. Default: True.
+        downsample (bool): Downsample the input to 1/2 or not. Default: False.
+        conv_cfg (dict): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN', requires_grad=True).
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='PReLU').
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 dilation=2,
+                 reduction=16,
+                 skip_connect=True,
+                 downsample=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='PReLU'),
+                 with_cp=False):
+        super(ContextGuidedBlock, self).__init__()
+        self.with_cp = with_cp
+        self.downsample = downsample
+
+        channels = out_channels if downsample else out_channels // 2
+        if 'type' in act_cfg and act_cfg['type'] == 'PReLU':
+            act_cfg['num_parameters'] = channels
+        kernel_size = 3 if downsample else 1
+        stride = 2 if downsample else 1
+        padding = (kernel_size - 1) // 2
+
+        self.conv1x1 = ConvModule(
+            in_channels,
+            channels,
+            kernel_size,
+            stride,
+            padding,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        self.f_loc = build_conv_layer(
+            conv_cfg,
+            channels,
+            channels,
+            kernel_size=3,
+            padding=1,
+            groups=channels,
+            bias=False)
+        self.f_sur = build_conv_layer(
+            conv_cfg,
+            channels,
+            channels,
+            kernel_size=3,
+            padding=dilation,
+            groups=channels,
+            dilation=dilation,
+            bias=False)
+
+        self.bn = build_norm_layer(norm_cfg, 2 * channels)[1]
+        self.activate = nn.PReLU(2 * channels)
+
+        if downsample:
+            self.bottleneck = build_conv_layer(
+                conv_cfg,
+                2 * channels,
+                out_channels,
+                kernel_size=1,
+                bias=False)
+
+        self.skip_connect = skip_connect and not downsample
+        self.f_glo = GlobalContextExtractor(out_channels, reduction, with_cp)
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            out = self.conv1x1(x)
+            loc = self.f_loc(out)
+            sur = self.f_sur(out)
+
+            joi_feat = torch.cat([loc, sur], 1)  # the joint feature
+            joi_feat = self.bn(joi_feat)
+            joi_feat = self.activate(joi_feat)
+            if self.downsample:
+                joi_feat = self.bottleneck(joi_feat)  # channel = out_channels
+            # f_glo is employed to refine the joint feature
+            out = self.f_glo(joi_feat)
+
+            if self.skip_connect:
+                return x + out
+            else:
+                return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        return out
+
+
+class InputInjection(nn.Module):
+    """Downsampling module for CGNet."""
+
+    def __init__(self, num_downsampling):
+        super(InputInjection, self).__init__()
+        self.pool = nn.ModuleList()
+        for i in range(num_downsampling):
+            self.pool.append(nn.AvgPool2d(3, stride=2, padding=1))
+
+    def forward(self, x):
+        for pool in self.pool:
+            x = pool(x)
+        return x
+
+
+@BACKBONES.register_module()
+class CGNet(nn.Module):
+    """CGNet backbone.
+
+    A Light-weight Context Guided Network for Semantic Segmentation
+    arXiv: https://arxiv.org/abs/1811.08201
+
+    Args:
+        in_channels (int): Number of input image channels. Normally 3.
+        num_channels (tuple[int]): Numbers of feature channels at each stage.
+            Default: (32, 64, 128).
+        num_blocks (tuple[int]): Numbers of CG blocks at stage 1 and stage 2.
+            Default: (3, 21).
+        dilations (tuple[int]): Dilation rates for surrounding context
+            extractors at stage 1 and stage 2. Default: (2, 4).
+        reductions (tuple[int]): Reductions for global context extractors at
+            stage 1 and stage 2. Default: (8, 16).
+        conv_cfg (dict): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN', requires_grad=True).
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='PReLU').
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+    """
+
+    def __init__(self,
+                 in_channels=3,
+                 num_channels=(32, 64, 128),
+                 num_blocks=(3, 21),
+                 dilations=(2, 4),
+                 reductions=(8, 16),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 act_cfg=dict(type='PReLU'),
+                 norm_eval=False,
+                 with_cp=False):
+
+        super(CGNet, self).__init__()
+        self.in_channels = in_channels
+        self.num_channels = num_channels
+        assert isinstance(self.num_channels, tuple) and len(
+            self.num_channels) == 3
+        self.num_blocks = num_blocks
+        assert isinstance(self.num_blocks, tuple) and len(self.num_blocks) == 2
+        self.dilations = dilations
+        assert isinstance(self.dilations, tuple) and len(self.dilations) == 2
+        self.reductions = reductions
+        assert isinstance(self.reductions, tuple) and len(self.reductions) == 2
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        if 'type' in self.act_cfg and self.act_cfg['type'] == 'PReLU':
+            self.act_cfg['num_parameters'] = num_channels[0]
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+
+        cur_channels = in_channels
+        self.stem = nn.ModuleList()
+        for i in range(3):
+            self.stem.append(
+                ConvModule(
+                    cur_channels,
+                    num_channels[0],
+                    3,
+                    2 if i == 0 else 1,
+                    padding=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+            cur_channels = num_channels[0]
+
+        self.inject_2x = InputInjection(1)  # down-sample for Input, factor=2
+        self.inject_4x = InputInjection(2)  # down-sample for Input, factor=4
+
+        cur_channels += in_channels
+        self.norm_prelu_0 = nn.Sequential(
+            build_norm_layer(norm_cfg, cur_channels)[1],
+            nn.PReLU(cur_channels))
+
+        # stage 1
+        self.level1 = nn.ModuleList()
+        for i in range(num_blocks[0]):
+            self.level1.append(
+                ContextGuidedBlock(
+                    cur_channels if i == 0 else num_channels[1],
+                    num_channels[1],
+                    dilations[0],
+                    reductions[0],
+                    downsample=(i == 0),
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    with_cp=with_cp))  # CG block
+
+        cur_channels = 2 * num_channels[1] + in_channels
+        self.norm_prelu_1 = nn.Sequential(
+            build_norm_layer(norm_cfg, cur_channels)[1],
+            nn.PReLU(cur_channels))
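+
+        # Note on channel bookkeeping: every stage output is a concatenation.
+        # Stage 0 joins the stem features with the 2x down-sampled image
+        # (num_channels[0] + in_channels); stage 1 joins its first and last
+        # block outputs with the 4x down-sampled image
+        # (2 * num_channels[1] + in_channels); stage 2 joins its first and
+        # last block outputs (2 * num_channels[2]).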
+        # stage 2
+        self.level2 = nn.ModuleList()
+        for i in range(num_blocks[1]):
+            self.level2.append(
+                ContextGuidedBlock(
+                    cur_channels if i == 0 else num_channels[2],
+                    num_channels[2],
+                    dilations[1],
+                    reductions[1],
+                    downsample=(i == 0),
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg,
+                    with_cp=with_cp))  # CG block
+
+        cur_channels = 2 * num_channels[2]
+        self.norm_prelu_2 = nn.Sequential(
+            build_norm_layer(norm_cfg, cur_channels)[1],
+            nn.PReLU(cur_channels))
+
+    def forward(self, x):
+        output = []
+
+        # stage 0
+        inp_2x = self.inject_2x(x)
+        inp_4x = self.inject_4x(x)
+        for layer in self.stem:
+            x = layer(x)
+        x = self.norm_prelu_0(torch.cat([x, inp_2x], 1))
+        output.append(x)
+
+        # stage 1
+        for i, layer in enumerate(self.level1):
+            x = layer(x)
+            if i == 0:
+                down1 = x
+        x = self.norm_prelu_1(torch.cat([x, down1, inp_4x], 1))
+        output.append(x)
+
+        # stage 2
+        for i, layer in enumerate(self.level2):
+            x = layer(x)
+            if i == 0:
+                down2 = x
+        x = self.norm_prelu_2(torch.cat([down2, x], 1))
+        output.append(x)
+
+        return output
+
+    def init_weights(self, pretrained=None):
+        """Initialize the weights in backbone.
+
+        Args:
+            pretrained (str, optional): Path to pre-trained weights.
+                Defaults to None.
+        """
+        if isinstance(pretrained, str):
+            logger = get_root_logger()
+            load_checkpoint(self, pretrained, strict=False, logger=logger)
+        elif pretrained is None:
+            for m in self.modules():
+                if isinstance(m, (nn.Conv2d, nn.Linear)):
+                    kaiming_init(m)
+                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
+                    constant_init(m, 1)
+                elif isinstance(m, nn.PReLU):
+                    constant_init(m, 0)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keeping the
+        normalization layers frozen."""
+        super(CGNet, self).train(mode)
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval has an effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
diff --git a/mmseg/models/decode_heads/fcn_head.py b/mmseg/models/decode_heads/fcn_head.py
index ff48b5197..d660847f8 100644
--- a/mmseg/models/decode_heads/fcn_head.py
+++ b/mmseg/models/decode_heads/fcn_head.py
@@ -24,11 +24,14 @@ class FCNHead(BaseDecodeHead):
                  kernel_size=3,
                  concat_input=True,
                  **kwargs):
-        assert num_convs > 0
+        assert num_convs >= 0
         self.num_convs = num_convs
         self.concat_input = concat_input
         self.kernel_size = kernel_size
         super(FCNHead, self).__init__(**kwargs)
+        if num_convs == 0:
+            assert self.in_channels == self.channels
+
         convs = []
         convs.append(
             ConvModule(
@@ -49,7 +52,10 @@ class FCNHead(BaseDecodeHead):
                     conv_cfg=self.conv_cfg,
                     norm_cfg=self.norm_cfg,
                     act_cfg=self.act_cfg))
-        self.convs = nn.Sequential(*convs)
+        if num_convs == 0:
+            self.convs = nn.Identity()
+        else:
+            self.convs = nn.Sequential(*convs)
         if self.concat_input:
             self.conv_cat = ConvModule(
                 self.in_channels + self.channels,
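To make the new degenerate case concrete, here is a small usage sketch. The `cls_seg` classifier mentioned in the comment lives in `BaseDecodeHead`, which is not part of this diff.

```python
# Sketch: with num_convs=0 and concat_input=False, FCNHead reduces to the
# bare classifier conv from BaseDecodeHead, so in_channels must equal
# channels (enforced by the new assert above).
import torch

from mmseg.models.decode_heads import FCNHead

head = FCNHead(in_channels=256, channels=256, num_convs=0,
               concat_input=False, num_classes=19)
assert isinstance(head.convs, torch.nn.Identity)
logits = head([torch.randn(1, 256, 64, 128)])  # -> (1, 19, 64, 128)
```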
diff --git a/tests/test_models/test_backbone.py b/tests/test_models/test_backbone.py
index 3f654b7d7..25467f86e 100644
--- a/tests/test_models/test_backbone.py
+++ b/tests/test_models/test_backbone.py
@@ -4,8 +4,10 @@ from mmcv.ops import DeformConv2dPack
 from mmcv.utils.parrots_wrapper import _BatchNorm
 from torch.nn.modules import AvgPool2d, GroupNorm
 
-from mmseg.models.backbones import (FastSCNN, ResNeSt, ResNet, ResNetV1d,
-                                    ResNeXt)
+from mmseg.models.backbones import (CGNet, FastSCNN, ResNeSt, ResNet,
+                                    ResNetV1d, ResNeXt)
+from mmseg.models.backbones.cgnet import (ContextGuidedBlock,
+                                          GlobalContextExtractor)
 from mmseg.models.backbones.resnest import Bottleneck as BottleneckS
 from mmseg.models.backbones.resnet import BasicBlock, Bottleneck
 from mmseg.models.backbones.resnext import Bottleneck as BottleneckX
@@ -729,3 +731,147 @@ def test_resnest_backbone():
     assert feat[1].shape == torch.Size([2, 512, 28, 28])
     assert feat[2].shape == torch.Size([2, 1024, 14, 14])
     assert feat[3].shape == torch.Size([2, 2048, 7, 7])
+
+
+def test_cgnet_GlobalContextExtractor():
+    block = GlobalContextExtractor(16, 16, with_cp=True)
+    x = torch.randn(2, 16, 64, 64, requires_grad=True)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([2, 16, 64, 64])
+
+
+def test_cgnet_context_guided_block():
+    with pytest.raises(AssertionError):
+        # ContextGuidedBlock's GlobalContextExtractor requires
+        # channel >= reduction >= 1.
+        ContextGuidedBlock(8, 8)
+
+    # test cgnet ContextGuidedBlock with checkpoint forward
+    block = ContextGuidedBlock(
+        16, 16, act_cfg=dict(type='PReLU'), with_cp=True)
+    assert block.with_cp
+    x = torch.randn(2, 16, 64, 64, requires_grad=True)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([2, 16, 64, 64])
+
+    # test cgnet ContextGuidedBlock without checkpoint forward
+    block = ContextGuidedBlock(32, 32)
+    assert not block.with_cp
+    x = torch.randn(3, 32, 32, 32)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([3, 32, 32, 32])
+
+    # test cgnet ContextGuidedBlock with downsampling
+    block = ContextGuidedBlock(32, 32, downsample=True)
+    assert block.conv1x1.conv.in_channels == 32
+    assert block.conv1x1.conv.out_channels == 32
+    assert block.conv1x1.conv.kernel_size == (3, 3)
+    assert block.conv1x1.conv.stride == (2, 2)
+    assert block.conv1x1.conv.padding == (1, 1)
+
+    assert block.f_loc.in_channels == 32
+    assert block.f_loc.out_channels == 32
+    assert block.f_loc.kernel_size == (3, 3)
+    assert block.f_loc.stride == (1, 1)
+    assert block.f_loc.padding == (1, 1)
+    assert block.f_loc.groups == 32
+    assert block.f_loc.dilation == (1, 1)
+    assert block.f_loc.bias is None
+
+    assert block.f_sur.in_channels == 32
+    assert block.f_sur.out_channels == 32
+    assert block.f_sur.kernel_size == (3, 3)
+    assert block.f_sur.stride == (1, 1)
+    assert block.f_sur.padding == (2, 2)
+    assert block.f_sur.groups == 32
+    assert block.f_sur.dilation == (2, 2)
+    assert block.f_sur.bias is None
+
+    assert block.bottleneck.in_channels == 64
+    assert block.bottleneck.out_channels == 32
+    assert block.bottleneck.kernel_size == (1, 1)
+    assert block.bottleneck.stride == (1, 1)
+    assert block.bottleneck.bias is None
+
+    x = torch.randn(1, 32, 32, 32)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 32, 16, 16])
+
+    # test cgnet ContextGuidedBlock without downsampling
+    block = ContextGuidedBlock(32, 32, downsample=False)
+    assert block.conv1x1.conv.in_channels == 32
+    assert block.conv1x1.conv.out_channels == 16
+    assert block.conv1x1.conv.kernel_size == (1, 1)
+    assert block.conv1x1.conv.stride == (1, 1)
+    assert block.conv1x1.conv.padding == (0, 0)
+
+    assert block.f_loc.in_channels == 16
+    assert block.f_loc.out_channels == 16
+    assert block.f_loc.kernel_size == (3, 3)
+    assert block.f_loc.stride == (1, 1)
+    assert block.f_loc.padding == (1, 1)
+    assert block.f_loc.groups == 16
+    assert block.f_loc.dilation == (1, 1)
+    assert block.f_loc.bias is None
+
+    assert block.f_sur.in_channels == 16
+    assert block.f_sur.out_channels == 16
+    assert block.f_sur.kernel_size == (3, 3)
+    assert block.f_sur.stride == (1, 1)
+    assert block.f_sur.padding == (2, 2)
+    assert block.f_sur.groups == 16
+    assert block.f_sur.dilation == (2, 2)
+    assert block.f_sur.bias is None
+
+    x = torch.randn(1, 32, 32, 32)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 32, 32, 32])
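+
+
+# With the default settings, the three CGNet outputs are expected to have
+#   num_channels[0] + in_channels     = 32 + 3      = 35 channels (1/2 scale),
+#   2 * num_channels[1] + in_channels = 2 * 64 + 3  = 131 channels (1/4 scale),
+#   2 * num_channels[2]               = 2 * 128     = 256 channels (1/8 scale),
+# which the shape asserts below verify on 224x224 inputs.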
+def test_cgnet_backbone():
+    with pytest.raises(AssertionError):
+        # check invalid num_channels
+        CGNet(num_channels=(32, 64, 128, 256))
+
+    with pytest.raises(AssertionError):
+        # check invalid num_blocks
+        CGNet(num_blocks=(3, 21, 3))
+
+    with pytest.raises(AssertionError):
+        # check invalid dilations
+        CGNet(dilations=2)
+
+    with pytest.raises(AssertionError):
+        # check invalid reductions
+        CGNet(reductions=16)
+
+    with pytest.raises(AssertionError):
+        # check invalid num_channels and reductions
+        CGNet(num_channels=(32, 64, 128), reductions=(64, 129))
+
+    # Test CGNet with default settings
+    model = CGNet()
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(2, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 3
+    assert feat[0].shape == torch.Size([2, 35, 112, 112])
+    assert feat[1].shape == torch.Size([2, 131, 56, 56])
+    assert feat[2].shape == torch.Size([2, 256, 28, 28])
+
+    # Test CGNet with norm_eval True and with_cp True
+    model = CGNet(norm_eval=True, with_cp=True)
+    with pytest.raises(TypeError):
+        # check invalid pretrained
+        model.init_weights(pretrained=8)
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(2, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 3
+    assert feat[0].shape == torch.Size([2, 35, 112, 112])
+    assert feat[1].shape == torch.Size([2, 131, 56, 56])
+    assert feat[2].shape == torch.Size([2, 256, 28, 28])
diff --git a/tests/test_models/test_heads.py b/tests/test_models/test_heads.py
index 8e60a915c..acf290226 100644
--- a/tests/test_models/test_heads.py
+++ b/tests/test_models/test_heads.py
@@ -105,8 +105,8 @@ def test_decode_head():
 
 def test_fcn_head():
     with pytest.raises(AssertionError):
-        # num_convs must be larger than 0
-        FCNHead(num_classes=19, num_convs=0)
+        # num_convs must not be less than 0
+        FCNHead(num_classes=19, num_convs=-1)
 
     # test no norm_cfg
     head = FCNHead(in_channels=32, channels=16, num_classes=19)
@@ -178,6 +178,20 @@ def test_fcn_head():
         outputs = head(inputs)
         assert outputs.shape == (1, head.num_classes, 45, 45)
 
+    # test num_convs = 0
+    inputs = [torch.randn(1, 32, 45, 45)]
+    head = FCNHead(
+        in_channels=32,
+        channels=32,
+        num_classes=19,
+        num_convs=0,
+        concat_input=False)
+    if torch.cuda.is_available():
+        head, inputs = to_cuda(head, inputs)
+    assert isinstance(head.convs, torch.nn.Identity)
+    outputs = head(inputs)
+    assert outputs.shape == (1, head.num_classes, 45, 45)
+
 
 def test_psp_head():
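Finally, an end-to-end sketch for trying the released checkpoints from the table above. It assumes the mmseg v0.x high-level API and a locally downloaded checkpoint; the demo image path is hypothetical.

```python
# Run a released CGNet checkpoint on one image (a sketch, not part of the PR).
from mmseg.apis import inference_segmentor, init_segmentor

config_file = 'configs/cgnet/cgnet_512x1024_60k_cityscapes.py'
checkpoint_file = 'cgnet_512x1024_60k_cityscapes_20201101_110254-124ea03b.pth'

model = init_segmentor(config_file, checkpoint_file, device='cuda:0')
result = inference_segmentor(model, 'demo/demo.png')  # hypothetical path
# result[0] is an (H, W) array of Cityscapes class indices.
```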