init commit: fast_scnn

2020-07-31 14:16:00 +08:00 · 2020-07-31 14:16:00 +08:00 · 6435e3e162
parent 2b801dedfc
commit 6435e3e162
9 changed files with 645 additions and 71 deletions
--- a/.gitignore
+++ b/.gitignore
@ -113,6 +113,10 @@ data
 *.pkl.json
 *.log.json
 work_dirs/
+workdirs/
+configs_unify/

 # Pytorch
 *.pth
+
+
--- a/configs/_base_/models/fast_scnn.py
+++ b/configs/_base_/models/fast_scnn.py
@ -0,0 +1,55 @@
+# model settings: FastSCNN backbone, one SepFCNHead decode head, two FCNHead auxiliary heads
+norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01)  # shared by the backbone and every head below
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='FastSCNN',
+        downsample_dw_channels1=32,
+        downsample_dw_channels2=48,
+        global_in_channels=64,  # also the output width of the learning-to-downsample stem
+        global_block_channels=(64, 96, 128),
+        global_out_channels=128,
+        higher_in_channels=64,  # fusion consumes the stem output, so this should equal global_in_channels
+        lower_in_channels=128,  # fusion consumes the global branch, so this should equal global_out_channels
+        fusion_out_channels=128,
+        scale_factor=4,  # upsampling applied to the global branch inside the fusion module
+        out_indices=(0, 1, 2),  # FastSCNN.forward orders outputs as (stem, global, fused)
+        norm_cfg=norm_cfg,
+        align_corners=False),
+    decode_head=dict(
+        type='SepFCNHead',
+        in_channels=128,  # equals fusion_out_channels
+        channels=128,
+        concat_input=False,
+        num_classes=19,
+        in_index=-1,  # presumably selects the last backbone output (fused features) - confirm
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.)),
+    auxiliary_head=[
+        dict(
+            type='FCNHead',
+            in_channels=128,  # equals global_out_channels
+            channels=32,
+            num_convs=1,
+            num_classes=19,
+            in_index=-2,  # presumably the global-branch output - confirm
+            norm_cfg=norm_cfg,
+            concat_input=False,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+        dict(
+            type='FCNHead',
+            in_channels=64,  # equals global_in_channels
+            channels=32,
+            num_convs=1,
+            num_classes=19,
+            in_index=-3,  # presumably the stem output - confirm
+            norm_cfg=norm_cfg,
+            concat_input=False,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+    ])
--- a/configs/fastscnn/fast_scnn_4x3_1000e_cityscapes.py
+++ b/configs/fastscnn/fast_scnn_4x3_1000e_cityscapes.py
@ -0,0 +1,61 @@
+_base_ = [
+    '../_base_/models/fast_scnn.py', '../_base_/datasets/cityscapes.py',
+    '../_base_/default_runtime.py'
+]
+crop_size = (512, 1024)  # used by RandomCrop in the training pipeline
+cudnn_benchmark = True
+# model training and testing settings
+train_cfg = dict()
+test_cfg = dict(mode='whole')
+
+# TODO(review): confirm whether LoadAnnotations needs an explicit 'with_seg' argument.
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadAnnotations'),   # 'with_seg=True' is not passed; see the TODO above img_norm_cfg
+    dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='RandomCrop', crop_size=crop_size),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2048, 1024),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=3,  # the "4x3" in the file name presumably means 4 GPUs x 3 samples - confirm
+    workers_per_gpu=3,
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),  # validation reuses the test pipeline
+    test=dict(pipeline=test_pipeline))
+
+# optimizer
+optimizer = dict(type='SGD', lr=0.045, momentum=0.9, weight_decay=4e-5)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(
+    policy='poly',
+    power=0.9,
+    by_epoch=False,  # the schedule is driven by iterations, not epochs
+)
+# runtime settings
+# total_epochs = 1000  (disabled; the run length is set by total_iters below)
+total_iters = 10000  # NOTE(review): the file name says "1000e" but the run is 10000 iterations - confirm
+evaluation = dict(interval=100, metric='mIoU')
+checkpoint_config = dict(interval=100)
+
--- a/mmseg/models/backbones/__init__.py
+++ b/mmseg/models/backbones/__init__.py
@ -1,5 +1,6 @@
 from .hrnet import HRNet
 from .resnet import ResNet, ResNetV1c, ResNetV1d
 from .resnext import ResNeXt
+from .fast_scnn import FastSCNN

-__all__ = ['ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet']
+__all__ = ['ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN']
--- a/mmseg/models/backbones/fast_scnn.py
+++ b/mmseg/models/backbones/fast_scnn.py
@ -0,0 +1,248 @@
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, constant_init, kaiming_init
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmseg.models.backbones.mobile_net_v2 import InvertedResidual
+from mmseg.models.decode_heads.psp_head import PPM
+from mmseg.ops import DepthwiseSeparableConvModule, resize
+from ..builder import BACKBONES
+
+
+class LearningToDownsample(nn.Module):
+    """Stem of three stride-2 convs: one plain, two depthwise-separable."""
+
+    def __init__(self,
+                 in_channels,
+                 dw_channels1,
+                 dw_channels2,
+                 out_channels,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU')):
+        super(LearningToDownsample, self).__init__()
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.conv = ConvModule(
+            in_channels,
+            dw_channels1,
+            3,  # kernel size; no padding argument is passed for this conv
+            stride=2,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+        self.dsconv1 = DepthwiseSeparableConvModule(
+            dw_channels1,
+            dw_channels2,
+            stride=2,
+            relu_first=False,  # selects the conv -> norm -> ReLU ordering of the separable module
+            norm_cfg=self.norm_cfg)  # NOTE(review): conv_cfg / act_cfg are not forwarded here
+        self.dsconv2 = DepthwiseSeparableConvModule(
+            dw_channels2,
+            out_channels,
+            stride=2,
+            relu_first=False,
+            norm_cfg=self.norm_cfg)
+
+    def forward(self, x):
+        x = self.conv(x)  # the three stride-2 stages run in sequence
+        x = self.dsconv1(x)
+        x = self.dsconv2(x)
+        return x
+
+
+class GlobalFeatureExtractor(nn.Module):
+    """Three InvertedResidual stages followed by a PPM and a 1x1 ConvModule."""
+
+    def __init__(self,
+                 in_channels=64,
+                 block_channels=(64, 96, 128),
+                 out_channels=128,
+                 t=6,  # expansion ratio handed to every InvertedResidual
+                 num_blocks=(3, 3, 3),  # number of InvertedResidual blocks per stage
+                 pool_scales=(1, 2, 3, 6),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 align_corners=True):
+        super(GlobalFeatureExtractor, self).__init__()
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        assert len(block_channels) == len(num_blocks) == 3  # exactly three stages are built below
+        self.bottleneck1 = self._make_layer(in_channels, block_channels[0],
+                                            num_blocks[0], t, 2)  # stride 2
+        self.bottleneck2 = self._make_layer(block_channels[0],
+                                            block_channels[1], num_blocks[1],
+                                            t, 2)  # stride 2
+        self.bottleneck3 = self._make_layer(block_channels[1],
+                                            block_channels[2], num_blocks[2],
+                                            t, 1)  # stride 1: no further downsampling
+        self.ppm = PPM(
+            pool_scales,
+            block_channels[2],
+            block_channels[2] // 4,  # channels requested per pooling branch
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg,
+            align_corners=align_corners)
+        self.out = ConvModule(
+            block_channels[2] * 2,  # NOTE(review): presumably input + 4 PPM branches; only matches when len(pool_scales) == 4 - confirm
+            out_channels,
+            1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+
+    def _make_layer(self, inplanes, planes, blocks, t=6, stride=1):
+        layers = []
+        layers.append(
+            InvertedResidual(
+                inplanes, planes, stride, t, norm_cfg=self.norm_cfg))  # only the first block strides / changes width
+        for i in range(1, blocks):
+            layers.append(
+                InvertedResidual(planes, planes, 1, t, norm_cfg=self.norm_cfg))  # NOTE(review): conv_cfg / act_cfg are not forwarded, so InvertedResidual uses its own defaults
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.bottleneck1(x)
+        x = self.bottleneck2(x)
+        x = self.bottleneck3(x)
+        x = torch.cat([x, *self.ppm(x)], dim=1)  # join the stage output with the PPM outputs along dim 1
+        x = self.out(x)
+        return x
+
+
+class FeatureFusionModule(nn.Module):
+    """Resize the lower-resolution branch, project both branches, add, ReLU."""
+
+    def __init__(self,
+                 higher_in_channels,
+                 lower_in_channels,
+                 out_channels,
+                 scale_factor,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 align_corners=True):
+        super(FeatureFusionModule, self).__init__()
+        self.scale_factor = scale_factor
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.align_corners = align_corners
+        self.dwconv = ConvModule(  # NOTE(review): named "dwconv" but built as a 1x1 ConvModule without groups - confirm intent
+            lower_in_channels,
+            out_channels,
+            1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+        self.conv_lower_res = ConvModule(
+            out_channels,
+            out_channels,
+            1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=None)  # no activation before the sum
+        self.conv_higher_res = ConvModule(
+            higher_in_channels,
+            out_channels,
+            1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=None)  # no activation before the sum
+        self.relu = nn.ReLU(True)  # in-place ReLU applied to the summed features
+
+    def forward(self, higher_res_feature, lower_res_feature):
+        lower_res_feature = resize(
+            lower_res_feature,
+            scale_factor=self.scale_factor,  # both branches must have equal spatial size after this for the sum below
+            mode='bilinear',
+            align_corners=self.align_corners)
+        lower_res_feature = self.dwconv(lower_res_feature)
+        lower_res_feature = self.conv_lower_res(lower_res_feature)
+
+        higher_res_feature = self.conv_higher_res(higher_res_feature)
+        out = higher_res_feature + lower_res_feature
+        return self.relu(out)
+
+
+@BACKBONES.register_module()
+class FastSCNN(nn.Module):  # backbone chaining LearningToDownsample -> GlobalFeatureExtractor -> FeatureFusionModule
+
+    def __init__(self,
+                 in_channels=3,
+                 downsample_dw_channels1=32,
+                 downsample_dw_channels2=48,
+                 global_in_channels=64,  # output width of the stem and input width of the global branch
+                 global_block_channels=(64, 96, 128),
+                 global_out_channels=128,
+                 higher_in_channels=64,  # width of the stem output fed to the fusion module
+                 lower_in_channels=128,  # width of the global-branch output fed to the fusion module
+                 fusion_out_channels=128,
+                 scale_factor=4,
+                 out_indices=(0, 1, 2),  # indices into (stem, global, fused) outputs, see forward
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 align_corners=False):
+        super(FastSCNN, self).__init__()
+        self.in_channels = in_channels
+        self.downsample_dw_channels1 = downsample_dw_channels1
+        self.downsample_dw_channels2 = downsample_dw_channels2
+        self.global_in_channels = global_in_channels
+        self.global_block_channels = global_block_channels
+        self.global_out_channels = global_out_channels
+        self.higher_in_channels = higher_in_channels
+        self.lower_in_channels = lower_in_channels
+        self.fusion_out_channels = fusion_out_channels
+        self.scale_factor = scale_factor
+        self.out_indices = out_indices
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.align_corners = align_corners
+        self.learning_to_downsample = LearningToDownsample(
+            in_channels,
+            downsample_dw_channels1,
+            downsample_dw_channels2,
+            global_in_channels,  # the stem's out_channels
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+        self.global_feature_extractor = GlobalFeatureExtractor(
+            global_in_channels,
+            global_block_channels,
+            global_out_channels,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg,
+            align_corners=self.align_corners)
+        self.feature_fusion = FeatureFusionModule(
+            higher_in_channels,
+            lower_in_channels,
+            fusion_out_channels,
+            scale_factor=self.scale_factor,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg,
+            align_corners=self.align_corners)
+
+    def init_weights(self, pretrained=None):  # NOTE(review): 'pretrained' is accepted but never used
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                kaiming_init(m)
+            elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
+                constant_init(m, 1)
+
+    def forward(self, x):
+        higher_res_features = self.learning_to_downsample(x)
+        lower_res_features = self.global_feature_extractor(higher_res_features)
+        fusion_output = self.feature_fusion(higher_res_features,
+                                            lower_res_features)
+
+        outs = [higher_res_features, lower_res_features, fusion_output]  # fixed order that out_indices refers to
+        outs = [outs[i] for i in self.out_indices]
+        return tuple(outs)
--- a/mmseg/models/backbones/mobile_net_v2.py
+++ b/mmseg/models/backbones/mobile_net_v2.py
@ -0,0 +1,203 @@
+from mmcv.cnn import (ConvModule, build_norm_layer, constant_init,
+                      kaiming_init, normal_init)
+from mmcv.runner import load_checkpoint
+from torch import nn
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmseg.utils import get_root_logger
+from ..builder import BACKBONES
+
+
+class InvertedResidual(nn.Module):  # optional 1x1 expand -> 3x3 depthwise -> 1x1 linear projection, with optional shortcut
+
+    def __init__(self,
+                 inp,
+                 oup,
+                 stride,
+                 expand_ratio,
+                 dilation=1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU6')):
+        super(InvertedResidual, self).__init__()
+        self.stride = stride
+        assert stride in [1, 2]
+
+        hidden_dim = int(round(inp * expand_ratio))  # width of the expanded representation
+        self.use_res_connect = self.stride == 1 and inp == oup  # shortcut only when input and output shapes match
+
+        layers = []
+        if expand_ratio != 1:  # with a ratio of 1 hidden_dim equals inp, so the expansion conv is skipped
+            # pw
+            layers.append(
+                ConvModule(
+                    inp,
+                    hidden_dim,
+                    kernel_size=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+        layers.extend([
+            # dw
+            ConvModule(
+                hidden_dim,
+                hidden_dim,
+                kernel_size=3,
+                padding=dilation,  # keeps the spatial size for a 3x3 kernel at stride 1
+                stride=stride,
+                dilation=dilation,
+                groups=hidden_dim,  # one filter group per channel (depthwise)
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg),
+            # pw-linear: projection conv and norm with no activation afterwards
+            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+            build_norm_layer(norm_cfg, oup)[1],  # element [1] of the returned pair is used as the layer
+        ])
+        self.conv = nn.Sequential(*layers)
+
+    def forward(self, x):
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+@BACKBONES.register_module()
+class MobileNetV2(nn.Module):
+    arch_settings = (
+        InvertedResidual,
+        [
+            # t, c, n, s
+            [1, 16, 1, 1],
+            [6, 24, 2, 2],
+            [6, 32, 3, 2],
+            [6, 64, 4, 2],
+            [6, 96, 3, 1],
+            [6, 160, 3, 2],
+            [6, 320, 1, 1]
+        ])
+
+    def __init__(self,
+                 in_channels=3,
+                 dilations=(1, 1, 1, 1, 1),
+                 out_indices=(0, 1, 2, 3),
+                 input_channels=32,
+                 width_mult=1.0,
+                 round_nearest=8,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU6')):
+        """
+        MobileNet V2 main class
+        Args:
+            width_mult (float): Width multiplier - adjusts number of channels
+                in each layer by this amount
+            round_nearest (int): Round the number of channels in each layer to
+                be a multiple of this number
+            Set to 1 to turn off rounding
+            block: Module specifying inverted residual building block for
+                mobilenet
+        """
+        super(MobileNetV2, self).__init__()
+        self.in_channels = in_channels
+        self.width_mult = width_mult
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+
+        block, inverted_residual_setting = self.arch_settings
+        self.dilations = dilations
+        self.out_indices = out_indices
+
+        # building first layer
+        input_channels = int(
+            input_channels *
+            self.width_mult) if self.width_mult > 1.0 else input_channels
+        # last_channels = int(1280 * multiplier) if multiplier > 1.0 else 1280
+        self.conv1 = ConvModule(
+            3,
+            input_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+
+        # building inverted residual blocks
+        self.planes = input_channels
+        self.block1 = self._make_layer(block, self.planes,
+                                       inverted_residual_setting[0:1],
+                                       dilations[0])
+        self.block2 = self._make_layer(block, self.planes,
+                                       inverted_residual_setting[1:2],
+                                       dilations[1])
+        self.block3 = self._make_layer(block, self.planes,
+                                       inverted_residual_setting[2:3],
+                                       dilations[2])
+        self.block4 = self._make_layer(block, self.planes,
+                                       inverted_residual_setting[3:5],
+                                       dilations[3])
+        self.block5 = self._make_layer(block, self.planes,
+                                       inverted_residual_setting[5:],
+                                       dilations[4])
+
+    def _make_layer(self,
+                    block,
+                    planes,
+                    inverted_residual_setting,
+                    dilation=1):
+        features = list()
+        for t, c, n, s in inverted_residual_setting:
+            out_channels = int(c * self.width_mult)
+            stride = s if dilation == 1 else 1
+            features.append(
+                block(
+                    planes,
+                    out_channels,
+                    stride,
+                    t,
+                    dilation,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg))
+            planes = out_channels
+            for i in range(n - 1):
+                features.append(
+                    block(
+                        planes,
+                        out_channels,
+                        1,
+                        t,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg))
+                planes = out_channels
+        self.planes = planes
+        return nn.Sequential(*features)
+
+    def init_weights(self, pretrained=None):
+        if isinstance(pretrained, str):
+            logger = get_root_logger()
+            load_checkpoint(self, pretrained, strict=False, logger=logger)
+        else:
+            for m in self.modules():
+                if isinstance(m, nn.Conv2d):
+                    kaiming_init(m, mode='fan_out')
+                elif isinstance(m, _BatchNorm):
+                    constant_init(m, 1)
+                elif isinstance(m, nn.Linear):
+                    normal_init(m, 0, 0.01)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.block1(x)
+        c1 = self.block2(x)
+        c2 = self.block3(c1)
+        c3 = self.block4(c2)
+        c4 = self.block5(c3)
+
+        outs = [c1, c2, c3, c4]
+        outs = [outs[i] for i in self.out_indices]
+        return tuple(outs)
--- a/mmseg/models/decode_heads/__init__.py
+++ b/mmseg/models/decode_heads/__init__.py
@ -11,9 +11,10 @@ from .psa_head import PSAHead
 from .psp_head import PSPHead
 from .sep_aspp_head import DepthwiseSeparableASPPHead
 from .uper_head import UPerHead
+from .sep_fcn_head import SepFCNHead

 __all__ = [
    'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead',
    'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead',
-    'EncHead'
+    'EncHead', 'SepFCNHead'
 ]
--- a/mmseg/models/decode_heads/sep_fcn_head.py
+++ b/mmseg/models/decode_heads/sep_fcn_head.py
@ -0,0 +1,29 @@
+from mmseg.ops import DepthwiseSeparableConvModule
+from ..builder import HEADS
+from .fcn_head import FCNHead
+
+
+@HEADS.register_module()
+class SepFCNHead(FCNHead):
+
+    def __init__(self, **kwargs):
+        super(SepFCNHead, self).__init__(**kwargs)
+        self.convs[0] = DepthwiseSeparableConvModule(
+            self.in_channels,
+            self.channels,
+            norm_cfg=self.norm_cfg,
+            relu_first=False)
+        for i in range(1, self.num_convs):
+            self.convs[i] = DepthwiseSeparableConvModule(
+                self.channels,
+                self.channels,
+                norm_cfg=self.norm_cfg,
+                relu_first=False)
+
+        if self.concat_input:
+            self.conv_cat = DepthwiseSeparableConvModule(
+                self.in_channels + self.channels,
+                self.channels,
+                self.channels,
+                norm_cfg=self.norm_cfg,
+                relu_first=False)
--- a/mmseg/ops/separable_conv_module.py
+++ b/mmseg/ops/separable_conv_module.py
@ -1,88 +1,60 @@
-import torch.nn as nn
-from mmcv.cnn import ConvModule
+from mmcv.cnn import build_norm_layer
+from torch import nn


 class DepthwiseSeparableConvModule(nn.Module):
-    """Depthwise separable convolution module.
-
-    See https://arxiv.org/pdf/1704.04861.pdf for details.
-
-    This module can replace a ConvModule with the conv block replaced by two
-    conv block: depthwise conv block and pointwise conv block. The depthwise
-    conv block contains depthwise-conv/norm/activation layers. The pointwise
-    conv block contains pointwise-conv/norm/activation layers. It should be
-    noted that there will be norm/activation layer in the depthwise conv block
-    if `norm_cfg` and `act_cfg` are specified.
-
-    Args:
-        in_channels (int): Same as nn.Conv2d.
-        out_channels (int): Same as nn.Conv2d.
-        kernel_size (int or tuple[int]): Same as nn.Conv2d.
-        stride (int or tuple[int]): Same as nn.Conv2d. Default: 1.
-        padding (int or tuple[int]): Same as nn.Conv2d. Default: 0.
-        dilation (int or tuple[int]): Same as nn.Conv2d. Default: 1.
-        norm_cfg (dict): Default norm config for both depthwise ConvModule and
-            pointwise ConvModule. Default: None.
-        act_cfg (dict): Default activation config for both depthwise ConvModule
-            and pointwise ConvModule. Default: dict(type='ReLU').
-        dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is
-            'default', it will be the same as `norm_cfg`. Default: 'default'.
-        dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is
-            'default', it will be the same as `act_cfg`. Default: 'default'.
-        pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is
-            'default', it will be the same as `norm_cfg`. Default: 'default'.
-        pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is
-            'default', it will be the same as `act_cfg`. Default: 'default'.
-        kwargs (optional): Other shared arguments for depthwise and pointwise
-            ConvModule. See ConvModule for ref.
-    """

     def __init__(self,
                  in_channels,
                  out_channels,
-                 kernel_size,
+                 kernel_size=3,
                  stride=1,
-                 padding=0,
                  dilation=1,
-                 norm_cfg=None,
-                 act_cfg=dict(type='ReLU'),
-                 dw_norm_cfg='default',
-                 dw_act_cfg='default',
-                 pw_norm_cfg='default',
-                 pw_act_cfg='default',
-                 **kwargs):
+                 relu_first=True,  # True: one ReLU on the input only; False: ReLU after each norm (see forward)
+                 bias=False,  # used for both the depthwise and the pointwise conv
+                 norm_cfg=dict(type='BN')):
         super(DepthwiseSeparableConvModule, self).__init__()
-        assert 'groups' not in kwargs, 'groups should not be specified'
-
-        # if norm/activation config of depthwise/pointwise ConvModule is not
-        # specified, use default config.
-        dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg
-        dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg
-        pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg
-        pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg
-
-        # depthwise convolution
-        self.depthwise_conv = ConvModule(
+        self.depthwise = nn.Conv2d(  # depthwise conv: groups == in_channels, channel count unchanged
             in_channels,
             in_channels,
             kernel_size,
             stride=stride,
-            padding=padding,
+            padding=dilation,  # NOTE(review): preserves spatial size at stride 1 only when kernel_size == 3
             dilation=dilation,
             groups=in_channels,
-            norm_cfg=dw_norm_cfg,
-            act_cfg=dw_act_cfg,
-            **kwargs)
+            bias=bias)
+        self.norm_depth_name, norm_depth = build_norm_layer(
+            norm_cfg, in_channels, postfix='_depth')  # norm applied after the depthwise conv
+        self.add_module(self.norm_depth_name, norm_depth)  # registered under the name returned by build_norm_layer

-        self.pointwise_conv = ConvModule(
-            in_channels,
-            out_channels,
-            1,
-            norm_cfg=pw_norm_cfg,
-            act_cfg=pw_act_cfg,
-            **kwargs)
+        self.pointwise = nn.Conv2d(in_channels, out_channels, 1, bias=bias)  # 1x1 conv that changes the channel count
+        self.norm_point_name, norm_point = build_norm_layer(
+            norm_cfg, out_channels, postfix='_point')  # norm applied after the pointwise conv
+        self.add_module(self.norm_point_name, norm_point)
+
+        self.relu_first = relu_first
+        self.relu = nn.ReLU(inplace=not relu_first)  # not in-place when it is applied directly to the module input
+
+    @property
+    def norm_depth(self):
+        return getattr(self, self.norm_depth_name)  # look the layer up by its registered name
+
+    @property
+    def norm_point(self):
+        return getattr(self, self.norm_point_name)  # look the layer up by its registered name

     def forward(self, x):
-        x = self.depthwise_conv(x)
-        x = self.pointwise_conv(x)
-        return x
+        if self.relu_first:  # ReLU -> depthwise -> norm -> pointwise -> norm (no activation at the end)
+            out = self.relu(x)
+            out = self.depthwise(out)
+            out = self.norm_depth(out)
+            out = self.pointwise(out)
+            out = self.norm_point(out)
+        else:  # depthwise -> norm -> ReLU -> pointwise -> norm -> ReLU
+            out = self.depthwise(x)
+            out = self.norm_depth(out)
+            out = self.relu(out)
+            out = self.pointwise(out)
+            out = self.norm_point(out)
+            out = self.relu(out)
+        return out