mirror of https://github.com/alibaba/EasyCV.git
parent 2fe73eee91
commit 26cd12ab42
@ -0,0 +1,198 @@
_base_ = ['configs/base.py']

# Warning: batch_size must be >= 2

# model
norm_cfg = dict(type='BN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='STDCContextPathNet',
        backbone_cfg=dict(
            type='STDCNet',
            stdc_type='STDCNet1',
            in_channels=3,
            channels=(32, 64, 256, 512, 1024),
            bottleneck_type='cat',
            num_convs=4,
            norm_cfg=norm_cfg,
            act_cfg=dict(type='ReLU'),
            with_final_conv=False),
        last_in_channels=(1024, 512),
        out_channels=128,
        ffm_cfg=dict(in_channels=384, out_channels=256, scale_factor=4)),
    decode_head=dict(
        type='FCNHead',
        in_channels=256,
        channels=256,
        num_convs=1,
        num_classes=19,
        in_index=3,
        concat_input=False,
        dropout_ratio=0.1,
        norm_cfg=norm_cfg,
        align_corners=True,
        sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=[
        dict(
            type='FCNHead',
            in_channels=128,
            channels=64,
            num_convs=1,
            num_classes=19,
            in_index=2,
            norm_cfg=norm_cfg,
            concat_input=False,
            align_corners=False,
            sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
        dict(
            type='FCNHead',
            in_channels=128,
            channels=64,
            num_convs=1,
            num_classes=19,
            in_index=1,
            norm_cfg=norm_cfg,
            concat_input=False,
            align_corners=False,
            sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
        dict(
            type='STDCHead',
            in_channels=256,
            channels=64,
            num_convs=1,
            num_classes=2,
            boundary_threshold=0.1,
            in_index=0,
            norm_cfg=norm_cfg,
            concat_input=False,
            align_corners=True,
            loss_decode=[
                dict(
                    type='CrossEntropyLoss',
                    loss_name='loss_ce',
                    use_sigmoid=True,
                    loss_weight=1.0),
                dict(type='DiceLoss', loss_name='loss_dice', loss_weight=1.0)
            ]),
    ],
    train_cfg=dict(),
    test_cfg=dict(mode='whole'),
    pretrained=
    'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/stdc/pretrain/stdc1_easycv.pth'
)

# dataset
CLASSES = [
    'road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic light',
    'traffic sign', 'vegetation', 'terrain', 'sky', 'person', 'rider', 'car',
    'truck', 'bus', 'train', 'motorcycle', 'bicycle'
]
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 1024)

train_pipeline = [
    dict(type='MMResize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
    dict(type='SegRandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='MMRandomFlip', flip_ratio=0.5),
    dict(type='MMPhotoMetricDistortion'),
    dict(type='MMNormalize', **img_norm_cfg),
    dict(type='MMPad', size=crop_size),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_semantic_seg'],
        meta_keys=('filename', 'ori_filename', 'ori_shape', 'img_shape',
                   'pad_shape', 'scale_factor', 'flip', 'flip_direction',
                   'img_norm_cfg')),
]

test_pipeline = [
    dict(
        type='MMMultiScaleFlipAug',
        img_scale=(2048, 1024),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='MMResize', keep_ratio=True),
            dict(type='MMRandomFlip'),
            dict(type='MMNormalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=('filename', 'ori_filename', 'ori_shape',
                           'img_shape', 'pad_shape', 'scale_factor', 'flip',
                           'flip_direction', 'img_norm_cfg')),
        ])
]
dataset_type = 'SegDataset'
data_root = '../Cityscapes/'

train_img_root = data_root + 'leftImg8bit/train/'
train_label_root = data_root + 'gtFine/train/'

val_img_root = data_root + 'leftImg8bit/val/'
val_label_root = data_root + 'gtFine/val/'
data = dict(
    imgs_per_gpu=6,
    workers_per_gpu=4,
    persistent_workers=True,
    train=dict(
        type=dataset_type,
        ignore_index=255,
        data_source=dict(
            type='SegSourceCityscapes',
            img_root=train_img_root,
            label_root=train_label_root,
            classes=CLASSES),
        pipeline=train_pipeline),
    val=dict(
        imgs_per_gpu=1,
        ignore_index=255,
        type=dataset_type,
        data_source=dict(
            type='SegSourceCityscapes',
            img_root=val_img_root,
            label_root=val_label_root,
            classes=CLASSES),
        pipeline=test_pipeline))

# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    min_lr=1e-4,
    warmup='linear',
    warmup_iters=10,
    warmup_ratio=0.0001,
    warmup_by_epoch=True,
    by_epoch=False)

# runtime settings
total_epochs = 1290
checkpoint_config = dict(interval=10)
eval_config = dict(interval=10, gpu_collect=False)
eval_pipelines = [
    dict(
        mode='test',
        evaluators=[
            dict(
                type='SegmentationEvaluator',
                classes=CLASSES,
                metric_names=['mIoU'])
        ],
    )
]

# export config
export = dict(export_neck=True)
checkpoint_sync_export = True
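
For orientation, a minimal sketch (not part of the commit) of how a config like the one above is consumed. The config path and the use of `mmcv_config_fromfile` are assumptions; training itself is normally launched through EasyCV's standard train tool rather than by hand.

```python
# Sketch, assuming EasyCV is installed and the file above is saved as
# configs/segmentation/stdc/stdc1_cityscape_8xb6_e1290.py.
# mmcv_config_fromfile resolves the repo-relative `_base_` entry.
from easycv.models import build_model
from easycv.utils.config_tools import mmcv_config_fromfile

cfg = mmcv_config_fromfile(
    'configs/segmentation/stdc/stdc1_cityscape_8xb6_e1290.py')
model = build_model(cfg.model)  # the EncoderDecoder defined above
print(type(model).__name__)
```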
@ -0,0 +1,7 @@
_base_ = ['configs/segmentation/stdc/stdc1_cityscape_8xb6_e1290.py']

model = dict(
    backbone=dict(backbone_cfg=dict(stdc_type='STDCNet2')),
    pretrained=
    'http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/stdc/pretrain/stdc2_easycv.pth'
)
@ -15,6 +15,12 @@ Pretrained on **Pascal VOC 2012 + Aug**.
| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| upernet_r50 | [upernet_r50_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/upernet/upernet_r50_512x512_8xb4_60e_voc12aug.py) | 23M/66M | 5.5 | 282.9ms | 76.59 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/20220706_114712.log.json) |

+## STDC
+
+Trained on **Cityscapes**.
+
+| Algorithm | Config | Params<br/>(backbone/total) | Train memory<br/>(GB) | inference time(V100)<br/>(ms/img) | mIoU | Download |
+| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| STDC1 | [stdc1_cityscape_8xb6_e1290](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/stdc/stdc1_cityscape_8xb6_e1290.py) | 7.7M/8.5M | 4.5 | 11.9ms | 75.4 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/stdc/stdc1_cityscapes/epoch_1250.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/stdc/stdc1_cityscapes/20230214_173123.log.json) |

## Mask2former

### Instance Segmentation on COCO
@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
+from .cityscapes import SegSourceCityscapes
from .coco import SegSourceCoco, SegSourceCoco2017
from .coco_stuff import SegSourceCocoStuff10k, SegSourceCocoStuff164k
from .raw import SegSourceRaw
@ -7,5 +8,5 @@ from .voc import SegSourceVoc2007, SegSourceVoc2010, SegSourceVoc2012
__all__ = [
    'SegSourceRaw', 'SegSourceVoc2010', 'SegSourceVoc2007', 'SegSourceVoc2012',
    'SegSourceCoco', 'SegSourceCoco2017', 'SegSourceCocoStuff164k',
-    'SegSourceCocoStuff10k'
+    'SegSourceCocoStuff10k', 'SegSourceCityscapes'
]
@ -0,0 +1,129 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import copy
import logging
import os
import subprocess

import numpy as np

from easycv.datasets.registry import DATASOURCES
from easycv.file import io
from easycv.file.image import load_image as _load_img
from .raw import SegSourceRaw

try:
    import cityscapesscripts.helpers.labels as CSLabels
except ModuleNotFoundError as e:
    res = subprocess.call('pip install cityscapesscripts', shell=True)
    if res != 0:
        info_string = (
            '\n\nAuto install failed! Please install cityscapesscripts with the following command:\n'
            '\t`pip install cityscapesscripts`\n')
        raise ModuleNotFoundError(info_string)


def load_seg_map_cityscape(seg_path, reduce_zero_label):
    gt_semantic_seg = _load_img(seg_path, mode='P')
    gt_semantic_seg_copy = gt_semantic_seg.copy()
    # Remap the raw Cityscapes label ids to train ids (255 means ignore).
    for labels in CSLabels.labels:
        gt_semantic_seg_copy[gt_semantic_seg == labels.id] = labels.trainId

    return {'gt_semantic_seg': gt_semantic_seg_copy}
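
A short illustration (not part of the commit) of what the id-to-trainId remapping above does; it relies only on the `cityscapesscripts` package imported above.

```python
# Sketch: cityscapesscripts defines 30+ raw label ids, but only 19 classes
# get a train id in [0, 18]; the rest map to 255 (ignore) or -1.
import cityscapesscripts.helpers.labels as CSLabels

for label in CSLabels.labels:
    if label.trainId == 0:
        # 'road' has raw id 7 and trainId 0, which is why the CLASSES tuple
        # below starts with 'road'.
        print(label.name, label.id, '->', label.trainId)
```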
|
||||||
|
|
||||||
|
|
||||||
|
@DATASOURCES.register_module
|
||||||
|
class SegSourceCityscapes(SegSourceRaw):
|
||||||
|
"""Cityscapes datasource
|
||||||
|
"""
|
||||||
|
CLASSES = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole',
|
||||||
|
'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky',
|
||||||
|
'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
|
||||||
|
'bicycle')
|
||||||
|
|
||||||
|
PALETTE = [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156],
|
||||||
|
[190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0],
|
||||||
|
[107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60],
|
||||||
|
[255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100],
|
||||||
|
[0, 80, 100], [0, 0, 230], [119, 11, 32]]
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
img_suffix='_leftImg8bit.png',
|
||||||
|
label_suffix='_gtFine_labelIds.png',
|
||||||
|
**kwargs):
|
||||||
|
super(SegSourceCityscapes, self).__init__(
|
||||||
|
img_suffix=img_suffix, label_suffix=label_suffix, **kwargs)
|
||||||
|
|
||||||
|
def __getitem__(self, idx):
|
||||||
|
result_dict = self.samples_list[idx]
|
||||||
|
load_success = True
|
||||||
|
try:
|
||||||
|
# avoid data cache from taking up too much memory
|
||||||
|
if not self.cache_at_init and not self.cache_on_the_fly:
|
||||||
|
result_dict = copy.deepcopy(result_dict)
|
||||||
|
|
||||||
|
if not self.cache_at_init:
|
||||||
|
if result_dict.get('img', None) is None:
|
||||||
|
img = _load_img(result_dict['filename'], mode='BGR')
|
||||||
|
result = {
|
||||||
|
'img': img.astype(np.float32),
|
||||||
|
'img_shape': img.shape, # h, w, c
|
||||||
|
'ori_shape': img.shape,
|
||||||
|
}
|
||||||
|
result_dict.update(result)
|
||||||
|
if result_dict.get('gt_semantic_seg', None) is None:
|
||||||
|
result_dict.update(
|
||||||
|
load_seg_map_cityscape(
|
||||||
|
result_dict['seg_filename'],
|
||||||
|
reduce_zero_label=self.reduce_zero_label))
|
||||||
|
if self.cache_on_the_fly:
|
||||||
|
self.samples_list[idx] = result_dict
|
||||||
|
result_dict = self.post_process_fn(copy.deepcopy(result_dict))
|
||||||
|
self._retry_count = 0
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(e)
|
||||||
|
load_success = False
|
||||||
|
|
||||||
|
if not load_success:
|
||||||
|
logging.warning(
|
||||||
|
'Something wrong with current sample %s,Try load next sample...'
|
||||||
|
% result_dict.get('filename', ''))
|
||||||
|
self._retry_count += 1
|
||||||
|
if self._retry_count >= self._max_retry_num:
|
||||||
|
raise ValueError('All samples failed to load!')
|
||||||
|
|
||||||
|
result_dict = self[(idx + 1) % self.num_samples]
|
||||||
|
|
||||||
|
return result_dict
|
||||||
|
|
||||||
|
def get_source_iterator(self):
|
||||||
|
|
||||||
|
self.img_files = [
|
||||||
|
os.path.join(self.img_root, i)
|
||||||
|
for i in io.listdir(self.img_root, recursive=True)
|
||||||
|
if i.endswith(self.img_suffix[0])
|
||||||
|
]
|
||||||
|
|
||||||
|
self.label_files = []
|
||||||
|
for img_path in self.img_files:
|
||||||
|
self.img_root = os.path.join(self.img_root, '')
|
||||||
|
img_name = img_path.replace(self.img_root,
|
||||||
|
'')[:-len(self.img_suffix[0])]
|
||||||
|
find_label_path = False
|
||||||
|
for label_format in self.label_suffix:
|
||||||
|
lable_path = os.path.join(self.label_root,
|
||||||
|
img_name + label_format)
|
||||||
|
if io.exists(lable_path):
|
||||||
|
find_label_path = True
|
||||||
|
self.label_files.append(lable_path)
|
||||||
|
break
|
||||||
|
if not find_label_path:
|
||||||
|
logging.warning(
|
||||||
|
'Not find label file %s for img: %s, skip the sample!' %
|
||||||
|
(lable_path, img_path))
|
||||||
|
self.img_files.remove(img_path)
|
||||||
|
|
||||||
|
assert len(self.img_files) == len(self.label_files)
|
||||||
|
assert len(
|
||||||
|
self.img_files) > 0, 'No samples found in %s' % self.img_root
|
||||||
|
|
||||||
|
return list(zip(self.img_files, self.label_files))
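
For reference, a sketch (not part of the commit) of the directory layout this data source expects and how it is constructed; the `../Cityscapes` root mirrors the config earlier in this commit.

```python
# Sketch: SegSourceCityscapes expects the standard Cityscapes layout, e.g.
#   <root>/leftImg8bit/train/<city>/<name>_leftImg8bit.png
#   <root>/gtFine/train/<city>/<name>_gtFine_labelIds.png
from easycv.datasets.segmentation.data_sources.cityscapes import \
    SegSourceCityscapes

source = SegSourceCityscapes(
    img_root='../Cityscapes/leftImg8bit/train/',
    label_root='../Cityscapes/gtFine/train/',
    classes=SegSourceCityscapes.CLASSES)
sample = source[0]
print(sample['img'].shape, sample['gt_semantic_seg'].shape)
```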
@ -22,6 +22,7 @@ from .resnet import ResNet
from .resnet_jit import ResNetJIT
from .resnext import ResNeXt
from .shuffle_transformer import ShuffleTransformer
+from .stdc import STDCContextPathNet, STDCNet
from .swin_transformer import SwinTransformer
from .swin_transformer3d import SwinTransformer3D
from .vision_transformer import VisionTransformer
@ -0,0 +1,465 @@
# Modified from https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/backbones/stdc.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential

from easycv.models import builder
from ..registry import BACKBONES


class AttentionRefinementModule(BaseModule):
    """Attention Refinement Module (ARM) to refine the features of each stage.

    Args:
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
    Returns:
        x_out (torch.Tensor): Feature map of Attention Refinement Module.
    """

    def __init__(self,
                 in_channels,
                 out_channel,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):
        super(AttentionRefinementModule, self).__init__(init_cfg=init_cfg)
        self.conv_layer = ConvModule(
            in_channels=in_channels,
            out_channels=out_channel,
            kernel_size=3,
            stride=1,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.atten_conv_layer = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            ConvModule(
                in_channels=out_channel,
                out_channels=out_channel,
                kernel_size=1,
                bias=False,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=None), nn.Sigmoid())

    def forward(self, x):
        x = self.conv_layer(x)
        x_atten = self.atten_conv_layer(x)
        x_out = x * x_atten
        return x_out


class STDCModule(BaseModule):
    """STDCModule.

    Args:
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels before scaling.
        stride (int): The number of stride for the first conv layer.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (dict): The activation config for conv layers.
        num_convs (int): Numbers of conv layers.
        fusion_type (str): Type of fusion operation. Default: 'add'.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Default: None.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 norm_cfg=None,
                 act_cfg=None,
                 num_convs=4,
                 fusion_type='add',
                 init_cfg=None):
        super(STDCModule, self).__init__(init_cfg=init_cfg)
        assert num_convs > 1
        assert fusion_type in ['add', 'cat']
        self.stride = stride
        self.with_downsample = True if self.stride == 2 else False
        self.fusion_type = fusion_type

        self.layers = ModuleList()
        conv_0 = ConvModule(
            in_channels, out_channels // 2, kernel_size=1, norm_cfg=norm_cfg)

        if self.with_downsample:
            self.downsample = ConvModule(
                out_channels // 2,
                out_channels // 2,
                kernel_size=3,
                stride=2,
                padding=1,
                groups=out_channels // 2,
                norm_cfg=norm_cfg,
                act_cfg=None)

            if self.fusion_type == 'add':
                self.layers.append(nn.Sequential(conv_0, self.downsample))
                self.skip = Sequential(
                    ConvModule(
                        in_channels,
                        in_channels,
                        kernel_size=3,
                        stride=2,
                        padding=1,
                        groups=in_channels,
                        norm_cfg=norm_cfg,
                        act_cfg=None),
                    ConvModule(
                        in_channels,
                        out_channels,
                        1,
                        norm_cfg=norm_cfg,
                        act_cfg=None))
            else:
                self.layers.append(conv_0)
                self.skip = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)
        else:
            self.layers.append(conv_0)

        for i in range(1, num_convs):
            out_factor = 2**(i + 1) if i != num_convs - 1 else 2**i
            self.layers.append(
                ConvModule(
                    out_channels // 2**i,
                    out_channels // out_factor,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))

    def forward(self, inputs):
        if self.fusion_type == 'add':
            out = self.forward_add(inputs)
        else:
            out = self.forward_cat(inputs)
        return out

    def forward_add(self, inputs):
        layer_outputs = []
        x = inputs.clone()
        for layer in self.layers:
            x = layer(x)
            layer_outputs.append(x)
        if self.with_downsample:
            inputs = self.skip(inputs)

        return torch.cat(layer_outputs, dim=1) + inputs

    def forward_cat(self, inputs):
        x0 = self.layers[0](inputs)
        layer_outputs = [x0]
        for i, layer in enumerate(self.layers[1:]):
            if i == 0:
                if self.with_downsample:
                    x = layer(self.downsample(x0))
                else:
                    x = layer(x0)
            else:
                x = layer(x)
            layer_outputs.append(x)
        if self.with_downsample:
            layer_outputs[0] = self.skip(x0)
        return torch.cat(layer_outputs, dim=1)


class FeatureFusionModule(BaseModule):
    """Feature Fusion Module. This module is different from FeatureFusionModule
    in BiSeNetV1. It uses two ConvModules in `self.attention` whose inter
    channel number is calculated by the given `scale_factor`, while
    FeatureFusionModule in BiSeNetV1 only uses one ConvModule in
    `self.conv_atten`.

    Args:
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        scale_factor (int): The number of channel scale factor.
            Default: 4.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN').
        act_cfg (dict): The activation config for conv layers.
            Default: dict(type='ReLU').
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Default: None.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 scale_factor=4,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):
        super(FeatureFusionModule, self).__init__(init_cfg=init_cfg)
        channels = out_channels // scale_factor
        self.conv0 = ConvModule(
            in_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.attention = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            ConvModule(
                out_channels,
                channels,
                1,
                norm_cfg=None,
                bias=False,
                act_cfg=act_cfg),
            ConvModule(
                channels,
                out_channels,
                1,
                norm_cfg=None,
                bias=False,
                act_cfg=None), nn.Sigmoid())

    def forward(self, spatial_inputs, context_inputs):
        inputs = torch.cat([spatial_inputs, context_inputs], dim=1)
        x = self.conv0(inputs)
        attn = self.attention(x)
        x_attn = x * attn
        return x_attn + x


@BACKBONES.register_module()
class STDCNet(BaseModule):
    """This backbone is the implementation of `Rethinking BiSeNet For Real-time
    Semantic Segmentation <https://arxiv.org/abs/2104.13188>`_.

    Args:
        stdc_type (str): The type of backbone structure,
            'STDCNet1' and 'STDCNet2' denote the two main backbones in the
            paper, whose FLOPs are 813M and 1446M, respectively.
        in_channels (int): The number of input channels.
        channels (tuple[int]): The output channels for each stage.
        bottleneck_type (str): The type of STDC Module, the value must
            be 'add' or 'cat'.
        norm_cfg (dict): Config dict for normalization layer.
        act_cfg (dict): The activation config for conv layers.
        num_convs (int): Numbers of conv layers at each STDC Module.
            Default: 4.
        with_final_conv (bool): Whether to add a conv layer at the Module
            output. Default: True.
        pretrained (str, optional): Model pretrained path. Default: None.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Default: None.

    Example:
        >>> import torch
        >>> stdc_type = 'STDCNet1'
        >>> in_channels = 3
        >>> channels = (32, 64, 256, 512, 1024)
        >>> bottleneck_type = 'cat'
        >>> inputs = torch.rand(1, 3, 1024, 2048)
        >>> self = STDCNet(stdc_type, in_channels,
        ...                channels, bottleneck_type).eval()
        >>> outputs = self.forward(inputs)
        >>> for i in range(len(outputs)):
        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
        outputs[0].shape = torch.Size([1, 256, 128, 256])
        outputs[1].shape = torch.Size([1, 512, 64, 128])
        outputs[2].shape = torch.Size([1, 1024, 32, 64])
    """

    arch_settings = {
        'STDCNet1': [(2, 1), (2, 1), (2, 1)],
        'STDCNet2': [(2, 1, 1, 1), (2, 1, 1, 1, 1), (2, 1, 1)]
    }

    def __init__(self,
                 stdc_type,
                 in_channels,
                 channels,
                 bottleneck_type,
                 norm_cfg,
                 act_cfg,
                 num_convs=4,
                 with_final_conv=False,
                 pretrained=None,
                 init_cfg=None):
        super(STDCNet, self).__init__(init_cfg=init_cfg)
        assert stdc_type in self.arch_settings, \
            f'invalid structure {stdc_type} for STDCNet.'
        assert bottleneck_type in ['add', 'cat'],\
            f'bottleneck_type must be `add` or `cat`, got {bottleneck_type}'

        assert len(channels) == 5,\
            f'invalid channels length {len(channels)} for STDCNet.'

        self.in_channels = in_channels
        self.channels = channels
        self.stage_strides = self.arch_settings[stdc_type]
        self.pretrained = pretrained
        self.num_convs = num_convs
        self.with_final_conv = with_final_conv

        self.stages = ModuleList([
            ConvModule(
                self.in_channels,
                self.channels[0],
                kernel_size=3,
                stride=2,
                padding=1,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg),
            ConvModule(
                self.channels[0],
                self.channels[1],
                kernel_size=3,
                stride=2,
                padding=1,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg)
        ])
        # `self.num_shallow_features` is the number of shallow modules in
        # `STDCNet`, which are noted as `Stage1` and `Stage2` in the original
        # paper. Neither is used by the following modules (Attention
        # Refinement Module and Feature Fusion Module), so they are cut from
        # `outs`. Please refer to Figure 4 of the original paper for more
        # details.
        self.num_shallow_features = len(self.stages)

        for strides in self.stage_strides:
            idx = len(self.stages) - 1
            self.stages.append(
                self._make_stage(self.channels[idx], self.channels[idx + 1],
                                 strides, norm_cfg, act_cfg, bottleneck_type))
        # After appending, `self.stages` is a ModuleList including several
        # shallow modules and STDCModules.
        # (len(self.stages) ==
        # self.num_shallow_features + len(self.stage_strides))
        if self.with_final_conv:
            self.final_conv = ConvModule(
                self.channels[-1],
                max(1024, self.channels[-1]),
                1,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg)

    def _make_stage(self, in_channels, out_channels, strides, norm_cfg,
                    act_cfg, bottleneck_type):
        layers = []
        for i, stride in enumerate(strides):
            layers.append(
                STDCModule(
                    in_channels if i == 0 else out_channels,
                    out_channels,
                    stride,
                    norm_cfg,
                    act_cfg,
                    num_convs=self.num_convs,
                    fusion_type=bottleneck_type))
        return Sequential(*layers)

    def forward(self, x):
        outs = []
        for stage in self.stages:
            x = stage(x)
            outs.append(x)
        if self.with_final_conv:
            outs[-1] = self.final_conv(outs[-1])
        outs = outs[self.num_shallow_features:]
        return tuple(outs)


@BACKBONES.register_module()
class STDCContextPathNet(BaseModule):
    """STDCNet with Context Path. The `outs` below is a list of three feature
    maps from deep to shallow, whose heights and widths go from small to big,
    respectively. The biggest feature map of `outs` is output to the
    `STDCHead`, where the Detail Loss is calculated against the detail
    ground truth. The other two feature maps each go through an Attention
    Refinement Module. Besides, the biggest feature map of `outs` and the
    last output of the Attention Refinement Modules are concatenated by the
    Feature Fusion Module. This fused feature map `feat_fuse` is then output
    to the `decode_head`. For more details, please refer to Figure 4 of the
    original paper.

    Args:
        backbone_cfg (dict): Config dict for stdc backbone.
        last_in_channels (tuple[int]): The number of channels of the last
            two feature maps from the stdc backbone. Default: (1024, 512).
        out_channels (int): The channels of output feature maps.
            Default: 128.
        ffm_cfg (dict): Config dict for Feature Fusion Module. Default:
            `dict(in_channels=512, out_channels=256, scale_factor=4)`.
        upsample_mode (str): Algorithm used for upsampling:
            ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` |
            ``'trilinear'``. Default: ``'nearest'``.
        align_corners (bool, optional): align_corners argument of
            F.interpolate. It must be `None` if upsample_mode is
            ``'nearest'``. Default: None.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN').
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Default: None.

    Returns:
        outputs (tuple): The tuple of list of output feature map for
            auxiliary heads and decoder head.
    """

    def __init__(self,
                 backbone_cfg,
                 last_in_channels=(1024, 512),
                 out_channels=128,
                 ffm_cfg=dict(
                     in_channels=512, out_channels=256, scale_factor=4),
                 upsample_mode='nearest',
                 align_corners=None,
                 norm_cfg=dict(type='BN'),
                 init_cfg=None):
        super(STDCContextPathNet, self).__init__(init_cfg=init_cfg)
        self.backbone = builder.build_backbone(backbone_cfg)
        self.arms = ModuleList()
        self.convs = ModuleList()
        for channels in last_in_channels:
            self.arms.append(AttentionRefinementModule(channels, out_channels))
            self.convs.append(
                ConvModule(
                    out_channels,
                    out_channels,
                    3,
                    padding=1,
                    norm_cfg=norm_cfg))
        self.conv_avg = ConvModule(
            last_in_channels[0], out_channels, 1, norm_cfg=norm_cfg)

        self.ffm = FeatureFusionModule(**ffm_cfg)

        self.upsample_mode = upsample_mode
        self.align_corners = align_corners

    def forward(self, x):
        outs = list(self.backbone(x))
        avg = F.adaptive_avg_pool2d(outs[-1], 1)
        avg_feat = self.conv_avg(avg)

        feature_up = F.interpolate(
            avg_feat,
            size=outs[-1].shape[2:],
            mode=self.upsample_mode,
            align_corners=self.align_corners)
        arms_out = []
        for i in range(len(self.arms)):
            x_arm = self.arms[i](outs[len(outs) - 1 - i]) + feature_up
            feature_up = F.interpolate(
                x_arm,
                size=outs[len(outs) - 1 - i - 1].shape[2:],
                mode=self.upsample_mode,
                align_corners=self.align_corners)
            feature_up = self.convs[i](feature_up)
            arms_out.append(feature_up)

        feat_fuse = self.ffm(outs[0], arms_out[1])

        # `outputs` has four feature maps.
        # `outs[0]` is output for the `STDCHead` auxiliary head.
        # The two feature maps of `arms_out` are output for auxiliary heads.
        # `feat_fuse` is output for the decoder head.
        outputs = [outs[0]] + list(arms_out) + [feat_fuse]
        return tuple(outputs)
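
As a sanity check, a sketch (not part of the commit) of the four maps `forward` returns for the STDCNet1 settings used in the config at the top of this commit; the printed shapes follow from the stage strides above.

```python
# Sketch: shapes of the four returned maps for a 512x1024 input.
import torch

from easycv.models import builder

net = builder.build_backbone(
    dict(
        type='STDCContextPathNet',
        backbone_cfg=dict(
            type='STDCNet',
            stdc_type='STDCNet1',
            in_channels=3,
            channels=(32, 64, 256, 512, 1024),
            bottleneck_type='cat',
            num_convs=4,
            norm_cfg=dict(type='BN'),
            act_cfg=dict(type='ReLU'),
            with_final_conv=False),
        last_in_channels=(1024, 512),
        out_channels=128,
        ffm_cfg=dict(in_channels=384, out_channels=256,
                     scale_factor=4))).eval()
with torch.no_grad():
    outs = net(torch.rand(1, 3, 512, 1024))
for o in outs:
    print(o.shape)
# Expected:
#   (1, 256, 64, 128)  outs[0]      -> STDCHead (in_index=0)
#   (1, 128, 32, 64)   arms_out[0]  -> aux FCNHead (in_index=1)
#   (1, 128, 64, 128)  arms_out[1]  -> aux FCNHead (in_index=2)
#   (1, 256, 64, 128)  feat_fuse    -> decode FCNHead (in_index=3)
```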
@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .cross_entropy_loss import CrossEntropyLoss
from .det_db_loss import DBLoss
+from .dice_loss import DiceLoss
from .face_keypoint_loss import FacePoseLoss, WingLossWithPose
from .focal_loss import FocalLoss, VarifocalLoss
from .iou_loss import GIoULoss, IoULoss, YOLOX_IOULoss
@ -22,5 +23,6 @@ __all__ = [
    'FocalLoss2d', 'DistributeMSELoss', 'CrossEntropyLossWithLabelSmooth',
    'AMSoftmaxLoss', 'ModelParallelSoftmaxLoss', 'ModelParallelAMSoftmaxLoss',
    'SoftTargetCrossEntropy', 'CDNCriterion', 'DNCriterion', 'DBLoss',
-    'HungarianMatcher', 'SetCriterion', 'L1Loss', 'MultiLoss', 'SmoothL1Loss'
+    'HungarianMatcher', 'SetCriterion', 'L1Loss', 'MultiLoss', 'SmoothL1Loss',
+    'DiceLoss'
]
@ -0,0 +1,135 @@
# Borrowed from https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/losses/dice_loss.py
import torch
import torch.nn as nn
import torch.nn.functional as F

from easycv.models.builder import LOSSES
from .utils import get_class_weight, weighted_loss


@weighted_loss
def dice_loss(pred,
              target,
              valid_mask,
              smooth=1,
              exponent=2,
              class_weight=None,
              ignore_index=255):
    assert pred.shape[0] == target.shape[0]
    total_loss = 0
    num_classes = pred.shape[1]
    for i in range(num_classes):
        if i != ignore_index:
            dice_loss = binary_dice_loss(
                pred[:, i],
                target[..., i],
                valid_mask=valid_mask,
                smooth=smooth,
                exponent=exponent)
            if class_weight is not None:
                dice_loss *= class_weight[i]
            total_loss += dice_loss
    return total_loss / num_classes


@weighted_loss
def binary_dice_loss(pred, target, valid_mask, smooth=1, exponent=2, **kwargs):
    assert pred.shape[0] == target.shape[0]
    pred = pred.reshape(pred.shape[0], -1)
    target = target.reshape(target.shape[0], -1)
    valid_mask = valid_mask.reshape(valid_mask.shape[0], -1)

    num = torch.sum(torch.mul(pred, target) * valid_mask, dim=1) * 2 + smooth
    den = torch.sum(pred.pow(exponent) + target.pow(exponent), dim=1) + smooth

    return 1 - num / den
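
To make the formula concrete, a worked sketch (not part of the commit) computing the same quantity as `binary_dice_loss` by hand for one flattened sample with `smooth=1` and `exponent=2`:

```python
# Worked sketch of the soft Dice term above.
import torch

pred = torch.tensor([[0.9, 0.8, 0.1, 0.2]])  # probabilities, shape (N, H*W)
target = torch.tensor([[1., 1., 0., 0.]])    # binary ground truth
valid_mask = torch.ones_like(target)

num = torch.sum(pred * target * valid_mask, dim=1) * 2 + 1  # 2*1.7 + 1 = 4.4
den = torch.sum(pred.pow(2) + target.pow(2), dim=1) + 1     # 1.5 + 2 + 1 = 4.5
print(1 - num / den)  # ~0.0222: near-perfect prediction, near-zero loss
```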

@LOSSES.register_module()
class DiceLoss(nn.Module):
    """DiceLoss.

    This loss is proposed in `V-Net: Fully Convolutional Neural Networks for
    Volumetric Medical Image Segmentation <https://arxiv.org/abs/1606.04797>`_.

    Args:
        smooth (float): A float number to smooth the loss and avoid NaN
            errors. Default: 1.
        exponent (float): A float number to calculate the denominator
            value: \\sum{x^exponent} + \\sum{y^exponent}. Default: 2.
        reduction (str, optional): The method used to reduce the loss. Options
            are "none", "mean" and "sum". This parameter only works when
            per_image is True. Default: 'mean'.
        class_weight (list[float] | str, optional): Weight of each class. If in
            str format, read them from a file. Defaults to None.
        loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
        ignore_index (int | None): The label index to be ignored. Default: 255.
        loss_name (str, optional): Name of the loss item. If you want this loss
            item to be included into the backward graph, `loss_` must be the
            prefix of the name. Defaults to 'loss_dice'.
    """

    def __init__(self,
                 smooth=1,
                 exponent=2,
                 reduction='mean',
                 class_weight=None,
                 loss_weight=1.0,
                 ignore_index=255,
                 loss_name='loss_dice',
                 **kwargs):
        super(DiceLoss, self).__init__()
        self.smooth = smooth
        self.exponent = exponent
        self.reduction = reduction
        self.class_weight = get_class_weight(class_weight)
        self.loss_weight = loss_weight
        self.ignore_index = ignore_index
        self._loss_name = loss_name

    def forward(self,
                pred,
                target,
                avg_factor=None,
                reduction_override=None,
                **kwargs):
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        if self.class_weight is not None:
            class_weight = pred.new_tensor(self.class_weight)
        else:
            class_weight = None

        pred = F.softmax(pred, dim=1)
        num_classes = pred.shape[1]
        one_hot_target = F.one_hot(
            torch.clamp(target.long(), 0, num_classes - 1),
            num_classes=num_classes)
        valid_mask = (target != self.ignore_index).long()

        loss = self.loss_weight * dice_loss(
            pred,
            one_hot_target,
            valid_mask=valid_mask,
            reduction=reduction,
            avg_factor=avg_factor,
            smooth=self.smooth,
            exponent=self.exponent,
            class_weight=class_weight,
            ignore_index=self.ignore_index)
        return loss

    @property
    def loss_name(self):
        """Loss Name.

        This function must be implemented and will return the name of this
        loss function. This name will be used to combine different loss items
        by simple sum operation. In addition, if you want this loss item to be
        included into the backward graph, `loss_` must be the prefix of the
        name.
        Returns:
            str: The name of this loss item.
        """
        return self._loss_name
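
A usage sketch (not part of the commit): building the registered loss from a config dict, the same way the `loss_decode` entries in the STDC config consume it.

```python
# Sketch: using the registered DiceLoss standalone.
import torch

from easycv.models.builder import build_loss

criterion = build_loss(dict(type='DiceLoss', loss_weight=1.0))
logits = torch.randn(2, 19, 8, 8)           # (N, C, H, W) raw scores
labels = torch.randint(0, 19, (2, 8, 8))    # (N, H, W) class indices
print(criterion(logits, labels))            # scalar loss tensor
```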
@ -1,12 +1,32 @@
# Copyright (c) OpenMMLab. All rights reserved.
import functools
+
+import mmcv
+import numpy as np
import torch
import torch.nn.functional as F
+
from easycv.framework.errors import ValueError
+
+
+def get_class_weight(class_weight):
+    """Get class weight for loss function.
+
+    Args:
+        class_weight (list[float] | str | None): If class_weight is a str,
+            take it as a file name and read from it.
+    """
+    if isinstance(class_weight, str):
+        # take it as a file path
+        if class_weight.endswith('.npy'):
+            class_weight = np.load(class_weight)
+        else:
+            # pkl, json or yaml
+            class_weight = mmcv.load(class_weight)
+
+    return class_weight
+
+
def reduce_loss(loss, reduction):
    """Reduce loss as specified.
@ -2,3 +2,4 @@
from .encoder_decoder import EncoderDecoder
from .heads import *
from .mask2former import Mask2Former
+from .sampler import BasePixelSampler, OHEMPixelSampler, build_pixel_sampler
@ -157,7 +157,6 @@ class EncoderDecoder(BaseModel):
        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
-
        x = self.extract_feat(img)
        losses = dict()
        loss_decode = self._decode_head_forward_train(x, img_metas,
@ -2,6 +2,9 @@
from .fcn_head import FCNHead
from .mask2former_head import Mask2FormerHead
from .segformer_head import SegformerHead
+from .stdc_head import STDCHead
from .uper_head import UPerHead

-__all__ = ['FCNHead', 'UPerHead', 'Mask2FormerHead', 'SegformerHead']
+__all__ = [
+    'FCNHead', 'UPerHead', 'Mask2FormerHead', 'SegformerHead', 'STDCHead'
+]
@ -11,6 +11,7 @@ from easycv.framework.errors import TypeError
from easycv.models.builder import build_loss
from easycv.models.utils.ops import resize_tensor
from easycv.utils.logger import print_log
+from ..sampler import build_pixel_sampler


# Modified from https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/decode_head.py
@ -69,6 +70,7 @@ class BaseDecodeHead(nn.Module, metaclass=ABCMeta):
                     type='CrossEntropyLoss',
                     use_sigmoid=False,
                     loss_weight=1.0),
+                 sampler=None,
                 ignore_index=255,
                 align_corners=False,
                 init_cfg=dict(
@ -97,6 +99,11 @@ class BaseDecodeHead(nn.Module, metaclass=ABCMeta):
            raise TypeError(f'loss_decode must be a dict or sequence of dict,\
                but got {type(loss_decode)}')

+        if sampler is not None:
+            self.sampler = build_pixel_sampler(sampler, context=self)
+        else:
+            self.sampler = None
+
        self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
        if dropout_ratio > 0:
            self.dropout = nn.Dropout2d(dropout_ratio)
@ -232,7 +239,10 @@ class BaseDecodeHead(nn.Module, metaclass=ABCMeta):
            size=seg_label.shape[2:],
            mode='bilinear',
            align_corners=self.align_corners)
+        if self.sampler is not None:
+            seg_weight = self.sampler.sample(seg_logit, seg_label)
+        else:
+            seg_weight = None
        seg_label = seg_label.squeeze(1)

        if not isinstance(self.loss_decode, nn.ModuleList):
@ -0,0 +1,85 @@
# Modified from https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/stdc_head.py
import torch
import torch.nn.functional as F

from easycv.models.builder import HEADS
from .fcn_head import FCNHead


@HEADS.register_module()
class STDCHead(FCNHead):
    """This head is the implementation of `Rethinking BiSeNet For Real-time
    Semantic Segmentation <https://arxiv.org/abs/2104.13188>`_.

    Args:
        boundary_threshold (float): The threshold for calculating the
            boundary. Default: 0.1.
    """

    def __init__(self, boundary_threshold=0.1, **kwargs):
        super(STDCHead, self).__init__(**kwargs)
        self.boundary_threshold = boundary_threshold
        # Use register_buffer to keep the laplacian kernel on the same
        # device as `seg_label`.
        self.register_buffer(
            'laplacian_kernel',
            torch.tensor([-1, -1, -1, -1, 8, -1, -1, -1, -1],
                         dtype=torch.float32,
                         requires_grad=False).reshape((1, 1, 3, 3)))
        self.fusion_kernel = torch.nn.Parameter(
            torch.tensor([[6. / 10], [3. / 10], [1. / 10]],
                         dtype=torch.float32).reshape(1, 3, 1, 1),
            requires_grad=False)

    def losses(self, seg_logit, seg_label):
        """Compute Detail Aggregation Loss."""
        # Note: The paper claims `fusion_kernel` is a trainable 1x1 conv
        # parameter. However, it is a constant in the original repo and in
        # other codebases, because it would not be added to the computation
        # graph after the threshold operation.
        seg_label = seg_label.to(self.laplacian_kernel)
        boundary_targets = F.conv2d(
            seg_label, self.laplacian_kernel, padding=1)
        boundary_targets = boundary_targets.clamp(min=0)
        boundary_targets[boundary_targets > self.boundary_threshold] = 1
        boundary_targets[boundary_targets <= self.boundary_threshold] = 0

        boundary_targets_x2 = F.conv2d(
            seg_label, self.laplacian_kernel, stride=2, padding=1)
        boundary_targets_x2 = boundary_targets_x2.clamp(min=0)

        boundary_targets_x4 = F.conv2d(
            seg_label, self.laplacian_kernel, stride=4, padding=1)
        boundary_targets_x4 = boundary_targets_x4.clamp(min=0)

        boundary_targets_x4_up = F.interpolate(
            boundary_targets_x4, boundary_targets.shape[2:], mode='nearest')
        boundary_targets_x2_up = F.interpolate(
            boundary_targets_x2, boundary_targets.shape[2:], mode='nearest')

        boundary_targets_x2_up[
            boundary_targets_x2_up > self.boundary_threshold] = 1
        boundary_targets_x2_up[
            boundary_targets_x2_up <= self.boundary_threshold] = 0

        boundary_targets_x4_up[
            boundary_targets_x4_up > self.boundary_threshold] = 1
        boundary_targets_x4_up[
            boundary_targets_x4_up <= self.boundary_threshold] = 0

        boundary_targets_pyramids = torch.stack(
            (boundary_targets, boundary_targets_x2_up, boundary_targets_x4_up),
            dim=1)

        boundary_targets_pyramids = boundary_targets_pyramids.squeeze(2)
        boundary_targets_pyramid = F.conv2d(boundary_targets_pyramids,
                                            self.fusion_kernel)

        boundary_targets_pyramid[
            boundary_targets_pyramid > self.boundary_threshold] = 1
        boundary_targets_pyramid[
            boundary_targets_pyramid <= self.boundary_threshold] = 0

        loss = super(STDCHead, self).losses(seg_logit,
                                            boundary_targets_pyramid.long())
        return loss
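
A toy sketch (not part of the commit) of the boundary extraction that `losses` performs with the Laplacian kernel, on a 6x6 label map:

```python
# Toy sketch: boundary targets from the Laplacian kernel.
import torch
import torch.nn.functional as F

laplacian = torch.tensor([-1, -1, -1, -1, 8, -1, -1, -1, -1],
                         dtype=torch.float32).reshape(1, 1, 3, 3)
label = torch.zeros(1, 1, 6, 6)
label[:, :, 1:5, 1:5] = 1.  # a 4x4 foreground square

edge = F.conv2d(label, laplacian, padding=1).clamp(min=0)
print((edge > 0.1).float().squeeze())
# 1s form the one-pixel ring of the square; its interior and the background
# stay 0, i.e. exactly the binary boundary map the STDCHead trains against.
```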
@ -0,0 +1,5 @@
from .base_pixel_sampler import BasePixelSampler
from .builder import build_pixel_sampler
from .ohem_pixel_sampler import OHEMPixelSampler

__all__ = ['BasePixelSampler', 'OHEMPixelSampler', 'build_pixel_sampler']
@ -0,0 +1,13 @@
# Borrowed from https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg
from abc import ABCMeta, abstractmethod


class BasePixelSampler(metaclass=ABCMeta):
    """Base class of pixel sampler."""

    def __init__(self, **kwargs):
        pass

    @abstractmethod
    def sample(self, seg_logit, seg_label):
        """Placeholder for sample function."""
@ -0,0 +1,9 @@
# Borrowed from https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg
from mmcv.utils import Registry, build_from_cfg

PIXEL_SAMPLERS = Registry('pixel sampler')


def build_pixel_sampler(cfg, **default_args):
    """Build pixel sampler for segmentation map."""
    return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args)
@ -0,0 +1,85 @@
# Borrowed from https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg
import torch
import torch.nn as nn
import torch.nn.functional as F

from .base_pixel_sampler import BasePixelSampler
from .builder import PIXEL_SAMPLERS


@PIXEL_SAMPLERS.register_module()
class OHEMPixelSampler(BasePixelSampler):
    """Online Hard Example Mining Sampler for segmentation.

    Args:
        context (nn.Module): The context of sampler, subclass of
            :obj:`BaseDecodeHead`.
        thresh (float, optional): The threshold for hard example selection;
            predictions below it are treated as low-confidence. If not
            specified, the hard examples are the pixels with the top
            ``min_kept`` losses. Default: None.
        min_kept (int, optional): The minimum number of predictions to keep.
            Default: 100000.
    """

    def __init__(self, context, thresh=None, min_kept=100000):
        super(OHEMPixelSampler, self).__init__()
        self.context = context
        assert min_kept > 1
        self.thresh = thresh
        self.min_kept = min_kept

    def sample(self, seg_logit, seg_label):
        """Sample pixels that have high loss or with low prediction confidence.

        Args:
            seg_logit (torch.Tensor): segmentation logits, shape (N, C, H, W)
            seg_label (torch.Tensor): segmentation label, shape (N, 1, H, W)

        Returns:
            torch.Tensor: segmentation weight, shape (N, H, W)
        """
        with torch.no_grad():
            assert seg_logit.shape[2:] == seg_label.shape[2:]
            assert seg_label.shape[1] == 1
            seg_label = seg_label.squeeze(1).long()
            batch_kept = self.min_kept * seg_label.size(0)
            valid_mask = seg_label != self.context.ignore_index
            seg_weight = seg_logit.new_zeros(size=seg_label.size())
            valid_seg_weight = seg_weight[valid_mask]
            if self.thresh is not None:
                seg_prob = F.softmax(seg_logit, dim=1)

                tmp_seg_label = seg_label.clone().unsqueeze(1)
                tmp_seg_label[tmp_seg_label == self.context.ignore_index] = 0
                seg_prob = seg_prob.gather(1, tmp_seg_label).squeeze(1)
                sort_prob, sort_indices = seg_prob[valid_mask].sort()

                if sort_prob.numel() > 0:
                    min_threshold = sort_prob[min(batch_kept,
                                                  sort_prob.numel() - 1)]
                else:
                    min_threshold = 0.0
                threshold = max(min_threshold, self.thresh)
                valid_seg_weight[seg_prob[valid_mask] < threshold] = 1.
            else:
                if not isinstance(self.context.loss_decode, nn.ModuleList):
                    losses_decode = [self.context.loss_decode]
                else:
                    losses_decode = self.context.loss_decode
                losses = 0.0
                for loss_module in losses_decode:
                    losses += loss_module(
                        seg_logit,
                        seg_label,
                        weight=None,
                        ignore_index=self.context.ignore_index,
                        reduction_override='none')

                # faster than topk according to https://github.com/pytorch/pytorch/issues/22812  # noqa
                _, sort_indices = losses[valid_mask].sort(descending=True)
                valid_seg_weight[sort_indices[:batch_kept]] = 1.

            seg_weight[valid_mask] = valid_seg_weight

            return seg_weight
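
A usage sketch (not part of the commit): building the sampler through `build_pixel_sampler` with a minimal stand-in context; only `ignore_index` is consulted when `thresh` is given.

```python
# Sketch: standalone OHEM sampling with a hypothetical stand-in context.
import torch

from easycv.models.segmentation.sampler import build_pixel_sampler


class _Ctx:
    ignore_index = 255  # the only attribute used when `thresh` is set


sampler = build_pixel_sampler(
    dict(type='OHEMPixelSampler', thresh=0.7, min_kept=10), context=_Ctx())
seg_logit = torch.randn(2, 19, 16, 16)
seg_label = torch.randint(0, 19, (2, 1, 16, 16))
weight = sampler.sample(seg_logit, seg_label)
print(weight.shape)  # (2, 16, 16), 1.0 on low-confidence pixels, else 0.0
```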
@ -29,7 +29,7 @@ class SegmentationPredictor(PredictorV2):

    def __init__(self,
                 model_path,
-                 config_file,
+                 config_file=None,
                 batch_size=1,
                 device=None,
                 save_results=False,
@ -54,7 +54,7 @@ class SegmentationPredictor(PredictorV2):
            **kwargs)

        self.CLASSES = self.cfg.CLASSES
-        self.PALETTE = self.cfg.PALETTE
+        self.PALETTE = self.cfg.get('PALETTE', None)

    def show_result(self,
                    img,
@ -90,7 +90,8 @@ class SegmentationPredictor(PredictorV2):

        img = mmcv.imread(img)
        img = img.copy()
-        seg = result[0]
+        # seg = result[0]
+        seg = result
        if palette is None:
            if self.PALETTE is None:
                # Get random state before set seed,
@ -1,4 +1,5 @@
albumentations
+cityscapesscripts
dataclasses
decord
einops
@ -0,0 +1,38 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import random
import unittest

from tests.ut_config import SEG_DATA_SAMLL_CITYSCAPES

from easycv.datasets.segmentation.data_sources.cityscapes import \
    SegSourceCityscapes

CLASSES = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole',
           'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky',
           'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
           'bicycle')


class SegSourceCityscapesTest(unittest.TestCase):

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))

    def test_cityscapes(self):

        data_source = SegSourceCityscapes(
            img_root=os.path.join(SEG_DATA_SAMLL_CITYSCAPES, 'leftImg8bit'),
            label_root=os.path.join(SEG_DATA_SAMLL_CITYSCAPES, 'gtFine'),
            classes=CLASSES,
        )

        index_list = random.choices(list(range(20)), k=3)
        for idx in index_list:
            data = data_source[idx]
            self.assertIn('img_fields', data)
            self.assertIn('seg_fields', data)


if __name__ == '__main__':
    unittest.main()
@ -0,0 +1,126 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest

import torch

from easycv.models import build_model


class StdcTest(unittest.TestCase):

    def setUp(self):
        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))

    def test_stdc(self):

        norm_cfg = dict(type='BN', requires_grad=True)
        model_cfg = dict(
            type='EncoderDecoder',
            backbone=dict(
                type='STDCContextPathNet',
                backbone_cfg=dict(
                    type='STDCNet',
                    stdc_type='STDCNet1',
                    in_channels=3,
                    channels=(32, 64, 256, 512, 1024),
                    bottleneck_type='cat',
                    num_convs=4,
                    norm_cfg=norm_cfg,
                    act_cfg=dict(type='ReLU'),
                    with_final_conv=False),
                last_in_channels=(1024, 512),
                out_channels=128,
                ffm_cfg=dict(
                    in_channels=384, out_channels=256, scale_factor=4)),
            decode_head=dict(
                type='FCNHead',
                in_channels=256,
                channels=256,
                num_convs=1,
                num_classes=19,
                in_index=3,
                concat_input=False,
                dropout_ratio=0.1,
                norm_cfg=norm_cfg,
                align_corners=True,
                sampler=dict(
                    type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
                loss_decode=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0)),
            auxiliary_head=[
                dict(
                    type='FCNHead',
                    in_channels=128,
                    channels=64,
                    num_convs=1,
                    num_classes=19,
                    in_index=2,
                    norm_cfg=norm_cfg,
                    concat_input=False,
                    align_corners=False,
                    sampler=dict(
                        type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
                    loss_decode=dict(
                        type='CrossEntropyLoss',
                        use_sigmoid=False,
                        loss_weight=1.0)),
                dict(
                    type='FCNHead',
                    in_channels=128,
                    channels=64,
                    num_convs=1,
                    num_classes=19,
                    in_index=1,
                    norm_cfg=norm_cfg,
                    concat_input=False,
                    align_corners=False,
                    sampler=dict(
                        type='OHEMPixelSampler', thresh=0.7, min_kept=10000),
                    loss_decode=dict(
                        type='CrossEntropyLoss',
                        use_sigmoid=False,
                        loss_weight=1.0)),
                dict(
                    type='STDCHead',
                    in_channels=256,
                    channels=64,
                    num_convs=1,
                    num_classes=2,
                    boundary_threshold=0.1,
                    in_index=0,
                    norm_cfg=norm_cfg,
                    concat_input=False,
                    align_corners=True,
                    loss_decode=[
                        dict(
                            type='CrossEntropyLoss',
                            loss_name='loss_ce',
                            use_sigmoid=True,
                            loss_weight=1.0),
                        dict(
                            type='DiceLoss',
                            loss_name='loss_dice',
                            loss_weight=1.0)
                    ]),
            ],
            train_cfg=dict(),
            test_cfg=dict(mode='whole'),
        )

        model = build_model(model_cfg).to('cuda')

        img = torch.rand(2, 3, 512, 1024).to('cuda')
        gt_semantic_seg = torch.randint(
            low=0, high=18, size=(2, 1, 512, 1024)).to('cuda')

        train_output = model.forward_train(img, [], gt_semantic_seg)
        self.assertIn('decode.loss_ce', train_output)
        self.assertIn('aux_0.loss_ce', train_output)
        self.assertIn('aux_1.loss_ce', train_output)
        self.assertIn('aux_2.loss_ce', train_output)


if __name__ == '__main__':
    unittest.main()
@ -159,6 +159,8 @@ SEG_DATA_SMALL_COCO_STUFF_10K = os.path.join(
    BASE_LOCAL_PATH, 'data/segmentation/small_coco_stuff/small_coco_stuff10k')
SEG_DATA_SAMLL_COCO_STUFF_164K = os.path.join(
    BASE_LOCAL_PATH, 'data/segmentation/small_coco_stuff/small_coco_stuff164k')
+SEG_DATA_SAMLL_CITYSCAPES = os.path.join(BASE_LOCAL_PATH,
+                                         'data/segmentation/small_cityscapes')

# OCR data
SMALL_OCR_CLS_DATA = os.path.join(BASE_LOCAL_PATH, 'data/ocr/small_ocr_cls')