[Feature] Support ICNet (#884)

* add icnet backbone

* add icnet head

* add icnet configs

* nclass -> num_classes

* Support ICNet

* ICNet

* ICNet

* Add ICNeck

* Add ICNeck

* Add ICNeck

* Add ICNeck

* Adding unittest

* Uploading models & logs

* Uploading models & logs

* add comment

* smaller test_swin.py

* try to delete test_swin.py

* delete test_unet.py

* delete test_unet.py

* temp

* smaller test_unet.py

Co-authored-by: Junjun2016 <hejunjun@sjtu.edu.cn>
MengzhangLI 2021-10-01 00:31:57 +08:00 committed by GitHub
parent 84edf6c190
commit 7db1cbb181
30 changed files with 953 additions and 112 deletions

View File

@ -79,6 +79,7 @@ Supported methods:
- [x] [PSANet (ECCV'2018)](configs/psanet)
- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus)
- [x] [UPerNet (ECCV'2018)](configs/upernet)
- [x] [ICNet (ECCV'2018)](configs/icnet)
- [x] [NonLocal Net (CVPR'2018)](configs/nonlocal_net)
- [x] [EncNet (CVPR'2018)](configs/encnet)
- [x] [Semantic FPN (CVPR'2019)](configs/sem_fpn)

View File

@ -78,6 +78,7 @@ MMSegmentation is an open source semantic segmentation toolbox based on PyTorch. It is a part of the O
- [x] [PSANet (ECCV'2018)](configs/psanet)
- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus)
- [x] [UPerNet (ECCV'2018)](configs/upernet)
- [x] [ICNet (ECCV'2018)](configs/icnet)
- [x] [NonLocal Net (CVPR'2018)](configs/nonlocal_net)
- [x] [EncNet (CVPR'2018)](configs/encnet)
- [x] [Semantic FPN (CVPR'2019)](configs/sem_fpn)

View File

@ -0,0 +1,35 @@
_base_ = './cityscapes.py'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (832, 832)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(2048, 1024),
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
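As a quick sanity check (a sketch, not part of this diff), the merged dataset config can be loaded with mmcv to confirm that the 832x832 crop overrides the base Cityscapes pipeline; the repo-relative path below assumes the layout used in this PR:

```python
# Minimal sketch: inspect the merged dataset config (assumes mmcv's Config
# resolves the `_base_` inheritance and the repo-relative path below).
from mmcv import Config

cfg = Config.fromfile('configs/_base_/datasets/cityscapes_832x832.py')
# The override replaces the base pipelines wholesale, so RandomCrop should
# report the 832x832 crop shared by all ICNet configs.
print(cfg.data.train.pipeline[3])
# expected: {'type': 'RandomCrop', 'crop_size': (832, 832), 'cat_max_ratio': 0.75}
```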

View File

@ -0,0 +1,74 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='EncoderDecoder',
backbone=dict(
type='ICNet',
backbone_cfg=dict(
type='ResNetV1c',
in_channels=3,
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
dilations=(1, 1, 2, 4),
strides=(1, 2, 1, 1),
norm_cfg=norm_cfg,
norm_eval=False,
style='pytorch',
contract_dilation=True),
in_channels=3,
layer_channels=(512, 2048),
light_branch_middle_channels=32,
psp_out_channels=512,
out_channels=(64, 256, 256),
norm_cfg=norm_cfg,
align_corners=False,
),
neck=dict(
type='ICNeck',
in_channels=(64, 256, 256),
out_channels=128,
norm_cfg=norm_cfg,
align_corners=False),
decode_head=dict(
type='FCNHead',
in_channels=128,
channels=128,
num_convs=1,
in_index=2,
dropout_ratio=0,
num_classes=19,
norm_cfg=norm_cfg,
concat_input=False,
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
auxiliary_head=[
dict(
type='FCNHead',
in_channels=128,
channels=128,
num_convs=1,
num_classes=19,
in_index=0,
norm_cfg=norm_cfg,
concat_input=False,
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
dict(
type='FCNHead',
in_channels=128,
channels=128,
num_convs=1,
num_classes=19,
in_index=1,
norm_cfg=norm_cfg,
concat_input=False,
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
],
# model training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'))
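For a single-process smoke test of this model config (a sketch, not part of the diff): build the segmentor with mmseg's builder and run `forward_dummy`. SyncBN is swapped for plain BN here since SyncBN needs a distributed process group; the mmseg v0.18-style APIs are assumed.

```python
import torch
from mmcv import Config
from mmseg.models import build_segmentor

cfg = Config.fromfile('configs/_base_/models/icnet_r50-d8.py')

# SyncBN requires torch.distributed; use BN for a single-process check.
bn = dict(type='BN', requires_grad=True)
cfg.model.backbone.norm_cfg = bn
cfg.model.backbone.backbone_cfg.norm_cfg = bn
cfg.model.neck.norm_cfg = bn
cfg.model.decode_head.norm_cfg = bn
for aux_head in cfg.model.auxiliary_head:
    aux_head.norm_cfg = bn

model = build_segmentor(cfg.model)
model.eval()
with torch.no_grad():
    # forward_dummy runs encode_decode, which upsamples the decode-head
    # logits back to input size: expected (1, 19, 832, 832).
    out = model.forward_dummy(torch.randn(1, 3, 832, 832))
print(out.shape)
```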

View File

@ -32,7 +32,7 @@
| BiSeNetV1 (No Pretrain) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) |
| BiSeNetV1| R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) |
| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) |
| BiSeNetV1 (No Pretrain) | R-50-D32 | 1024x1024 | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) |
| BiSeNetV1 (No Pretrain) | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) |
| BiSeNetV1 | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) |
Note:

View File

@ -92,7 +92,7 @@ Models:
batch size: 1
mode: FP32
resolution: (1024,1024)
memory (GB): 3.3
memory (GB): 15.39
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes

View File

@ -0,0 +1,45 @@
# ICNet for Real-time Semantic Segmentation on High-resolution Images
## Introduction
<!-- [ALGORITHM] -->
<a href="https://github.com/hszhao/ICNet">Official Repo</a>
<a href="https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77">Code Snippet</a>
<details>
<summary align="right"><a href="https://arxiv.org/abs/1704.08545">ICNet (ECCV'2018)</a></summary>
```latex
@inproceedings{zhao2018icnet,
title={Icnet for real-time semantic segmentation on high-resolution images},
author={Zhao, Hengshuang and Qi, Xiaojuan and Shen, Xiaoyong and Shi, Jianping and Jia, Jiaya},
booktitle={Proceedings of the European conference on computer vision (ECCV)},
pages={405--420},
year={2018}
}
```
</details>
## Results and models
### Cityscapes
| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
| ------ | ---------- | --------- | ------: | -------- | -------------- | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ICNet | R-18-D8 | 832x832 | 80000 | 1.70 | 27.12 | 68.14 | 70.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r18-d8_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_80k_cityscapes/icnet_r18-d8_832x832_80k_cityscapes_20210925_225521-2e36638d.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_80k_cityscapes/icnet_r18-d8_832x832_80k_cityscapes_20210925_225521.log.json) |
| ICNet | R-18-D8 | 832x832 | 160000 | - | - | 71.64 | 74.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r18-d8_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_160k_cityscapes/icnet_r18-d8_832x832_160k_cityscapes_20210925_230153-2c6eb6e0.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_160k_cityscapes/icnet_r18-d8_832x832_160k_cityscapes_20210925_230153.log.json) |
| ICNet (in1k-pre) | R-18-D8 | 832x832 | 80000 | - | - | 72.51 | 74.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes_20210925_230354-1cbe3022.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes_20210925_230354.log.json) |
| ICNet (in1k-pre) | R-18-D8 | 832x832 | 160000 | - | - | 74.43 | 76.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes_20210926_052702-619c8ae1.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes_20210926_052702.log.json) |
| ICNet | R-50-D8 | 832x832 | 80000 | 2.53 | 20.08 | 68.91 | 69.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r50-d8_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_80k_cityscapes/icnet_r50-d8_832x832_80k_cityscapes_20210926_044625-c6407341.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_80k_cityscapes/icnet_r50-d8_832x832_80k_cityscapes_20210926_044625.log.json) |
| ICNet | R-50-D8 | 832x832 | 160000 | - | - | 73.82 | 75.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r50-d8_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_160k_cityscapes/icnet_r50-d8_832x832_160k_cityscapes_20210925_232612-a95f0d4e.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_160k_cityscapes/icnet_r50-d8_832x832_160k_cityscapes_20210925_232612.log.json) |
| ICNet (in1k-pre) | R-50-D8 | 832x832 | 80000 | - | - | 74.58 | 76.41 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes_20210926_032943-1743dc7b.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes_20210926_032943.log.json) |
| ICNet (in1k-pre) | R-50-D8 | 832x832 | 160000 | - | - | 76.29 | 78.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes_20210926_042715-ce310aea.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes_20210926_042715.log.json) |
| ICNet | R-101-D8 | 832x832 | 80000 | 3.08 | 16.95 | 70.28 | 71.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r101-d8_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_80k_cityscapes/icnet_r101-d8_832x832_80k_cityscapes_20210926_072447-b52f936e.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_80k_cityscapes/icnet_r101-d8_832x832_80k_cityscapes_20210926_072447.log.json) |
| ICNet | R-101-D8 | 832x832 | 160000 | - | - | 73.80 | 76.10 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r101-d8_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_160k_cityscapes/icnet_r101-d8_832x832_160k_cityscapes_20210926_092350-3a1ebf1a.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_160k_cityscapes/icnet_r101-d8_832x832_160k_cityscapes_20210926_092350.log.json) |
| ICNet (in1k-pre) | R-101-D8 | 832x832 | 80000 | - | - | 75.57 | 77.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes_20210926_020414-7ceb12c5.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes_20210926_020414.log.json) |
| ICNet (in1k-pre) | R-101-D8 | 832x832 | 160000 | - | - | 76.15 | 77.98 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes_20210925_232612-9484ae8a.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes_20210925_232612.log.json) |
Note: `in1k-pre` means the backbone is initialized with ImageNet-1K pretrained weights.

View File

@ -0,0 +1,207 @@
Collections:
- Name: icnet
Metadata:
Training Data:
- Cityscapes
Paper:
URL: https://arxiv.org/abs/1704.08545
Title: ICNet for Real-time Semantic Segmentation on High-resolution Images
README: configs/icnet/README.md
Code:
URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77
Version: v0.18.0
Converted From:
Code: https://github.com/hszhao/ICNet
Models:
- Name: icnet_r18-d8_832x832_80k_cityscapes
In Collection: icnet
Metadata:
backbone: R-18-D8
crop size: (832,832)
lr schd: 80000
inference time (ms/im):
- value: 36.87
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (832,832)
memory (GB): 1.7
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 68.14
mIoU(ms+flip): 70.16
Config: configs/icnet/icnet_r18-d8_832x832_80k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_80k_cityscapes/icnet_r18-d8_832x832_80k_cityscapes_20210925_225521-2e36638d.pth
- Name: icnet_r18-d8_832x832_160k_cityscapes
In Collection: icnet
Metadata:
backbone: R-18-D8
crop size: (832,832)
lr schd: 160000
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 71.64
mIoU(ms+flip): 74.18
Config: configs/icnet/icnet_r18-d8_832x832_160k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_160k_cityscapes/icnet_r18-d8_832x832_160k_cityscapes_20210925_230153-2c6eb6e0.pth
- Name: icnet_r18-d8_in1k-pre_832x832_80k_cityscapes
In Collection: icnet
Metadata:
backbone: R-18-D8
crop size: (832,832)
lr schd: 80000
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 72.51
mIoU(ms+flip): 74.78
Config: configs/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes_20210925_230354-1cbe3022.pth
- Name: icnet_r18-d8_in1k-pre_832x832_160k_cityscapes
In Collection: icnet
Metadata:
backbone: R-18-D8
crop size: (832,832)
lr schd: 160000
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 74.43
mIoU(ms+flip): 76.72
Config: configs/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes_20210926_052702-619c8ae1.pth
- Name: icnet_r50-d8_832x832_80k_cityscapes
In Collection: icnet
Metadata:
backbone: R-50-D8
crop size: (832,832)
lr schd: 80000
inference time (ms/im):
- value: 49.8
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (832,832)
memory (GB): 2.53
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 68.91
mIoU(ms+flip): 69.72
Config: configs/icnet/icnet_r50-d8_832x832_80k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_80k_cityscapes/icnet_r50-d8_832x832_80k_cityscapes_20210926_044625-c6407341.pth
- Name: icnet_r50-d8_832x832_160k_cityscapes
In Collection: icnet
Metadata:
backbone: R-50-D8
crop size: (832,832)
lr schd: 160000
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 73.82
mIoU(ms+flip): 75.67
Config: configs/icnet/icnet_r50-d8_832x832_160k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_160k_cityscapes/icnet_r50-d8_832x832_160k_cityscapes_20210925_232612-a95f0d4e.pth
- Name: icnet_r50-d8_in1k-pre_832x832_80k_cityscapes
In Collection: icnet
Metadata:
backbone: R-50-D8
crop size: (832,832)
lr schd: 80000
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 74.58
mIoU(ms+flip): 76.41
Config: configs/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes_20210926_032943-1743dc7b.pth
- Name: icnet_r50-d8_in1k-pre_832x832_160k_cityscapes
In Collection: icnet
Metadata:
backbone: R-50-D8
crop size: (832,832)
lr schd: 160000
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 76.29
mIoU(ms+flip): 78.09
Config: configs/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes_20210926_042715-ce310aea.pth
- Name: icnet_r101-d8_832x832_80k_cityscapes
In Collection: icnet
Metadata:
backbone: R-101-D8
crop size: (832,832)
lr schd: 80000
inference time (ms/im):
- value: 59.0
hardware: V100
backend: PyTorch
batch size: 1
mode: FP32
resolution: (832,832)
memory (GB): 3.08
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 70.28
mIoU(ms+flip): 71.95
Config: configs/icnet/icnet_r101-d8_832x832_80k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_80k_cityscapes/icnet_r101-d8_832x832_80k_cityscapes_20210926_072447-b52f936e.pth
- Name: icnet_r101-d8_832x832_160k_cityscapes
In Collection: icnet
Metadata:
backbone: R-101-D8
crop size: (832,832)
lr schd: 160000
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 73.8
mIoU(ms+flip): 76.1
Config: configs/icnet/icnet_r101-d8_832x832_160k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_160k_cityscapes/icnet_r101-d8_832x832_160k_cityscapes_20210926_092350-3a1ebf1a.pth
- Name: icnet_r101-d8_in1k-pre_832x832_80k_cityscapes
In Collection: icnet
Metadata:
backbone: R-101-D8
crop size: (832,832)
lr schd: 80000
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 75.57
mIoU(ms+flip): 77.86
Config: configs/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes_20210926_020414-7ceb12c5.pth
- Name: icnet_r101-d8_in1k-pre_832x832_160k_cityscapes
In Collection: icnet
Metadata:
backbone: R-101-D8
crop size: (832,832)
lr schd: 160000
Results:
- Task: Semantic Segmentation
Dataset: Cityscapes
Metrics:
mIoU: 76.15
mIoU(ms+flip): 77.98
Config: configs/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes.py
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes_20210925_232612-9484ae8a.pth

View File

@ -0,0 +1,2 @@
_base_ = './icnet_r50-d8_832x832_160k_cityscapes.py'
model = dict(backbone=dict(backbone_cfg=dict(depth=101)))
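These two-line configs lean on mmcv's `_base_` merging: only the changed keys are written, and everything else is inherited from the R-50 base. A minimal sketch of verifying the merge:

```python
from mmcv import Config

cfg = Config.fromfile('configs/icnet/icnet_r101-d8_832x832_160k_cityscapes.py')
print(cfg.model.backbone.backbone_cfg.depth)  # 101: the override wins over 50
print(cfg.model.backbone.layer_channels)      # (512, 2048): inherited unchanged
```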

View File

@ -0,0 +1,2 @@
_base_ = './icnet_r50-d8_832x832_80k_cityscapes.py'
model = dict(backbone=dict(backbone_cfg=dict(depth=101)))

View File

@ -0,0 +1,7 @@
_base_ = './icnet_r50-d8_832x832_160k_cityscapes.py'
model = dict(
backbone=dict(
backbone_cfg=dict(
depth=101,
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://resnet101_v1c'))))

View File

@ -0,0 +1,7 @@
_base_ = './icnet_r50-d8_832x832_80k_cityscapes.py'
model = dict(
backbone=dict(
backbone_cfg=dict(
depth=101,
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://resnet101_v1c'))))

View File

@ -0,0 +1,3 @@
_base_ = './icnet_r50-d8_832x832_160k_cityscapes.py'
model = dict(
backbone=dict(layer_channels=(128, 512), backbone_cfg=dict(depth=18)))

View File

@ -0,0 +1,3 @@
_base_ = './icnet_r50-d8_832x832_80k_cityscapes.py'
model = dict(
backbone=dict(layer_channels=(128, 512), backbone_cfg=dict(depth=18)))

View File

@ -0,0 +1,8 @@
_base_ = './icnet_r50-d8_832x832_160k_cityscapes.py'
model = dict(
backbone=dict(
layer_channels=(128, 512),
backbone_cfg=dict(
depth=18,
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://resnet18_v1c'))))

View File

@ -0,0 +1,8 @@
_base_ = './icnet_r50-d8_832x832_80k_cityscapes.py'
model = dict(
backbone=dict(
layer_channels=(128, 512),
backbone_cfg=dict(
depth=18,
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://resnet18_v1c'))))

View File

@ -0,0 +1,5 @@
_base_ = [
'../_base_/models/icnet_r50-d8.py',
'../_base_/datasets/cityscapes_832x832.py', '../_base_/default_runtime.py',
'../_base_/schedules/schedule_160k.py'
]

View File

@ -0,0 +1,5 @@
_base_ = [
'../_base_/models/icnet_r50-d8.py',
'../_base_/datasets/cityscapes_832x832.py', '../_base_/default_runtime.py',
'../_base_/schedules/schedule_80k.py'
]

View File

@ -0,0 +1,6 @@
_base_ = './icnet_r50-d8_832x832_160k_cityscapes.py'
model = dict(
backbone=dict(
backbone_cfg=dict(
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://resnet50_v1c'))))

View File

@ -0,0 +1,6 @@
_base_ = './icnet_r50-d8_832x832_80k_cityscapes.py'
model = dict(
backbone=dict(
backbone_cfg=dict(
init_cfg=dict(
type='Pretrained', checkpoint='open-mmlab://resnet50_v1c'))))

View File

@ -4,6 +4,7 @@ from .bisenetv2 import BiSeNetV2
from .cgnet import CGNet
from .fast_scnn import FastSCNN
from .hrnet import HRNet
from .icnet import ICNet
from .mit import MixVisionTransformer
from .mobilenet_v2 import MobileNetV2
from .mobilenet_v3 import MobileNetV3
@ -18,5 +19,5 @@ __all__ = [
'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN',
'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3',
'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer',
'BiSeNetV1', 'BiSeNetV2'
'BiSeNetV1', 'BiSeNetV2', 'ICNet'
]

View File

@ -0,0 +1,165 @@
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule
from mmseg.ops import resize
from ..builder import BACKBONES, build_backbone
from ..decode_heads.psp_head import PPM
@BACKBONES.register_module()
class ICNet(BaseModule):
"""ICNet for Real-Time Semantic Segmentation on High-Resolution Images.
This backbone is the implementation of
`ICNet <https://arxiv.org/abs/1704.08545>`_.
Args:
backbone_cfg (dict): Config dict to build the backbone. Usually it is
ResNet, but other backbones can also be used.
in_channels (int): The number of input image channels. Default: 3.
layer_channels (Sequence[int]): The numbers of feature channels at
layer 2 and layer 4 of the backbone (ResNet by default; other
backbones may differ). Default: (512, 2048).
light_branch_middle_channels (int): The number of channels of the
middle layer in the light branch. Default: 32.
psp_out_channels (int): The number of channels of the output of the PSP
module. Default: 512.
out_channels (Sequence[int]): The numbers of output feature channels
at each branch. Default: (64, 256, 256).
pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
Module. Default: (1, 2, 3, 6).
conv_cfg (dict): Dictionary to construct and config conv layer.
Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN').
act_cfg (dict): Dictionary to construct and config act layer.
Default: dict(type='ReLU').
align_corners (bool): align_corners argument of F.interpolate.
Default: False.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
"""
def __init__(self,
backbone_cfg,
in_channels=3,
layer_channels=(512, 2048),
light_branch_middle_channels=32,
psp_out_channels=512,
out_channels=(64, 256, 256),
pool_scales=(1, 2, 3, 6),
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='ReLU'),
align_corners=False,
init_cfg=None):
if backbone_cfg is None:
raise TypeError('backbone_cfg must be passed from config file!')
if init_cfg is None:
init_cfg = [
dict(type='Kaiming', mode='fan_out', layer='Conv2d'),
dict(type='Constant', val=1, layer='_BatchNorm'),
dict(type='Normal', mean=0.01, layer='Linear')
]
super(ICNet, self).__init__(init_cfg=init_cfg)
self.align_corners = align_corners
self.backbone = build_backbone(backbone_cfg)
# Note: Default `ceil_mode` is False in nn.MaxPool2d; set
# `ceil_mode=True` to keep information at the corners of the feature map.
self.backbone.maxpool = nn.MaxPool2d(
kernel_size=3, stride=2, padding=1, ceil_mode=True)
self.psp_modules = PPM(
pool_scales=pool_scales,
in_channels=layer_channels[1],
channels=psp_out_channels,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
align_corners=align_corners)
self.psp_bottleneck = ConvModule(
layer_channels[1] + len(pool_scales) * psp_out_channels,
psp_out_channels,
3,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
self.conv_sub1 = nn.Sequential(
ConvModule(
in_channels=in_channels,
out_channels=light_branch_middle_channels,
kernel_size=3,
stride=2,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg),
ConvModule(
in_channels=light_branch_middle_channels,
out_channels=light_branch_middle_channels,
kernel_size=3,
stride=2,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg),
ConvModule(
in_channels=light_branch_middle_channels,
out_channels=out_channels[0],
kernel_size=3,
stride=2,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg))
self.conv_sub2 = ConvModule(
layer_channels[0],
out_channels[1],
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg)
self.conv_sub4 = ConvModule(
psp_out_channels,
out_channels[2],
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg)
def forward(self, x):
output = []
# sub 1
output.append(self.conv_sub1(x))
# sub 2
x = resize(
x,
scale_factor=0.5,
mode='bilinear',
align_corners=self.align_corners)
x = self.backbone.stem(x)
x = self.backbone.maxpool(x)
x = self.backbone.layer1(x)
x = self.backbone.layer2(x)
output.append(self.conv_sub2(x))
# sub 4
x = resize(
x,
scale_factor=0.5,
mode='bilinear',
align_corners=self.align_corners)
x = self.backbone.layer3(x)
x = self.backbone.layer4(x)
psp_outs = self.psp_modules(x) + [x]
psp_outs = torch.cat(psp_outs, dim=1)
x = self.psp_bottleneck(psp_outs)
output.append(self.conv_sub4(x))
return output
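The three outputs land at 1/8, 1/16, and 1/32 of the input resolution: sub1 comes from the three stride-2 convs, sub2 from the half-size resize followed by the backbone stem, maxpool, layer1 and layer2, and sub4 from a further half-size resize through the dilated layer3/layer4 plus PSP. A minimal forward-pass sketch (plain BN assumed so it runs on a single device):

```python
import torch
from mmseg.models.backbones import ICNet

backbone = ICNet(
    backbone_cfg=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=dict(type='BN', requires_grad=True),
        contract_dilation=True))
backbone.eval()
with torch.no_grad():
    sub1, sub2, sub4 = backbone(torch.randn(1, 3, 512, 1024))
print(sub1.shape)  # torch.Size([1, 64, 64, 128])  -> 1/8 resolution
print(sub2.shape)  # torch.Size([1, 256, 32, 64])  -> 1/16 resolution
print(sub4.shape)  # torch.Size([1, 256, 16, 32])  -> 1/32 resolution
```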

View File

@ -1,6 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .fpn import FPN
from .ic_neck import ICNeck
from .mla_neck import MLANeck
from .multilevel_neck import MultiLevelNeck
__all__ = ['FPN', 'MultiLevelNeck', 'MLANeck']
__all__ = ['FPN', 'MultiLevelNeck', 'MLANeck', 'ICNeck']

View File

@ -0,0 +1,147 @@
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule
from mmseg.ops import resize
from ..builder import NECKS
class CascadeFeatureFusion(BaseModule):
"""Cascade Feature Fusion Unit in ICNet.
Args:
low_channels (int): The number of input channels for
low resolution feature map.
high_channels (int): The number of input channels for
high resolution feature map.
out_channels (int): The number of output channels.
conv_cfg (dict): Dictionary to construct and config conv layer.
Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN').
act_cfg (dict): Dictionary to construct and config act layer.
Default: dict(type='ReLU').
align_corners (bool): align_corners argument of F.interpolate.
Default: False.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
Returns:
x (Tensor): The output tensor of shape (N, out_channels, H, W).
x_low (Tensor): The output tensor of shape (N, out_channels, H, W)
for Cascade Label Guidance in auxiliary heads.
"""
def __init__(self,
low_channels,
high_channels,
out_channels,
conv_cfg=None,
norm_cfg=dict(type='BN'),
act_cfg=dict(type='ReLU'),
align_corners=False,
init_cfg=None):
super(CascadeFeatureFusion, self).__init__(init_cfg=init_cfg)
self.align_corners = align_corners
self.conv_low = ConvModule(
low_channels,
out_channels,
3,
padding=2,
dilation=2,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
self.conv_high = ConvModule(
high_channels,
out_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
def forward(self, x_low, x_high):
x_low = resize(
x_low,
size=x_high.size()[2:],
mode='bilinear',
align_corners=self.align_corners)
# Note: Unlike the original paper, `x_low` is passed through
# `self.conv_low` rather than a separate 1x1 conv classifier
# before being fed to the auxiliary head.
x_low = self.conv_low(x_low)
x_high = self.conv_high(x_high)
x = x_low + x_high
x = F.relu(x, inplace=True)
return x, x_low
@NECKS.register_module()
class ICNeck(BaseModule):
"""ICNet for Real-Time Semantic Segmentation on High-Resolution Images.
This neck is the implementation of `ICHead
<https://arxiv.org/abs/1704.08545>`_.
Args:
in_channels (Sequence[int]): The numbers of input feature channels
at the three ICNet branches. Default: (64, 256, 256).
out_channels (int): The number of output feature channels.
Default: 128.
conv_cfg (dict): Dictionary to construct and config conv layer.
Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN').
act_cfg (dict): Dictionary to construct and config act layer.
Default: dict(type='ReLU').
align_corners (bool): align_corners argument of F.interpolate.
Default: False.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
"""
def __init__(self,
in_channels=(64, 256, 256),
out_channels=128,
conv_cfg=None,
norm_cfg=dict(type='BN'),
act_cfg=dict(type='ReLU'),
align_corners=False,
init_cfg=None):
super(ICNeck, self).__init__(init_cfg=init_cfg)
assert len(in_channels) == 3, 'Length of input channels must be 3!'
self.in_channels = in_channels
self.out_channels = out_channels
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
self.align_corners = align_corners
self.cff_24 = CascadeFeatureFusion(
self.in_channels[2],
self.in_channels[1],
self.out_channels,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg,
align_corners=self.align_corners)
self.cff_12 = CascadeFeatureFusion(
self.out_channels,
self.in_channels[0],
self.out_channels,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg,
align_corners=self.align_corners)
def forward(self, inputs):
assert len(inputs) == 3, 'Length of input feature maps must be 3!'
x_sub1, x_sub2, x_sub4 = inputs
x_cff_24, x_24 = self.cff_24(x_sub4, x_sub2)
x_cff_12, x_12 = self.cff_12(x_cff_24, x_sub1)
# Note: `x_cff_12` is used for decode_head,
# `x_24` and `x_12` are used for auxiliary head.
return x_24, x_12, x_cff_12
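Chained together, `cff_24` fuses the 1/32 and 1/16 branches first, then `cff_12` lifts the result to the 1/8 branch, so all three returned maps carry `out_channels` channels. A minimal shape check (branch resolutions of an 832x832 crop assumed):

```python
import torch
from mmseg.models.necks import ICNeck

neck = ICNeck(
    in_channels=(64, 256, 256),
    out_channels=128,
    norm_cfg=dict(type='BN', requires_grad=True),
    align_corners=False)
neck.eval()
feats = (
    torch.randn(1, 64, 104, 104),  # sub1: 1/8 of an 832x832 crop
    torch.randn(1, 256, 52, 52),   # sub2: 1/16
    torch.randn(1, 256, 26, 26),   # sub4: 1/32
)
with torch.no_grad():
    x_24, x_12, x_cff_12 = neck(feats)
print(x_24.shape)      # torch.Size([1, 128, 52, 52]):   auxiliary head (in_index=0)
print(x_12.shape)      # torch.Size([1, 128, 104, 104]): auxiliary head (in_index=1)
print(x_cff_12.shape)  # torch.Size([1, 128, 104, 104]): decode head (in_index=2)
```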

View File

@ -18,6 +18,7 @@ Import:
- configs/fp16/fp16.yml
- configs/gcnet/gcnet.yml
- configs/hrnet/hrnet.yml
- configs/icnet/icnet.yml
- configs/isanet/isanet.yml
- configs/mobilenet_v2/mobilenet_v2.yml
- configs/mobilenet_v3/mobilenet_v3.yml

View File

@ -0,0 +1,48 @@
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmseg.models.backbones import ICNet
def test_icnet_backbone():
with pytest.raises(TypeError):
# Must give backbone dict in config file.
ICNet(
in_channels=3,
layer_channels=(512, 2048),
light_branch_middle_channels=32,
psp_out_channels=512,
out_channels=(64, 256, 256),
backbone_cfg=None)
# Test ICNet Standard Forward
model = ICNet(
backbone_cfg=dict(
type='ResNetV1c',
in_channels=3,
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
dilations=(1, 1, 2, 4),
strides=(1, 2, 1, 1),
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
style='pytorch',
contract_dilation=True))
assert hasattr(model.backbone,
'maxpool') and model.backbone.maxpool.ceil_mode is True
model.init_weights()
model.train()
batch_size = 2
imgs = torch.randn(batch_size, 3, 512, 1024)
feat = model(imgs)
assert model.psp_modules[0][0].output_size == 1
assert model.psp_modules[1][0].output_size == 2
assert model.psp_modules[2][0].output_size == 3
assert model.psp_bottleneck.padding == 1
assert model.conv_sub1[0].padding == 1
assert len(feat) == 3
assert feat[0].shape == torch.Size([batch_size, 64, 64, 128])

View File

@ -50,22 +50,22 @@ def test_swin_transformer():
model(temp)
# Test normal inference
temp = torch.randn((1, 3, 512, 512))
temp = torch.randn((1, 3, 256, 256))
model = SwinTransformer()
outs = model(temp)
assert outs[0].shape == (1, 96, 128, 128)
assert outs[1].shape == (1, 192, 64, 64)
assert outs[2].shape == (1, 384, 32, 32)
assert outs[3].shape == (1, 768, 16, 16)
assert outs[0].shape == (1, 96, 64, 64)
assert outs[1].shape == (1, 192, 32, 32)
assert outs[2].shape == (1, 384, 16, 16)
assert outs[3].shape == (1, 768, 8, 8)
# Test abnormal inference size
temp = torch.randn((1, 3, 511, 511))
temp = torch.randn((1, 3, 255, 255))
model = SwinTransformer()
outs = model(temp)
assert outs[0].shape == (1, 96, 128, 128)
assert outs[1].shape == (1, 192, 64, 64)
assert outs[2].shape == (1, 384, 32, 32)
assert outs[3].shape == (1, 768, 16, 16)
assert outs[0].shape == (1, 96, 64, 64)
assert outs[1].shape == (1, 192, 32, 32)
assert outs[2].shape == (1, 384, 16, 16)
assert outs[3].shape == (1, 768, 8, 8)
# Test abnormal inference size
temp = torch.randn((1, 3, 112, 137))
@ -89,7 +89,7 @@ def test_swin_transformer():
assert not p.requires_grad
# Test Swin with checkpoint forward
temp = torch.randn((1, 3, 224, 224))
temp = torch.randn((1, 3, 112, 112))
model = SwinTransformer(with_cp=True)
for m in model.modules():
if isinstance(m, SwinBlock):

View File

@ -345,7 +345,7 @@ def test_unet():
# case is 8.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=4,
strides=(1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2),
@ -362,7 +362,7 @@ def test_unet():
# case is 16.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -379,7 +379,7 @@ def test_unet():
# case is 8.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -396,7 +396,7 @@ def test_unet():
# case is 8.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 2, 2, 2, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -413,7 +413,7 @@ def test_unet():
# case is 32.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=6,
strides=(1, 1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2, 2),
@ -428,7 +428,7 @@ def test_unet():
# Check if num_stages matchs strides, len(strides)=num_stages
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -443,7 +443,7 @@ def test_unet():
# Check if num_stages matchs strides, len(enc_num_convs)=num_stages
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2),
@ -458,7 +458,7 @@ def test_unet():
# Check if num_stages matchs strides, len(dec_num_convs)=num_stages-1
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -473,7 +473,7 @@ def test_unet():
# Check if num_stages matchs strides, len(downsamples)=num_stages-1
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -488,7 +488,7 @@ def test_unet():
# Check if num_stages matchs strides, len(enc_dilations)=num_stages
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -503,7 +503,7 @@ def test_unet():
# Check if num_stages matchs strides, len(dec_dilations)=num_stages-1
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -517,7 +517,7 @@ def test_unet():
# test UNet norm_eval=True
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -532,7 +532,7 @@ def test_unet():
# test UNet norm_eval=False
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -547,7 +547,7 @@ def test_unet():
# test UNet forward and outputs. The whole downsample rate is 16.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -558,16 +558,16 @@ def test_unet():
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 8, 8])
assert x_outs[1].shape == torch.Size([2, 512, 16, 16])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 8, 8])
assert x_outs[1].shape == torch.Size([2, 32, 16, 16])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 8.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -578,16 +578,16 @@ def test_unet():
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 16, 16])
assert x_outs[1].shape == torch.Size([2, 512, 16, 16])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 16, 16])
assert x_outs[1].shape == torch.Size([2, 32, 16, 16])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 8.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 2, 2, 2, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -598,16 +598,16 @@ def test_unet():
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 16, 16])
assert x_outs[1].shape == torch.Size([2, 512, 16, 16])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 16, 16])
assert x_outs[1].shape == torch.Size([2, 32, 16, 16])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 4.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -618,16 +618,16 @@ def test_unet():
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 32, 32])
assert x_outs[1].shape == torch.Size([2, 512, 32, 32])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 32, 32])
assert x_outs[1].shape == torch.Size([2, 32, 32, 32])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 4.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 2, 2, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -638,16 +638,16 @@ def test_unet():
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 32, 32])
assert x_outs[1].shape == torch.Size([2, 512, 32, 32])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 32, 32])
assert x_outs[1].shape == torch.Size([2, 32, 32, 32])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 8.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -658,16 +658,16 @@ def test_unet():
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 16, 16])
assert x_outs[1].shape == torch.Size([2, 512, 16, 16])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 16, 16])
assert x_outs[1].shape == torch.Size([2, 32, 16, 16])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 4.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -678,16 +678,16 @@ def test_unet():
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 32, 32])
assert x_outs[1].shape == torch.Size([2, 512, 32, 32])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 32, 32])
assert x_outs[1].shape == torch.Size([2, 32, 32, 32])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 2.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -698,16 +698,16 @@ def test_unet():
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 64, 64])
assert x_outs[1].shape == torch.Size([2, 512, 64, 64])
assert x_outs[2].shape == torch.Size([2, 256, 64, 64])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 64, 64])
assert x_outs[1].shape == torch.Size([2, 32, 64, 64])
assert x_outs[2].shape == torch.Size([2, 16, 64, 64])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 1.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 1, 1, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -718,16 +718,16 @@ def test_unet():
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 128, 128])
assert x_outs[1].shape == torch.Size([2, 512, 128, 128])
assert x_outs[2].shape == torch.Size([2, 256, 128, 128])
assert x_outs[3].shape == torch.Size([2, 128, 128, 128])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 128, 128])
assert x_outs[1].shape == torch.Size([2, 32, 128, 128])
assert x_outs[2].shape == torch.Size([2, 16, 128, 128])
assert x_outs[3].shape == torch.Size([2, 8, 128, 128])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 16.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 2, 2, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -737,16 +737,16 @@ def test_unet():
dec_dilations=(1, 1, 1, 1))
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 8, 8])
assert x_outs[1].shape == torch.Size([2, 512, 16, 16])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 8, 8])
assert x_outs[1].shape == torch.Size([2, 32, 16, 16])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 8.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 2, 2, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -756,16 +756,16 @@ def test_unet():
dec_dilations=(1, 1, 1, 1))
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 16, 16])
assert x_outs[1].shape == torch.Size([2, 512, 16, 16])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 16, 16])
assert x_outs[1].shape == torch.Size([2, 32, 16, 16])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 8.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 2, 2, 2, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -775,16 +775,16 @@ def test_unet():
dec_dilations=(1, 1, 1, 1))
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 16, 16])
assert x_outs[1].shape == torch.Size([2, 512, 16, 16])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 16, 16])
assert x_outs[1].shape == torch.Size([2, 32, 16, 16])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet forward and outputs. The whole downsample rate is 4.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 2, 2, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -794,16 +794,16 @@ def test_unet():
dec_dilations=(1, 1, 1, 1))
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 32, 32])
assert x_outs[1].shape == torch.Size([2, 512, 32, 32])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 32, 32])
assert x_outs[1].shape == torch.Size([2, 32, 32, 32])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])
# test UNet init_weights method.
unet = UNet(
in_channels=3,
base_channels=64,
base_channels=4,
num_stages=5,
strides=(1, 2, 2, 1, 1),
enc_num_convs=(2, 2, 2, 2, 2),
@ -815,8 +815,8 @@ def test_unet():
unet.init_weights()
x = torch.randn(2, 3, 128, 128)
x_outs = unet(x)
assert x_outs[0].shape == torch.Size([2, 1024, 32, 32])
assert x_outs[1].shape == torch.Size([2, 512, 32, 32])
assert x_outs[2].shape == torch.Size([2, 256, 32, 32])
assert x_outs[3].shape == torch.Size([2, 128, 64, 64])
assert x_outs[4].shape == torch.Size([2, 64, 128, 128])
assert x_outs[0].shape == torch.Size([2, 64, 32, 32])
assert x_outs[1].shape == torch.Size([2, 32, 32, 32])
assert x_outs[2].shape == torch.Size([2, 16, 32, 32])
assert x_outs[3].shape == torch.Size([2, 8, 64, 64])
assert x_outs[4].shape == torch.Size([2, 4, 128, 128])

View File

@ -0,0 +1,53 @@
# Copyright (c) OpenMMLab. All rights reserved.
import pytest
import torch
from mmseg.models.necks import ICNeck
from mmseg.models.necks.ic_neck import CascadeFeatureFusion
from ..test_heads.utils import _conv_has_norm, to_cuda
def test_ic_neck():
# test with norm_cfg
neck = ICNeck(
in_channels=(64, 256, 256),
out_channels=128,
norm_cfg=dict(type='SyncBN'),
align_corners=False)
assert _conv_has_norm(neck, sync_bn=True)
inputs = [
torch.randn(1, 64, 128, 256),
torch.randn(1, 256, 65, 129),
torch.randn(1, 256, 32, 64)
]
neck = ICNeck(
in_channels=(64, 256, 256),
out_channels=128,
norm_cfg=dict(type='BN', requires_grad=True),
align_corners=False)
if torch.cuda.is_available():
neck, inputs = to_cuda(neck, inputs)
outputs = neck(inputs)
assert outputs[0].shape == (1, 128, 65, 129)
assert outputs[1].shape == (1, 128, 128, 256)
assert outputs[2].shape == (1, 128, 128, 256)
def test_ic_neck_cascade_feature_fusion():
cff = CascadeFeatureFusion(256, 256, 128)
assert cff.conv_low.in_channels == 256
assert cff.conv_low.out_channels == 128
assert cff.conv_high.in_channels == 256
assert cff.conv_high.out_channels == 128
def test_ic_neck_input_channels():
with pytest.raises(AssertionError):
# `in_channels` of ICNeck must contain exactly 3 elements.
ICNeck(
in_channels=(64, 256, 256, 256),
out_channels=128,
norm_cfg=dict(type='BN', requires_grad=True),
align_corners=False)

View File