add segformer algo

* Added the COCO_Stuff_164k dataset to data_hub.md and prepared_data.md, and added its preparation script under the prepare_data folder

* Added a predictor for segformer

* Improved the segformer information in docs/source/model_zoo_seg.md
pull/191/head
pengyu.lpy 2022-08-18 10:40:18 +08:00 committed by jiangnana.jnn
parent fbdde2a054
commit 0128e881d9
21 changed files with 2060 additions and 2 deletions

.gitattributes
@@ -1,5 +1,5 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.wav filter=lfs diff=lfs merge=lfs -text
*.JPEG filter=lfs diff=lfs merge=lfs -text

configs/segmentation/segformer/segformer_b0_coco.py
@@ -0,0 +1,249 @@
# SegFormer B0
CLASSES = [
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
'hair drier', 'toothbrush', 'banner', 'blanket', 'branch', 'bridge',
'building-other', 'bush', 'cabinet', 'cage', 'cardboard', 'carpet',
'ceiling-other', 'ceiling-tile', 'cloth', 'clothes', 'clouds', 'counter',
'cupboard', 'curtain', 'desk-stuff', 'dirt', 'door-stuff', 'fence',
'floor-marble', 'floor-other', 'floor-stone', 'floor-tile', 'floor-wood',
'flower', 'fog', 'food-other', 'fruit', 'furniture-other', 'grass',
'gravel', 'ground-other', 'hill', 'house', 'leaves', 'light', 'mat',
'metal', 'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net',
'paper', 'pavement', 'pillow', 'plant-other', 'plastic', 'platform',
'playingfield', 'railing', 'railroad', 'river', 'road', 'rock', 'roof',
'rug', 'salad', 'sand', 'sea', 'shelf', 'sky-other', 'skyscraper', 'snow',
'solid-other', 'stairs', 'stone', 'straw', 'structural-other', 'table',
'tent', 'textile-other', 'towel', 'tree', 'vegetable', 'wall-brick',
'wall-concrete', 'wall-other', 'wall-panel', 'wall-stone', 'wall-tile',
'wall-wood', 'water-other', 'waterdrops', 'window-blind', 'window-other',
'wood'
]
PALETTE = [[0, 192, 64], [0, 192, 64], [0, 64, 96],
[128, 192, 192], [0, 64, 64], [0, 192, 224], [0, 192, 192],
[128, 192, 64], [0, 192, 96], [128, 192, 64], [128, 32, 192],
[0, 0, 224], [0, 0, 64], [0, 160, 192], [128, 0, 96], [128, 0, 192],
[0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192],
[128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128],
[64, 128, 32], [0, 160, 0], [0, 0, 0], [192, 128, 160], [0, 32, 0],
[0, 128, 128], [64, 128, 160], [128, 160, 0], [0, 128, 0],
[192, 128, 32], [128, 96, 128], [0, 0, 128], [64, 0, 32],
[0, 224, 128], [128, 0, 0], [192, 0, 160], [0, 96, 128],
[128, 128, 128], [64, 0, 160], [128, 224, 128], [128, 128,
64], [192, 0, 32],
[128, 96, 0], [128, 0, 192], [0, 128, 32], [64, 224, 0], [0, 0, 64],
[128, 128, 160], [64, 96, 0], [0, 128, 192], [0, 128, 160],
[192, 224, 0], [0, 128, 64], [128, 128, 32], [192, 32, 128],
[0, 64, 192], [0, 0, 32], [64, 160, 128], [128, 64, 64],
[128, 0, 160], [64, 32, 128], [128, 192, 192], [0, 0, 160],
[192, 160, 128], [128, 192, 0], [128, 0, 96], [192, 32, 0],
[128, 64, 128], [64, 128, 96], [64, 160, 0], [0, 64, 0],
[192, 128, 224], [64, 32, 0], [0, 192, 128], [64, 128, 224],
[192, 160, 0], [0, 192, 0], [192, 128, 96], [192, 96, 128],
[0, 64, 128], [64, 0, 96], [64, 224, 128], [128, 64, 0],
[192, 0, 224], [64, 96, 128], [128, 192, 128], [64, 0, 224],
[192, 224, 128], [128, 192, 64], [192, 0, 96], [192, 96, 0],
[128, 64, 192], [0, 128, 96], [0, 224, 0], [64, 64, 64],
[128, 128, 224], [0, 96, 0], [64, 192, 192], [0, 128, 224],
[128, 224, 0], [64, 192, 64], [128, 128, 96], [128, 32, 128],
[64, 0, 192], [0, 64, 96], [0, 160, 128], [192, 0, 64],
[128, 64, 224], [0, 32, 128], [192, 128, 192], [0, 64, 224],
[128, 160, 128], [192, 128, 0], [128, 64, 32], [128, 32, 64],
[192, 0, 128], [64, 192, 32], [0, 160, 64], [64, 0, 0],
[192, 192, 160], [0, 32, 64], [64, 128, 128], [64, 192, 160],
[128, 160, 64], [64, 128, 0], [192, 192, 32], [128, 96, 192],
[64, 0, 128], [64, 64, 32], [0, 224, 192], [192, 0, 0],
[192, 64, 160], [0, 96, 192], [192, 128, 128], [64, 64, 160],
[128, 224, 192], [192, 128, 64], [192, 64, 32], [128, 96, 64],
[192, 0, 192], [0, 192, 32], [64, 224, 64], [64, 0, 64],
[128, 192, 160], [64, 96, 64], [64, 128, 192], [0, 192, 160],
[192, 224, 64], [64, 128, 64], [128, 192, 32], [192, 32, 192],
[64, 64, 192], [0, 64, 32], [64, 160, 192], [192, 64, 64],
[128, 64, 160], [64, 32, 192], [192, 192, 192], [0, 64, 160],
[192, 160, 192], [192, 192, 0], [128, 64, 96], [192, 32, 64],
[192, 64, 128], [64, 192, 96], [64, 160, 64], [64, 64, 0]]
num_classes = 172
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='EncoderDecoder',
pretrained=
'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth',
backbone=dict(
type='MixVisionTransformer',
in_channels=3,
embed_dims=32,
num_stages=4,
num_layers=[2, 2, 2, 2],
num_heads=[1, 2, 5, 8],
patch_sizes=[7, 3, 3, 3],
sr_ratios=[8, 4, 2, 1],
out_indices=(0, 1, 2, 3),
mlp_ratio=4,
qkv_bias=True,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.1),
decode_head=dict(
type='SegformerHead',
in_channels=[32, 64, 160, 256],
in_index=[0, 1, 2, 3],
channels=256,
dropout_ratio=0.1,
num_classes=num_classes,
norm_cfg=norm_cfg,
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset settings
dataset_type = 'SegDataset'
data_root = './data/coco_stuff164k/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
dict(type='MMResize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
dict(type='SegRandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='MMRandomFlip', flip_ratio=0.5),
dict(type='MMPhotoMetricDistortion'),
dict(type='MMNormalize', **img_norm_cfg),
dict(type='MMPad', size=crop_size, pad_val=dict(img=0, masks=0, seg=255)),
dict(type='DefaultFormatBundle'),
dict(
type='Collect',
keys=['img', 'gt_semantic_seg'],
meta_keys=('filename', 'ori_filename', 'ori_shape', 'img_shape',
'pad_shape', 'scale_factor', 'flip', 'flip_direction',
'img_norm_cfg')),
]
test_pipeline = [
dict(
type='MMMultiScaleFlipAug',
img_scale=(2048, 512),
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='MMResize', keep_ratio=True),
dict(type='MMRandomFlip'),
dict(type='MMNormalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(
type='Collect',
keys=['img'],
meta_keys=('filename', 'ori_filename', 'ori_shape',
'img_shape', 'pad_shape', 'scale_factor', 'flip',
'flip_direction', 'img_norm_cfg')),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ignore_index=255,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'train2017/',
label_root=data_root + 'annotations/train2017/',
split=data_root + 'train.txt',
classes=CLASSES,
),
pipeline=train_pipeline),
val=dict(
imgs_per_gpu=1,
workers_per_gpu=1,
ignore_index=255,
type=dataset_type,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'val2017/',
label_root=data_root + 'annotations/val2017/',
split=data_root + 'val.txt',
classes=CLASSES,
),
pipeline=test_pipeline),
test=dict(
imgs_per_gpu=1,
workers_per_gpu=1,
type=dataset_type,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'val2017/',
label_root=data_root + 'annotations/val2017/',
split=data_root + 'val.txt',
classes=CLASSES,
),
pipeline=test_pipeline))
optimizer = dict(
type='AdamW',
lr=6e-05,
betas=(0.9, 0.999),
weight_decay=0.01,
paramwise_options=dict(
custom_keys=dict(
pos_block=dict(decay_mult=0.0),
norm=dict(decay_mult=0.0),
head=dict(lr_mult=10.0))))
optimizer_config = dict()
lr_config = dict(
policy='poly',
warmup='linear',
warmup_iters=800,
warmup_ratio=1e-06,
power=1.0,
min_lr=0.0,
by_epoch=False)
# runtime settings
total_epochs = 20
checkpoint_config = dict(interval=1)
eval_config = dict(interval=1, gpu_collect=False)
eval_pipelines = [
dict(
mode='test',
evaluators=[
dict(
type='SegmentationEvaluator',
classes=CLASSES,
metric_names=['mIoU'])
],
)
]
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
dist_params = dict(backend='nccl')
cudnn_benchmark = False
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
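Note: the `decode_head.in_channels=[32, 64, 160, 256]` above is not arbitrary; each MiT stage runs at `embed_dims * num_heads[i]` channels (see the `MixVisionTransformer` backbone added later in this diff). A minimal sketch of that arithmetic with the B0 values from this config:

```python
# Sketch only: reproduces the stage-width rule used by MixVisionTransformer
# (embed_dims_i = embed_dims * num_heads[i]) with the B0 values above.
embed_dims = 32
num_heads = [1, 2, 5, 8]

stage_channels = [embed_dims * h for h in num_heads]
print(stage_channels)  # [32, 64, 160, 256] -> matches decode_head.in_channels
```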

configs/segmentation/segformer/segformer_b5_coco.py
@@ -0,0 +1,249 @@
# SegFormer B5
CLASSES = [
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
'hair drier', 'toothbrush', 'banner', 'blanket', 'branch', 'bridge',
'building-other', 'bush', 'cabinet', 'cage', 'cardboard', 'carpet',
'ceiling-other', 'ceiling-tile', 'cloth', 'clothes', 'clouds', 'counter',
'cupboard', 'curtain', 'desk-stuff', 'dirt', 'door-stuff', 'fence',
'floor-marble', 'floor-other', 'floor-stone', 'floor-tile', 'floor-wood',
'flower', 'fog', 'food-other', 'fruit', 'furniture-other', 'grass',
'gravel', 'ground-other', 'hill', 'house', 'leaves', 'light', 'mat',
'metal', 'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net',
'paper', 'pavement', 'pillow', 'plant-other', 'plastic', 'platform',
'playingfield', 'railing', 'railroad', 'river', 'road', 'rock', 'roof',
'rug', 'salad', 'sand', 'sea', 'shelf', 'sky-other', 'skyscraper', 'snow',
'solid-other', 'stairs', 'stone', 'straw', 'structural-other', 'table',
'tent', 'textile-other', 'towel', 'tree', 'vegetable', 'wall-brick',
'wall-concrete', 'wall-other', 'wall-panel', 'wall-stone', 'wall-tile',
'wall-wood', 'water-other', 'waterdrops', 'window-blind', 'window-other',
'wood'
]
PALETTE = [[0, 192, 64], [0, 192, 64], [0, 64, 96],
[128, 192, 192], [0, 64, 64], [0, 192, 224], [0, 192, 192],
[128, 192, 64], [0, 192, 96], [128, 192, 64], [128, 32, 192],
[0, 0, 224], [0, 0, 64], [0, 160, 192], [128, 0, 96], [128, 0, 192],
[0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192],
[128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128],
[64, 128, 32], [0, 160, 0], [0, 0, 0], [192, 128, 160], [0, 32, 0],
[0, 128, 128], [64, 128, 160], [128, 160, 0], [0, 128, 0],
[192, 128, 32], [128, 96, 128], [0, 0, 128], [64, 0, 32],
[0, 224, 128], [128, 0, 0], [192, 0, 160], [0, 96, 128],
[128, 128, 128], [64, 0, 160], [128, 224, 128], [128, 128,
64], [192, 0, 32],
[128, 96, 0], [128, 0, 192], [0, 128, 32], [64, 224, 0], [0, 0, 64],
[128, 128, 160], [64, 96, 0], [0, 128, 192], [0, 128, 160],
[192, 224, 0], [0, 128, 64], [128, 128, 32], [192, 32, 128],
[0, 64, 192], [0, 0, 32], [64, 160, 128], [128, 64, 64],
[128, 0, 160], [64, 32, 128], [128, 192, 192], [0, 0, 160],
[192, 160, 128], [128, 192, 0], [128, 0, 96], [192, 32, 0],
[128, 64, 128], [64, 128, 96], [64, 160, 0], [0, 64, 0],
[192, 128, 224], [64, 32, 0], [0, 192, 128], [64, 128, 224],
[192, 160, 0], [0, 192, 0], [192, 128, 96], [192, 96, 128],
[0, 64, 128], [64, 0, 96], [64, 224, 128], [128, 64, 0],
[192, 0, 224], [64, 96, 128], [128, 192, 128], [64, 0, 224],
[192, 224, 128], [128, 192, 64], [192, 0, 96], [192, 96, 0],
[128, 64, 192], [0, 128, 96], [0, 224, 0], [64, 64, 64],
[128, 128, 224], [0, 96, 0], [64, 192, 192], [0, 128, 224],
[128, 224, 0], [64, 192, 64], [128, 128, 96], [128, 32, 128],
[64, 0, 192], [0, 64, 96], [0, 160, 128], [192, 0, 64],
[128, 64, 224], [0, 32, 128], [192, 128, 192], [0, 64, 224],
[128, 160, 128], [192, 128, 0], [128, 64, 32], [128, 32, 64],
[192, 0, 128], [64, 192, 32], [0, 160, 64], [64, 0, 0],
[192, 192, 160], [0, 32, 64], [64, 128, 128], [64, 192, 160],
[128, 160, 64], [64, 128, 0], [192, 192, 32], [128, 96, 192],
[64, 0, 128], [64, 64, 32], [0, 224, 192], [192, 0, 0],
[192, 64, 160], [0, 96, 192], [192, 128, 128], [64, 64, 160],
[128, 224, 192], [192, 128, 64], [192, 64, 32], [128, 96, 64],
[192, 0, 192], [0, 192, 32], [64, 224, 64], [64, 0, 64],
[128, 192, 160], [64, 96, 64], [64, 128, 192], [0, 192, 160],
[192, 224, 64], [64, 128, 64], [128, 192, 32], [192, 32, 192],
[64, 64, 192], [0, 64, 32], [64, 160, 192], [192, 64, 64],
[128, 64, 160], [64, 32, 192], [192, 192, 192], [0, 64, 160],
[192, 160, 192], [192, 192, 0], [128, 64, 96], [192, 32, 64],
[192, 64, 128], [64, 192, 96], [64, 160, 64], [64, 64, 0]]
num_classes = 172
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='EncoderDecoder',
pretrained=
'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b5_20220624-658746d9.pth',
backbone=dict(
type='MixVisionTransformer',
in_channels=3,
embed_dims=64,
num_stages=4,
num_layers=[3, 6, 40, 3],
num_heads=[1, 2, 5, 8],
patch_sizes=[7, 3, 3, 3],
sr_ratios=[8, 4, 2, 1],
out_indices=(0, 1, 2, 3),
mlp_ratio=4,
qkv_bias=True,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.1),
decode_head=dict(
type='SegformerHead',
in_channels=[64, 128, 320, 512],
in_index=[0, 1, 2, 3],
channels=768,
dropout_ratio=0.1,
num_classes=num_classes,
norm_cfg=norm_cfg,
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset settings
dataset_type = 'SegDataset'
data_root = 'data/coco_stuff164k/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (640, 640)
train_pipeline = [
dict(type='MMResize', img_scale=(2048, 640), ratio_range=(0.5, 2.0)),
dict(type='SegRandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='MMRandomFlip', flip_ratio=0.5),
dict(type='MMPhotoMetricDistortion'),
dict(type='MMNormalize', **img_norm_cfg),
dict(type='MMPad', size=crop_size, pad_val=dict(img=0, masks=0, seg=255)),
dict(type='DefaultFormatBundle'),
dict(
type='Collect',
keys=['img', 'gt_semantic_seg'],
meta_keys=('filename', 'ori_filename', 'ori_shape', 'img_shape',
'pad_shape', 'scale_factor', 'flip', 'flip_direction',
'img_norm_cfg')),
]
test_pipeline = [
dict(
type='MMMultiScaleFlipAug',
img_scale=(2048, 640),
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='MMResize', keep_ratio=True),
dict(type='MMRandomFlip'),
dict(type='MMNormalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(
type='Collect',
keys=['img'],
meta_keys=('filename', 'ori_filename', 'ori_shape',
'img_shape', 'pad_shape', 'scale_factor', 'flip',
'flip_direction', 'img_norm_cfg')),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ignore_index=255,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'train2017/',
label_root=data_root + 'annotations/train2017/',
split=data_root + 'train.txt',
classes=CLASSES,
),
pipeline=train_pipeline),
val=dict(
imgs_per_gpu=1,
workers_per_gpu=1,
ignore_index=255,
type=dataset_type,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'val2017/',
label_root=data_root + 'annotations/val2017/',
split=data_root + 'val.txt',
classes=CLASSES,
),
pipeline=test_pipeline),
test=dict(
imgs_per_gpu=1,
workers_per_gpu=1,
type=dataset_type,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'val2017/',
label_root=data_root + 'annotations/val2017/',
split=data_root + 'val.txt',
classes=CLASSES,
),
pipeline=test_pipeline))
optimizer = dict(
type='AdamW',
lr=6e-05,
betas=(0.9, 0.999),
weight_decay=0.01,
paramwise_options=dict(
custom_keys=dict(
pos_block=dict(decay_mult=0.0),
norm=dict(decay_mult=0.0),
head=dict(lr_mult=10.0))))
optimizer_config = dict()
lr_config = dict(
policy='poly',
warmup='linear',
warmup_iters=800,
warmup_ratio=1e-06,
power=1.0,
min_lr=0.0,
by_epoch=False)
# runtime settings
total_epochs = 20
checkpoint_config = dict(interval=1)
eval_config = dict(interval=1, gpu_collect=False)
eval_pipelines = [
dict(
mode='test',
evaluators=[
dict(
type='SegmentationEvaluator',
classes=CLASSES,
metric_names=['mIoU'])
],
)
]
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
dist_params = dict(backend='nccl')
cudnn_benchmark = False
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
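Both configs share the same optimization recipe: AdamW at lr=6e-05 with a 10x head learning rate, a poly decay and an 800-iteration linear warmup. A rough, non-authoritative sketch of the schedule that `lr_config` above describes (`max_iters` is a stand-in, since the real iteration count depends on dataset size and `total_epochs`):

```python
# Rough illustration of lr_config above (policy='poly', warmup='linear',
# warmup_iters=800, warmup_ratio=1e-06, power=1.0, min_lr=0.0). It only
# approximates the usual mmcv semantics and is not the LrUpdater hook itself.
def approx_lr(iter_idx, max_iters, base_lr=6e-05, power=1.0, min_lr=0.0,
              warmup_iters=800, warmup_ratio=1e-06):
    if iter_idx < warmup_iters:
        # ramp linearly from base_lr * warmup_ratio towards base_lr
        frac = iter_idx / warmup_iters
        return base_lr * (warmup_ratio + (1 - warmup_ratio) * frac)
    # poly decay from base_lr down to min_lr over the remaining iterations
    coeff = (1 - iter_idx / max_iters) ** power
    return (base_lr - min_lr) * coeff + min_lr

print(approx_lr(0, 100000))       # 6e-11     (warmup start)
print(approx_lr(800, 100000))     # ~5.95e-05 (warmup done, poly decay)
print(approx_lr(100000, 100000))  # 0.0
```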

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af6fa61274e497ecc170de5adc4b8e7ac89eba2bc22a6aa119b08ec7adbe9459
size 146140

Binary file not shown (97 KiB before, 130 B after).

docs/source/data_hub.md
@@ -62,6 +62,7 @@ EasyCV summarized various datasets in different fields. At present, we support p
| **VOC2012**<br/>[url](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html) | Common | From 2009 to 2011, the amount of data is still growing on the basis of the previous year's dataset, and from 2011 to 2012, the amount of data used for classification, detection and person layout tasks does not change. Mainly for segmentation and action recognition, improve the corresponding data subsets and label information. | [VOCtrainval_11-May-2012.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar) (2G) | |
| **Pascal Context**<br/>[url](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/) | Common | This dataset is a set of additional annotations for PASCAL VOC 2010. It goes beyond the original PASCAL semantic segmentation task by providing annotations for the whole scene. The [statistics section](https://www.cs.stanford.edu/~roozbeh/pascal-context/#statistics) has a full list of 400+ labels. | [voc2010/VOCtrainval_03-May-2010.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar) (1.3GB)<br/>[VOC2010test.tar](http://host.robots.ox.ac.uk:8080/eval/downloads/VOC2010test.tar) <br/>[trainval_merged.json](https://codalabuser.blob.core.windows.net/public/trainval_merged.json) (590MB) | |
| **COCO-Stuff 10K**<br/>[url](https://github.com/nightrome/cocostuff10k) | Common | COCO-Stuff augments the popular COCO dataset with pixel-level stuff annotations. These annotations can be used for scene understanding tasks like semantic segmentation, object detection and image captioning. | [cocostuff-10k-v1.1.zip](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.1.zip) (2.0 GB) | |
| **COCO-Stuff 164K**<br/>[url](https://github.com/nightrome/cocostuff) | Common | COCO-Stuff augments the popular COCO dataset with pixel-level stuff annotations. These annotations can be used for scene understanding tasks like semantic segmentation, object detection and image captioning. | [train2017.zip](http://images.cocodataset.org/zips/train2017.zip) (18.0 GB), <br/>[val2017.zip](http://images.cocodataset.org/zips/val2017.zip) (1.0 GB), <br/>[stuffthingmaps_trainval2017.zip](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) (659M)| |
| **Cityscapes**<br/>[url](https://www.cityscapes-dataset.com/) | Street scenes | The Cityscapes contains a diverse set of stereo video sequences recorded in street scenes from 50 different cities, with high quality pixel-level annotations of 5000 frames in addition to a larger set of 20000 weakly annotated frames. The dataset is thus an order of magnitude larger than similar previous attempts. | [leftImg8bit_trainvaltest.zip](https://www.cityscapes-dataset.com/file-handling/?packageID=3) (11GB) | |
| **ADE20K**<br/>[url](http://groups.csail.mit.edu/vision/datasets/ADE20K/) | Scene | The ADE20K dataset is released by MIT and can be used for scene perception, parsing, segmentation, multi-object recognition and semantic understanding. The annotated images cover the scene categories from the SUN and Places database. It contains 25,574 training images and 2,000 validation images. | [ADEChallengeData2016.zip](http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip) (923MB)<br/>[release_test.zip](http://data.csail.mit.edu/places/ADEchallenge/release_test.zip) (202MB) | |

docs/source/model_zoo_seg.md
@@ -11,6 +11,7 @@ Pretrained on **Pascal VOC 2012 + Aug**.
## UperNet
Pretrained on **Pascal VOC 2012 + Aug**.
| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) | mIoU | Download |
| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| upernet_r50 | [upernet_r50_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/upernet/upernet_r50_512x512_8xb4_60e_voc12aug.py) | 23M/66M | 282.9ms | 76.59 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/20220706_114712.log.json) |
@@ -26,3 +27,13 @@ Pretrained on **Pascal VOC 2012 + Aug**.
| Algorithm | Config | PQ | box MAP | Mask mAP | Download |
| ---------- | ---------- | ------------------------------------------------------------ | ------------------------ |----------|---------------------------------------------------------------------------- |
| mask2former_r50 | [mask2former_r50_8xb2_e50_panopatic](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_panopatic.py) | 51.64 | 44.81 | 41.88 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/20220629_170721.log.json) |
## SegFormer
Semantic segmentation models trained on **COCO-Stuff 164k**.
| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) |mIoU | Download |
| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| SegFormer_B0 | [segformer_b0_coco.py](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/segformer/segformer_b0_coco.py) | 3.3M/3.8M | 47.2ms | 34.79 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b0/SegmentationEvaluator_mIoU_best.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b0/20220810_102335.log.json) |
| SegFormer_B5 | [segformer_b5_coco.py](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/segformer/segformer_b5_coco.py) | 81M/85M | 99.2ms | 46.75 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b5/SegmentationEvaluator_mIoU_best.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b5/20220812_144336.log.json) |

docs/source/prepared_data.md
@@ -6,6 +6,8 @@ EasyCV provides various datasets for multi tasks. Please refer to the following
- [Object Detection](#Object-Detection)
- [Self-Supervised Learning](#Self-Supervised-Learning)
- [Pose (Keypoint)](#Pose)
- [Image Segmentation](#Image-Segmentation)
## Image Classification
@@ -354,3 +356,31 @@ data/coco2017
├── 000000000285.jpg
├── ...
```
## Image Segmentation
- [COCO Stuff 164k](#COCO-Stuff-164k)
### COCO Stuff 164k
For the COCO-Stuff 164k dataset, please run the following commands to download and convert the dataset.
```shell
# download
mkdir coco_stuff164k && cd coco_stuff164k
wget http://images.cocodataset.org/zips/train2017.zip
wget http://images.cocodataset.org/zips/val2017.zip
wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip
# unzip
unzip train2017.zip -d images/
unzip val2017.zip -d images/
unzip stuffthingmaps_trainval2017.zip -d annotations/
# --nproc means 8 processes for conversion; it can be omitted.
python tools/prepare_data/coco_stuff164k.py /path/to/coco_stuff164k --nproc 8
```
By convention, mask labels in `/path/to/coco_stuff164k/annotations/*2017/*_labelTrainIds.png` are used for COCO Stuff 164k training and testing.
More details about this dataset can be found [here](https://github.com/nightrome/cocostuff#downloads).
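Once the conversion has produced the `*_labelTrainIds.png` masks, the data is consumed through the `SegSourceRaw` data source, exactly as in the SegFormer configs added in this PR. A minimal sketch of the train entry (the `CLASSES` list is truncated to a placeholder here; the real configs carry the full COCO-Stuff class list and the train pipeline):

```python
# Minimal sketch of an EasyCV dataset entry over the converted COCO-Stuff 164k
# data, mirroring segformer_b0_coco.py / segformer_b5_coco.py in this PR.
CLASSES = ['person', 'bicycle', 'car']  # placeholder; use the full list from the configs
data_root = './data/coco_stuff164k/'
train = dict(
    type='SegDataset',
    ignore_index=255,
    data_source=dict(
        type='SegSourceRaw',
        img_suffix='.jpg',
        label_suffix='_labelTrainIds.png',
        img_root=data_root + 'train2017/',
        label_root=data_root + 'annotations/train2017/',
        split=data_root + 'train.txt',
        classes=CLASSES,
    ),
    pipeline=[],  # see train_pipeline in the SegFormer configs
)
```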

easycv/models/backbones/__init__.py
@@ -9,6 +9,7 @@ from .hrnet import HRNet
from .inceptionv3 import Inception3
from .lighthrnet import LiteHRNet
from .mae_vit_transformer import *
from .mit import MixVisionTransformer
from .mnasnet import MNASNet
from .mobilenetv2 import MobileNetV2
from .pytorch_image_models_wrapper import *

easycv/models/backbones/mit.py
@@ -0,0 +1,443 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Adapt from: https://github.com/open-mmlab/mmsegmentation/blob/2d66179630035097dcae08ee958f60d4b5a7fcae/mmseg/models/backbones/mit.py
import math
import warnings
import torch
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import Conv2d, build_activation_layer, build_norm_layer
from mmcv.cnn.bricks.drop import build_dropout
from mmcv.cnn.bricks.transformer import MultiheadAttention
from mmcv.cnn.utils.weight_init import (constant_init, normal_init,
trunc_normal_init)
from mmcv.runner import BaseModule, ModuleList, Sequential
from easycv.models.registry import BACKBONES
from easycv.models.segmentation.utils import (PatchEmbed, nchw_to_nlc,
nlc_to_nchw)
class MixFFN(BaseModule):
"""An implementation of MixFFN of Segformer.
The differences between MixFFN & FFN:
1. Use 1X1 Conv to replace Linear layer.
2. Introduce 3X3 Conv to encode positional information.
Args:
embed_dims (int): The feature dimension. Same as
`MultiheadAttention`. Defaults: 256.
feedforward_channels (int): The hidden dimension of FFNs.
Defaults: 1024.
act_cfg (dict, optional): The activation config for FFNs.
Default: dict(type='ReLU')
ffn_drop (float, optional): Probability of an element to be
zeroed in FFN. Default 0.0.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
"""
def __init__(self,
embed_dims,
feedforward_channels,
act_cfg=dict(type='GELU'),
ffn_drop=0.,
dropout_layer=None,
init_cfg=None):
super(MixFFN, self).__init__(init_cfg)
self.embed_dims = embed_dims
self.feedforward_channels = feedforward_channels
self.act_cfg = act_cfg
self.activate = build_activation_layer(act_cfg)
in_channels = embed_dims
fc1 = Conv2d(
in_channels=in_channels,
out_channels=feedforward_channels,
kernel_size=1,
stride=1,
bias=True)
# 3x3 depth wise conv to provide positional encode information
pe_conv = Conv2d(
in_channels=feedforward_channels,
out_channels=feedforward_channels,
kernel_size=3,
stride=1,
padding=(3 - 1) // 2,
bias=True,
groups=feedforward_channels)
fc2 = Conv2d(
in_channels=feedforward_channels,
out_channels=in_channels,
kernel_size=1,
stride=1,
bias=True)
drop = nn.Dropout(ffn_drop)
layers = [fc1, pe_conv, self.activate, drop, fc2, drop]
self.layers = Sequential(*layers)
self.dropout_layer = build_dropout(
dropout_layer) if dropout_layer else torch.nn.Identity()
def forward(self, x, hw_shape, identity=None):
out = nlc_to_nchw(x, hw_shape)
out = self.layers(out)
out = nchw_to_nlc(out)
if identity is None:
identity = x
return identity + self.dropout_layer(out)
class EfficientMultiheadAttention(MultiheadAttention):
"""An implementation of Efficient Multi-head Attention of Segformer.
This module is modified from MultiheadAttention which is a module from
mmcv.cnn.bricks.transformer.
Args:
embed_dims (int): The embedding dimension.
num_heads (int): Parallel attention heads.
attn_drop (float): A Dropout layer on attn_output_weights.
Default: 0.0.
proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
Default: 0.0.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut. Default: None.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
batch_first (bool): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim). Default: False.
qkv_bias (bool): enable bias for qkv if True. Default True.
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN').
sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head
Attention of Segformer. Default: 1.
"""
def __init__(self,
embed_dims,
num_heads,
attn_drop=0.,
proj_drop=0.,
dropout_layer=None,
init_cfg=None,
batch_first=True,
qkv_bias=False,
norm_cfg=dict(type='LN'),
sr_ratio=1):
super().__init__(
embed_dims,
num_heads,
attn_drop,
proj_drop,
dropout_layer=dropout_layer,
init_cfg=init_cfg,
batch_first=batch_first,
bias=qkv_bias)
self.sr_ratio = sr_ratio
if sr_ratio > 1:
self.sr = Conv2d(
in_channels=embed_dims,
out_channels=embed_dims,
kernel_size=sr_ratio,
stride=sr_ratio)
# The ret[0] of build_norm_layer is norm name.
self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
def forward(self, x, hw_shape, identity=None):
x_q = x
if self.sr_ratio > 1:
x_kv = nlc_to_nchw(x, hw_shape)
x_kv = self.sr(x_kv)
x_kv = nchw_to_nlc(x_kv)
x_kv = self.norm(x_kv)
else:
x_kv = x
if identity is None:
identity = x_q
# Because the dataflow('key', 'query', 'value') of
# ``torch.nn.MultiheadAttention`` is (num_query, batch,
# embed_dims), We should adjust the shape of dataflow from
# batch_first (batch, num_query, embed_dims) to num_query_first
# (num_query ,batch, embed_dims), and recover ``attn_output``
# from num_query_first to batch_first.
if self.batch_first:
x_q = x_q.transpose(0, 1)
x_kv = x_kv.transpose(0, 1)
out = self.attn(query=x_q, key=x_kv, value=x_kv)[0]
if self.batch_first:
out = out.transpose(0, 1)
return identity + self.dropout_layer(self.proj_drop(out))
def legacy_forward(self, x, hw_shape, identity=None):
"""multi head attention forward in mmcv version < 1.3.17."""
x_q = x
if self.sr_ratio > 1:
x_kv = nlc_to_nchw(x, hw_shape)
x_kv = self.sr(x_kv)
x_kv = nchw_to_nlc(x_kv)
x_kv = self.norm(x_kv)
else:
x_kv = x
if identity is None:
identity = x_q
# `need_weights=True` will let nn.MultiHeadAttention
# `return attn_output, attn_output_weights.sum(dim=1) / num_heads`
# The `attn_output_weights.sum(dim=1)` may cause cuda error. So, we set
# `need_weights=False` to ignore `attn_output_weights.sum(dim=1)`.
# This issue - `https://github.com/pytorch/pytorch/issues/37583` report
# the error that large scale tensor sum operation may cause cuda error.
out = self.attn(query=x_q, key=x_kv, value=x_kv, need_weights=False)[0]
return identity + self.dropout_layer(self.proj_drop(out))
class TransformerEncoderLayer(BaseModule):
"""Implements one encoder layer in Segformer.
Args:
embed_dims (int): The feature dimension.
num_heads (int): Parallel attention heads.
feedforward_channels (int): The hidden dimension for FFNs.
drop_rate (float): Probability of an element to be zeroed
after the feed forward layer. Default 0.0.
attn_drop_rate (float): The drop out rate for attention layer.
Default 0.0.
drop_path_rate (float): stochastic depth rate. Default 0.0.
qkv_bias (bool): enable bias for qkv if True.
Default: True.
act_cfg (dict): The activation config for FFNs.
Default: dict(type='GELU').
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN').
batch_first (bool): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim). Default: False.
init_cfg (dict, optional): Initialization config dict.
Default:None.
sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head
Attention of Segformer. Default: 1.
with_cp (bool): Use checkpoint or not. Using checkpoint will save
some memory while slowing down the training speed. Default: False.
"""
def __init__(self,
embed_dims,
num_heads,
feedforward_channels,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
qkv_bias=True,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN'),
batch_first=True,
sr_ratio=1,
with_cp=False):
super(TransformerEncoderLayer, self).__init__()
# The ret[0] of build_norm_layer is norm name.
self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
self.attn = EfficientMultiheadAttention(
embed_dims=embed_dims,
num_heads=num_heads,
attn_drop=attn_drop_rate,
proj_drop=drop_rate,
dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
batch_first=batch_first,
qkv_bias=qkv_bias,
norm_cfg=norm_cfg,
sr_ratio=sr_ratio)
# The ret[0] of build_norm_layer is norm name.
self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
self.ffn = MixFFN(
embed_dims=embed_dims,
feedforward_channels=feedforward_channels,
ffn_drop=drop_rate,
dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
act_cfg=act_cfg)
self.with_cp = with_cp
def forward(self, x, hw_shape):
def _inner_forward(x):
x = self.attn(self.norm1(x), hw_shape, identity=x)
x = self.ffn(self.norm2(x), hw_shape, identity=x)
return x
if self.with_cp and x.requires_grad:
x = cp.checkpoint(_inner_forward, x)
else:
x = _inner_forward(x)
return x
@BACKBONES.register_module()
class MixVisionTransformer(BaseModule):
"""The backbone of Segformer.
This backbone is the implementation of `SegFormer: Simple and
Efficient Design for Semantic Segmentation with
Transformers <https://arxiv.org/abs/2105.15203>`_.
Args:
in_channels (int): Number of input channels. Default: 3.
embed_dims (int): Embedding dimension. Default: 768.
num_stages (int): The number of stages. Default: 4.
num_layers (Sequence[int]): The layer number of each transformer encode
layer. Default: [3, 4, 6, 3].
num_heads (Sequence[int]): The attention heads of each transformer
encode layer. Default: [1, 2, 4, 8].
patch_sizes (Sequence[int]): The patch_size of each overlapped patch
embedding. Default: [7, 3, 3, 3].
strides (Sequence[int]): The stride of each overlapped patch embedding.
Default: [4, 2, 2, 2].
sr_ratios (Sequence[int]): The spatial reduction rate of each
transformer encode layer. Default: [8, 4, 2, 1].
out_indices (Sequence[int] | int): Output from which stages.
Default: (0, 1, 2, 3).
mlp_ratio (int): ratio of mlp hidden dim to embedding dim.
Default: 4.
qkv_bias (bool): Enable bias for qkv if True. Default: True.
drop_rate (float): Probability of an element to be zeroed.
Default 0.0
attn_drop_rate (float): The drop out rate for attention layer.
Default 0.0
drop_path_rate (float): stochastic depth rate. Default 0.0
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN')
act_cfg (dict): The activation config for FFNs.
Default: dict(type='GELU').
pretrained (str, optional): model pretrained path. Default: None.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
with_cp (bool): Use checkpoint or not. Using checkpoint will save
some memory while slowing down the training speed. Default: False.
"""
def __init__(self,
in_channels=3,
embed_dims=64,
num_stages=4,
num_layers=[3, 4, 6, 3],
num_heads=[1, 2, 4, 8],
patch_sizes=[7, 3, 3, 3],
strides=[4, 2, 2, 2],
sr_ratios=[8, 4, 2, 1],
out_indices=(0, 1, 2, 3),
mlp_ratio=4,
qkv_bias=True,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN', eps=1e-6),
pretrained=None,
init_cfg=None,
with_cp=False):
super(MixVisionTransformer, self).__init__(init_cfg=init_cfg)
assert not (init_cfg and pretrained), \
'init_cfg and pretrained cannot be set at the same time'
if isinstance(pretrained, str):
warnings.warn('DeprecationWarning: pretrained is deprecated, '
'please use "init_cfg" instead')
self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
elif pretrained is not None:
raise TypeError('pretrained must be a str or None')
self.embed_dims = embed_dims
self.num_stages = num_stages
self.num_layers = num_layers
self.num_heads = num_heads
self.patch_sizes = patch_sizes
self.strides = strides
self.sr_ratios = sr_ratios
self.with_cp = with_cp
assert num_stages == len(num_layers) == len(num_heads) \
== len(patch_sizes) == len(strides) == len(sr_ratios)
self.out_indices = out_indices
assert max(out_indices) < self.num_stages
# transformer encoder
dpr = [
x.item()
for x in torch.linspace(0, drop_path_rate, sum(num_layers))
] # stochastic num_layer decay rule
cur = 0
self.layers = ModuleList()
for i, num_layer in enumerate(num_layers):
embed_dims_i = embed_dims * num_heads[i]
patch_embed = PatchEmbed(
in_channels=in_channels,
embed_dims=embed_dims_i,
kernel_size=patch_sizes[i],
stride=strides[i],
padding=patch_sizes[i] // 2,
norm_cfg=norm_cfg)
layer = ModuleList([
TransformerEncoderLayer(
embed_dims=embed_dims_i,
num_heads=num_heads[i],
feedforward_channels=mlp_ratio * embed_dims_i,
drop_rate=drop_rate,
attn_drop_rate=attn_drop_rate,
drop_path_rate=dpr[cur + idx],
qkv_bias=qkv_bias,
act_cfg=act_cfg,
norm_cfg=norm_cfg,
with_cp=with_cp,
sr_ratio=sr_ratios[i]) for idx in range(num_layer)
])
in_channels = embed_dims_i
# The ret[0] of build_norm_layer is norm name.
norm = build_norm_layer(norm_cfg, embed_dims_i)[1]
self.layers.append(ModuleList([patch_embed, layer, norm]))
cur += num_layer
def init_weights(self):
if self.init_cfg is None:
for m in self.modules():
if isinstance(m, nn.Linear):
trunc_normal_init(m, std=.02, bias=0.)
elif isinstance(m, nn.LayerNorm):
constant_init(m, val=1.0, bias=0.)
elif isinstance(m, nn.Conv2d):
fan_out = m.kernel_size[0] * m.kernel_size[
1] * m.out_channels
fan_out //= m.groups
normal_init(
m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0)
else:
super(MixVisionTransformer, self).init_weights()
def forward(self, x):
outs = []
for i, layer in enumerate(self.layers):
x, hw_shape = layer[0](x)
for block in layer[1]:
x = block(x, hw_shape)
x = layer[2](x)
x = nlc_to_nchw(x, hw_shape)
if i in self.out_indices:
outs.append(x)
return outs
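A quick shape check can help when wiring this backbone into a new config: with the B0 settings (`embed_dims=32`, `num_heads=[1, 2, 5, 8]`, default `strides=[4, 2, 2, 2]`) the four stages come out at 1/4, 1/8, 1/16 and 1/32 of the input resolution. A sketch, assuming the import path added to `easycv/models/backbones/__init__.py` in this PR:

```python
# Sketch: instantiate the backbone with the B0 settings from segformer_b0_coco.py
# and inspect the four output feature maps.
import torch
from easycv.models.backbones import MixVisionTransformer  # export added in this PR

backbone = MixVisionTransformer(
    in_channels=3,
    embed_dims=32,
    num_stages=4,
    num_layers=[2, 2, 2, 2],
    num_heads=[1, 2, 5, 8],
    patch_sizes=[7, 3, 3, 3],
    sr_ratios=[8, 4, 2, 1],
    out_indices=(0, 1, 2, 3))
backbone.eval()

with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 512, 512))
for f in feats:
    print(tuple(f.shape))
# Expected strides 4/8/16/32:
# (1, 32, 128, 128), (1, 64, 64, 64), (1, 160, 32, 32), (1, 256, 16, 16)
```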

easycv/models/segmentation/heads/__init__.py
@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .fcn_head import FCNHead
from .mask2former_head import Mask2FormerHead
from .segformer_head import SegformerHead
from .uper_head import UPerHead
__all__ = ['FCNHead', 'UPerHead', 'Mask2FormerHead']
__all__ = ['FCNHead', 'UPerHead', 'Mask2FormerHead', 'SegformerHead']

easycv/models/segmentation/heads/segformer_head.py
@@ -0,0 +1,146 @@
# Modified from
# https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/decode_heads/segformer_head.py
#
# This work is licensed under the NVIDIA Source Code License.
#
# Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
# Augmentation (ADA)
#
# 1. Definitions
# "Licensor" means any person or entity that distributes its Work.
# "Software" means the original work of authorship made available under
# this License.
# "Work" means the Software and any additions to or derivative works of
# the Software that are made available under this License.
# The terms "reproduce," "reproduction," "derivative works," and
# "distribution" have the meaning as provided under U.S. copyright law;
# provided, however, that for the purposes of this License, derivative
# works shall not include works that remain separable from, or merely
# link (or bind by name) to the interfaces of, the Work.
# Works, including the Software, are "made available" under this License
# by including in or with the Work either (a) a copyright notice
# referencing the applicability of this License to the Work, or (b) a
# copy of this License.
# 2. License Grants
# 2.1 Copyright Grant. Subject to the terms and conditions of this
# License, each Licensor grants to you a perpetual, worldwide,
# non-exclusive, royalty-free, copyright license to reproduce,
# prepare derivative works of, publicly display, publicly perform,
# sublicense and distribute its Work and any resulting derivative
# works in any form.
# 3. Limitations
# 3.1 Redistribution. You may reproduce or distribute the Work only
# if (a) you do so under this License, (b) you include a complete
# copy of this License with your distribution, and (c) you retain
# without modification any copyright, patent, trademark, or
# attribution notices that are present in the Work.
# 3.2 Derivative Works. You may specify that additional or different
# terms apply to the use, reproduction, and distribution of your
# derivative works of the Work ("Your Terms") only if (a) Your Terms
# provide that the use limitation in Section 3.3 applies to your
# derivative works, and (b) you identify the specific derivative
# works that are subject to Your Terms. Notwithstanding Your Terms,
# this License (including the redistribution requirements in Section
# 3.1) will continue to apply to the Work itself.
# 3.3 Use Limitation. The Work and any derivative works thereof only
# may be used or intended for use non-commercially. Notwithstanding
# the foregoing, NVIDIA and its affiliates may use the Work and any
# derivative works commercially. As used herein, "non-commercially"
# means for research or evaluation purposes only.
# 3.4 Patent Claims. If you bring or threaten to bring a patent claim
# against any Licensor (including any claim, cross-claim or
# counterclaim in a lawsuit) to enforce any patents that you allege
# are infringed by any Work, then your rights under this License from
# such Licensor (including the grant in Section 2.1) will terminate
# immediately.
# 3.5 Trademarks. This License does not grant any rights to use any
# Licensors or its affiliates names, logos, or trademarks, except
# as necessary to reproduce the notices described in this License.
# 3.6 Termination. If you violate any term of this License, then your
# rights under this License (including the grant in Section 2.1) will
# terminate immediately.
# 4. Disclaimer of Warranty.
# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
# THIS LICENSE.
# 5. Limitation of Liability.
# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGES.
# Copyright (c) OpenMMLab. All rights reserved.
# Adapt from https://github.com/open-mmlab/mmsegmentation/blob/2d66179630035097dcae08ee958f60d4b5a7fcae/mmseg/models/decode_heads/segformer_head.py
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from easycv.models.builder import HEADS
from easycv.models.segmentation.heads.base import BaseDecodeHead
from easycv.models.utils.ops import resize_tensor as resize
@HEADS.register_module()
class SegformerHead(BaseDecodeHead):
"""The all mlp Head of segformer.
This head is the implementation of
`Segformer <https://arxiv.org/abs/2105.15203>` _.
Args:
interpolate_mode: The interpolate mode of MLP head upsample operation.
Default: 'bilinear'.
"""
def __init__(self, interpolate_mode='bilinear', **kwargs):
super().__init__(input_transform='multiple_select', **kwargs)
self.interpolate_mode = interpolate_mode
num_inputs = len(self.in_channels)
assert num_inputs == len(self.in_index)
self.convs = nn.ModuleList()
for i in range(num_inputs):
self.convs.append(
ConvModule(
in_channels=self.in_channels[i],
out_channels=self.channels,
kernel_size=1,
stride=1,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg))
self.fusion_conv = ConvModule(
in_channels=self.channels * num_inputs,
out_channels=self.channels,
kernel_size=1,
norm_cfg=self.norm_cfg)
def forward(self, inputs):
# Receive 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32
inputs = self._transform_inputs(inputs)
outs = []
for idx in range(len(inputs)):
x = inputs[idx]
conv = self.convs[idx]
outs.append(
resize(
input=conv(x),
size=inputs[0].shape[2:],
mode=self.interpolate_mode,
align_corners=self.align_corners))
out = self.fusion_conv(torch.cat(outs, dim=1))
out = self.cls_seg(out)
return out
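The forward above boils down to: 1x1-project each backbone stage to `channels`, upsample everything to the 1/4-scale map, concatenate, fuse with another 1x1 conv, then predict `num_classes` logits. A standalone illustration of that data flow with plain torch ops and the B0 shapes from the config (a sketch only, not the head itself):

```python
# Standalone illustration of the SegformerHead data flow (B0 shapes), using
# plain torch ops instead of the ConvModule / cls_seg layers of the real head.
import torch
import torch.nn as nn
import torch.nn.functional as F

channels, num_classes = 256, 172
in_channels = [32, 64, 160, 256]                      # B0 backbone stage widths
feats = [torch.randn(1, c, 128 // 2**i, 128 // 2**i)  # 1/4, 1/8, 1/16, 1/32 maps
         for i, c in enumerate(in_channels)]

projs = [nn.Conv2d(c, channels, 1) for c in in_channels]
outs = [F.interpolate(p(f), size=feats[0].shape[2:], mode='bilinear',
                      align_corners=False) for p, f in zip(projs, feats)]
fused = nn.Conv2d(channels * len(outs), channels, 1)(torch.cat(outs, dim=1))
logits = nn.Conv2d(channels, num_classes, 1)(fused)   # what cls_seg produces
print(logits.shape)                                   # torch.Size([1, 172, 128, 128])
```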

easycv/models/segmentation/utils/__init__.py
@@ -0,0 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .embed import PatchEmbed
from .shape_convert import (nchw2nlc2nchw, nchw_to_nlc, nlc2nchw2nlc,
nlc_to_nchw)

easycv/models/segmentation/utils/embed.py
@@ -0,0 +1,332 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Adapt from: https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/utils/embed.py
import math
from typing import Sequence
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_conv_layer, build_norm_layer
from mmcv.runner.base_module import BaseModule
from mmcv.utils import to_2tuple
class AdaptivePadding(nn.Module):
"""Applies padding to input (if needed) so that input can get fully covered
by the filter you specified. It supports two modes, "same" and "corner". The
"same" mode is the same as the "SAME" padding mode in TensorFlow and pads zeros around
the input. The "corner" mode pads zeros to the bottom right.
Args:
kernel_size (int | tuple): Size of the kernel:
stride (int | tuple): Stride of the filter. Default: 1:
dilation (int | tuple): Spacing between kernel elements.
Default: 1.
padding (str): Support "same" and "corner", "corner" mode
would pad zero to bottom right, and "same" mode would
pad zero around input. Default: "corner".
Example:
>>> kernel_size = 16
>>> stride = 16
>>> dilation = 1
>>> input = torch.rand(1, 1, 15, 17)
>>> adap_pad = AdaptivePadding(
>>> kernel_size=kernel_size,
>>> stride=stride,
>>> dilation=dilation,
>>> padding="corner")
>>> out = adap_pad(input)
>>> assert (out.shape[2], out.shape[3]) == (16, 32)
>>> input = torch.rand(1, 1, 16, 17)
>>> out = adap_pad(input)
>>> assert (out.shape[2], out.shape[3]) == (16, 32)
"""
def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
super(AdaptivePadding, self).__init__()
assert padding in ('same', 'corner')
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
self.padding = padding
self.kernel_size = kernel_size
self.stride = stride
self.dilation = dilation
def get_pad_shape(self, input_shape):
input_h, input_w = input_shape
kernel_h, kernel_w = self.kernel_size
stride_h, stride_w = self.stride
output_h = math.ceil(input_h / stride_h)
output_w = math.ceil(input_w / stride_w)
pad_h = max((output_h - 1) * stride_h +
(kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
pad_w = max((output_w - 1) * stride_w +
(kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
return pad_h, pad_w
def forward(self, x):
pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
if pad_h > 0 or pad_w > 0:
if self.padding == 'corner':
x = F.pad(x, [0, pad_w, 0, pad_h])
elif self.padding == 'same':
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
pad_h - pad_h // 2
])
return x
class PatchEmbed(BaseModule):
"""Image to Patch Embedding.
We use a conv layer to implement PatchEmbed.
Args:
in_channels (int): The num of input channels. Default: 3
embed_dims (int): The dimensions of embedding. Default: 768
conv_type (str): The config dict for embedding
conv layer type selection. Default: "Conv2d".
kernel_size (int): The kernel_size of embedding conv. Default: 16.
stride (int, optional): The slide stride of embedding conv.
Default: None (Would be set as `kernel_size`).
padding (int | tuple | string ): The padding length of
embedding conv. When it is a string, it means the mode
of adaptive padding, support "same" and "corner" now.
Default: "corner".
dilation (int): The dilation rate of embedding conv. Default: 1.
bias (bool): Bias of embed conv. Default: True.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: None.
input_size (int | tuple | None): The size of input, which will be
used to calculate the out size. Only work when `dynamic_size`
is False. Default: None.
init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
Default: None.
"""
def __init__(self,
in_channels=3,
embed_dims=768,
conv_type='Conv2d',
kernel_size=16,
stride=None,
padding='corner',
dilation=1,
bias=True,
norm_cfg=None,
input_size=None,
init_cfg=None):
super(PatchEmbed, self).__init__(init_cfg=init_cfg)
self.embed_dims = embed_dims
if stride is None:
stride = kernel_size
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
if isinstance(padding, str):
self.adap_padding = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
# disable the padding of conv
padding = 0
else:
self.adap_padding = None
padding = to_2tuple(padding)
self.projection = build_conv_layer(
dict(type=conv_type),
in_channels=in_channels,
out_channels=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
if norm_cfg is not None:
self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
else:
self.norm = None
if input_size:
input_size = to_2tuple(input_size)
# `init_out_size` would be used outside to
# calculate the num_patches
# when `use_abs_pos_embed` outside
self.init_input_size = input_size
if self.adap_padding:
pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
input_h, input_w = input_size
input_h = input_h + pad_h
input_w = input_w + pad_w
input_size = (input_h, input_w)
# https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
(kernel_size[0] - 1) - 1) // stride[0] + 1
w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
(kernel_size[1] - 1) - 1) // stride[1] + 1
self.init_out_size = (h_out, w_out)
else:
self.init_input_size = None
self.init_out_size = None
def forward(self, x):
"""
Args:
x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
Returns:
tuple: Contains merged results and its spatial shape.
- x (Tensor): Has shape (B, out_h * out_w, embed_dims)
- out_size (tuple[int]): Spatial shape of x, arrange as
(out_h, out_w).
"""
if self.adap_padding:
x = self.adap_padding(x)
x = self.projection(x)
out_size = (x.shape[2], x.shape[3])
x = x.flatten(2).transpose(1, 2)
if self.norm is not None:
x = self.norm(x)
return x, out_size
class PatchMerging(BaseModule):
"""Merge patch feature map.
This layer groups feature map by kernel_size, and applies norm and linear
layers to the grouped feature map. Our implementation uses `nn.Unfold` to
merge patches, which is about 25% faster than the original implementation.
However, pretrained models need to be modified for compatibility.
Args:
in_channels (int): The num of input channels.
out_channels (int): The num of output channels.
kernel_size (int | tuple, optional): the kernel size in the unfold
layer. Defaults to 2.
stride (int | tuple, optional): the stride of the sliding blocks in the
unfold layer. Default: None. (Would be set as `kernel_size`)
padding (int | tuple | string ): The padding length of
embedding conv. When it is a string, it means the mode
of adaptive padding, support "same" and "corner" now.
Default: "corner".
dilation (int | tuple, optional): dilation parameter in the unfold
layer. Default: 1.
bias (bool, optional): Whether to add bias in linear layer or not.
Defaults: False.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: dict(type='LN').
init_cfg (dict, optional): The extra config for initialization.
Default: None.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=2,
stride=None,
padding='corner',
dilation=1,
bias=False,
norm_cfg=dict(type='LN'),
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.in_channels = in_channels
self.out_channels = out_channels
if stride:
stride = stride
else:
stride = kernel_size
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
if isinstance(padding, str):
self.adap_padding = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
# disable the padding of unfold
padding = 0
else:
self.adap_padding = None
padding = to_2tuple(padding)
self.sampler = nn.Unfold(
kernel_size=kernel_size,
dilation=dilation,
padding=padding,
stride=stride)
sample_dim = kernel_size[0] * kernel_size[1] * in_channels
if norm_cfg is not None:
self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
else:
self.norm = None
self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
def forward(self, x, input_size):
"""
Args:
x (Tensor): Has shape (B, H*W, C_in).
input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
Default: None.
Returns:
tuple: Contains merged results and its spatial shape.
- x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
- out_size (tuple[int]): Spatial shape of x, arrange as
(Merged_H, Merged_W).
"""
B, L, C = x.shape
assert isinstance(input_size, Sequence), f'Expect ' \
f'input_size is ' \
f'`Sequence` ' \
f'but get {input_size}'
H, W = input_size
assert L == H * W, 'input feature has wrong size'
x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W
# Use nn.Unfold to merge patch. About 25% faster than original method,
# but need to modify pretrained model for compatibility
if self.adap_padding:
x = self.adap_padding(x)
H, W = x.shape[-2:]
x = self.sampler(x)
# if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
(self.sampler.kernel_size[0] - 1) -
1) // self.sampler.stride[0] + 1
out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
(self.sampler.kernel_size[1] - 1) -
1) // self.sampler.stride[1] + 1
output_size = (out_h, out_w)
x = x.transpose(1, 2) # B, H/2*W/2, 4*C
x = self.norm(x) if self.norm else x
x = self.reduction(x)
return x, output_size
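For intuition on `PatchEmbed`: the first MiT stage uses an overlapping patch embedding (kernel 7, stride 4, padding 7//2), so a 512x512 image becomes a 128x128 grid of tokens. A small sketch, assuming the import path exported by the new `easycv/models/segmentation/utils` package:

```python
# Sketch: the overlapping patch embedding used by the first MiT stage.
import torch
from easycv.models.segmentation.utils import PatchEmbed  # export added in this PR

patch_embed = PatchEmbed(
    in_channels=3, embed_dims=32, kernel_size=7, stride=4, padding=7 // 2,
    norm_cfg=dict(type='LN'))
tokens, hw_shape = patch_embed(torch.randn(1, 3, 512, 512))
print(tokens.shape, hw_shape)  # torch.Size([1, 16384, 32]) (128, 128)
```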

easycv/models/segmentation/utils/shape_convert.py
@@ -0,0 +1,110 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Adapt from: https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/utils/shape_convert.py
def nlc_to_nchw(x, hw_shape):
"""Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
Args:
x (Tensor): The input tensor of shape [N, L, C] before conversion.
hw_shape (Sequence[int]): The height and width of output feature map.
Returns:
Tensor: The output tensor of shape [N, C, H, W] after conversion.
"""
H, W = hw_shape
assert len(x.shape) == 3
B, L, C = x.shape
assert L == H * W, 'The seq_len doesn\'t match H, W'
return x.transpose(1, 2).reshape(B, C, H, W)
def nchw_to_nlc(x):
"""Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
Args:
x (Tensor): The input tensor of shape [N, C, H, W] before conversion.
Returns:
Tensor: The output tensor of shape [N, L, C] after conversion.
"""
assert len(x.shape) == 4
return x.flatten(2).transpose(1, 2).contiguous()
def nchw2nlc2nchw(module, x, contiguous=False, **kwargs):
"""Flatten [N, C, H, W] shape tensor `x` to [N, L, C] shape tensor. Use the
reshaped tensor as the input of `module`, and then convert the output of
`module`, whose shape is [N, L, C], to [N, C, H, W].
Args:
module (Callable): A callable object the takes a tensor
with shape [N, L, C] as input.
x (Tensor): The input tensor of shape [N, C, H, W].
contiguous (Bool): Whether to make the tensor contiguous
after each shape transform.
Returns:
Tensor: The output tensor of shape [N, C, H, W].
Example:
>>> import torch
>>> import torch.nn as nn
>>> norm = nn.LayerNorm(4)
>>> feature_map = torch.rand(4, 4, 5, 5)
>>> output = nchw2nlc2nchw(norm, feature_map)
"""
B, C, H, W = x.shape
if not contiguous:
x = x.flatten(2).transpose(1, 2)
x = module(x, **kwargs)
x = x.transpose(1, 2).reshape(B, C, H, W)
else:
x = x.flatten(2).transpose(1, 2).contiguous()
x = module(x, **kwargs)
x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
return x
def nlc2nchw2nlc(module, x, hw_shape, contiguous=False, **kwargs):
"""Convert [N, L, C] shape tensor `x` to [N, C, H, W] shape tensor. Use the
reshaped tensor as the input of `module`, and convert the output of
    `module`, whose shape is [N, C, H, W], back to [N, L, C].
Args:
        module (Callable): A callable object that takes a tensor
with shape [N, C, H, W] as input.
x (Tensor): The input tensor of shape [N, L, C].
        hw_shape (Sequence[int]): The height and width of the
            feature map with shape [N, C, H, W].
        contiguous (bool): Whether to make the tensor contiguous
after each shape transform.
Returns:
Tensor: The output tensor of shape [N, L, C].
Example:
>>> import torch
>>> import torch.nn as nn
>>> conv = nn.Conv2d(16, 16, 3, 1, 1)
>>> feature_map = torch.rand(4, 25, 16)
>>> output = nlc2nchw2nlc(conv, feature_map, (5, 5))
"""
H, W = hw_shape
assert len(x.shape) == 3
B, L, C = x.shape
assert L == H * W, 'The seq_len doesn\'t match H, W'
if not contiguous:
x = x.transpose(1, 2).reshape(B, C, H, W)
x = module(x, **kwargs)
x = x.flatten(2).transpose(1, 2)
else:
x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
x = module(x, **kwargs)
x = x.flatten(2).transpose(1, 2).contiguous()
return x
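
A quick sanity sketch (not part of the diff) with the two helpers above in scope: nchw_to_nlc and nlc_to_nchw are inverses of each other as long as the original (H, W) is passed back in.

import torch

x = torch.rand(2, 8, 5, 7)                       # N, C, H, W
seq = nchw_to_nlc(x)                             # shape (2, 35, 8)
assert torch.equal(nlc_to_nchw(seq, (5, 7)), x)  # round trip recovers x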


@ -7,3 +7,4 @@ from .feature_extractor import (TorchFaceAttrExtractor,
TorchFeatureExtractor)
from .pose_predictor import (TorchPoseTopDownPredictor,
TorchPoseTopDownPredictorWithDetector)
from .segmentation import Mask2formerPredictor, SegFormerPredictor


@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import cv2
import mmcv
import numpy as np
import torch
from matplotlib.collections import PatchCollection
@ -13,6 +14,7 @@ from easycv.models import build_model
from easycv.predictors.builder import PREDICTORS
from easycv.predictors.interface import PredictorInterface
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.config_tools import mmcv_config_fromfile
from easycv.utils.registry import build_from_cfg
@ -108,6 +110,147 @@ class Mask2formerPredictor(PredictorInterface):
return instance_result
@PREDICTORS.register_module()
class SegFormerPredictor(PredictorInterface):
def __init__(self, model_path, model_config):
"""init model
Args:
            model_path (str): Path of the model checkpoint.
            model_config (str): Path of the config file used to build the model.
"""
self.model_path = model_path
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.model = None
with io.open(self.model_path, 'rb') as infile:
checkpoint = torch.load(infile, map_location='cpu')
self.cfg = mmcv_config_fromfile(model_config)
self.CLASSES = self.cfg.CLASSES
self.PALETTE = self.cfg.PALETTE
# build model
self.model = build_model(self.cfg.model)
self.ckpt = load_checkpoint(
self.model, self.model_path, map_location=self.device)
self.model.to(self.device)
self.model.eval()
# build pipeline
test_pipeline = self.cfg.test_pipeline
pipeline = [build_from_cfg(p, PIPELINES) for p in test_pipeline]
self.pipeline = Compose(pipeline)
def predict(self, input_data_list):
"""
        Predict semantic segmentation results for a list of input samples.
        Args:
            input_data_list: a list of numpy arrays (in RGB order); each array
                is a sample to be predicted.
"""
output_list = []
for idx, img in enumerate(input_data_list):
            if not isinstance(img, np.ndarray):
img = np.asarray(img)
ori_img_shape = img.shape[:2]
data_dict = {'img': img}
data_dict['ori_shape'] = ori_img_shape
data_dict = self.pipeline(data_dict)
img = data_dict['img']
img = torch.unsqueeze(img[0], 0).to(self.device)
data_dict.pop('img')
with torch.no_grad():
out = self.model([img],
mode='test',
img_metas=[[data_dict['img_metas'][0]._data]])
output_list.append(out)
return output_list
def show_result(self,
img,
result,
palette=None,
win_name='',
show=False,
wait_time=0,
out_file=None,
opacity=0.5):
"""Draw `result` over `img`.
Args:
img (str or Tensor): The image to be displayed.
result (Tensor): The semantic segmentation results to draw over
`img`.
palette (list[list[int]]] | np.ndarray | None): The palette of
segmentation map. If None is given, random palette will be
generated. Default: None
win_name (str): The window name.
wait_time (int): Value of waitKey param.
Default: 0.
show (bool): Whether to show the image.
Default: False.
out_file (str or None): The filename to write the image.
Default: None.
opacity(float): Opacity of painted segmentation map.
Default 0.5.
Must be in (0, 1] range.
Returns:
img (Tensor): Only if not `show` or `out_file`
"""
img = mmcv.imread(img)
img = img.copy()
seg = result[0]
if palette is None:
if self.PALETTE is None:
# Get random state before set seed,
# and restore random state later.
# It will prevent loss of randomness, as the palette
# may be different in each iteration if not specified.
# See: https://github.com/open-mmlab/mmdetection/issues/5844
state = np.random.get_state()
np.random.seed(42)
# random palette
palette = np.random.randint(
0, 255, size=(len(self.CLASSES), 3))
np.random.set_state(state)
else:
palette = self.PALETTE
palette = np.array(palette)
assert palette.shape[0] == len(self.CLASSES)
assert palette.shape[1] == 3
assert len(palette.shape) == 2
assert 0 < opacity <= 1.0
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
for label, color in enumerate(palette):
color_seg[seg == label, :] = color
# convert to BGR
color_seg = color_seg[..., ::-1]
img = img * (1 - opacity) + color_seg * opacity
img = img.astype(np.uint8)
# if out_file specified, do not show image in window
if out_file is not None:
show = False
if show:
mmcv.imshow(img, win_name, wait_time)
if out_file is not None:
mmcv.imwrite(img, out_file)
if not (show or out_file):
return img
def _get_bias_color(base, max_dist=30):
"""Get different colors for each masks.

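A minimal usage sketch for the SegFormerPredictor added above; the checkpoint and image paths are hypothetical placeholders, and the config path is the one used by the unit test below. The 'seg_pred' key and the show_result call follow the behaviour exercised in that test.

import numpy as np
from PIL import Image
from easycv.predictors.segmentation import SegFormerPredictor

predictor = SegFormerPredictor(
    model_path='path/to/SegmentationEvaluator_mIoU_best.pth',  # placeholder
    model_config='configs/segmentation/segformer/segformer_b0_coco.py')

img_path = 'path/to/demo.jpg'                      # placeholder RGB image
img = np.asarray(Image.open(img_path))
result = predictor.predict([img])[0]               # dict containing 'seg_pred'
predictor.show_result(img_path, result['seg_pred'],
                      out_file='result.jpg', opacity=0.5)
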
result.jpg 100644

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fca74c76fea1589f9479b3261fd281bdb5673041f4ab7a70dbe14663873a19d3
size 189945


@ -0,0 +1,57 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
"""
isort:skip_file
"""
import os
import numpy as np
from PIL import Image
from easycv.predictors.segmentation import SegFormerPredictor
from tests.ut_config import (DET_DATA_SMALL_COCO_LOCAL,
                             MODEL_CONFIG_SEGFORMER,
                             PRETRAINED_MODEL_SEGFORMER)
class SegmentorTest(unittest.TestCase):
def setUp(self):
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
def test_segformer_detector(self):
segmentation_model_path = PRETRAINED_MODEL_SEGFORMER
segmentation_model_config = MODEL_CONFIG_SEGFORMER
img = os.path.join(DET_DATA_SMALL_COCO_LOCAL,
'val2017/000000289059.jpg')
        if not os.path.exists(img):
img = './data/test/segmentation/coco_stuff_164k/val2017/000000289059.jpg'
input_data_list = [np.asarray(Image.open(img))]
predictor = SegFormerPredictor(
model_path=segmentation_model_path,
model_config=segmentation_model_config)
output = predictor.predict(input_data_list)[0]
self.assertIn('seg_pred', output)
self.assertListEqual(
list(input_data_list[0].shape)[:2],
list(output['seg_pred'][0].shape))
self.assertListEqual(output['seg_pred'][0][1, :10].tolist(),
[161 for i in range(10)])
self.assertListEqual(output['seg_pred'][0][-1, -10:].tolist(),
[133 for i in range(10)])
if __name__ == '__main__':
unittest.main()


@ -111,3 +111,10 @@ PRETRAINED_MODEL_MAE = os.path.join(
PRETRAINED_MODEL_MASK2FORMER = os.path.join(
BASE_LOCAL_PATH,
'pretrained_models/segmentation/mask2former/mask2former_r50_instance.pth')
PRETRAINED_MODEL_SEGFORMER = os.path.join(
BASE_LOCAL_PATH,
'pretrained_models/segformer/segformer_b0/SegmentationEvaluator_mIoU_best.pth'
)
MODEL_CONFIG_SEGFORMER = (
'./configs/segmentation/segformer/segformer_b0_coco.py')


@ -0,0 +1,266 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Adapted from: https://github.com/open-mmlab/mmsegmentation/blob/master/tools/convert_datasets/coco_stuff164k.py
import argparse
import os.path as osp
import shutil
from functools import partial
from glob import glob
import mmcv
import numpy as np
from PIL import Image
COCO_LEN = 123287
clsID_to_trID = {
0: 0,
1: 1,
2: 2,
3: 3,
4: 4,
5: 5,
6: 6,
7: 7,
8: 8,
9: 9,
10: 10,
12: 11,
13: 12,
14: 13,
15: 14,
16: 15,
17: 16,
18: 17,
19: 18,
20: 19,
21: 20,
22: 21,
23: 22,
24: 23,
26: 24,
27: 25,
30: 26,
31: 27,
32: 28,
33: 29,
34: 30,
35: 31,
36: 32,
37: 33,
38: 34,
39: 35,
40: 36,
41: 37,
42: 38,
43: 39,
45: 40,
46: 41,
47: 42,
48: 43,
49: 44,
50: 45,
51: 46,
52: 47,
53: 48,
54: 49,
55: 50,
56: 51,
57: 52,
58: 53,
59: 54,
60: 55,
61: 56,
62: 57,
63: 58,
64: 59,
66: 60,
69: 61,
71: 62,
72: 63,
73: 64,
74: 65,
75: 66,
76: 67,
77: 68,
78: 69,
79: 70,
80: 71,
81: 72,
83: 73,
84: 74,
85: 75,
86: 76,
87: 77,
88: 78,
89: 79,
91: 80,
92: 81,
93: 82,
94: 83,
95: 84,
96: 85,
97: 86,
98: 87,
99: 88,
100: 89,
101: 90,
102: 91,
103: 92,
104: 93,
105: 94,
106: 95,
107: 96,
108: 97,
109: 98,
110: 99,
111: 100,
112: 101,
113: 102,
114: 103,
115: 104,
116: 105,
117: 106,
118: 107,
119: 108,
120: 109,
121: 110,
122: 111,
123: 112,
124: 113,
125: 114,
126: 115,
127: 116,
128: 117,
129: 118,
130: 119,
131: 120,
132: 121,
133: 122,
134: 123,
135: 124,
136: 125,
137: 126,
138: 127,
139: 128,
140: 129,
141: 130,
142: 131,
143: 132,
144: 133,
145: 134,
146: 135,
147: 136,
148: 137,
149: 138,
150: 139,
151: 140,
152: 141,
153: 142,
154: 143,
155: 144,
156: 145,
157: 146,
158: 147,
159: 148,
160: 149,
161: 150,
162: 151,
163: 152,
164: 153,
165: 154,
166: 155,
167: 156,
168: 157,
169: 158,
170: 159,
171: 160,
172: 161,
173: 162,
174: 163,
175: 164,
176: 165,
177: 166,
178: 167,
179: 168,
180: 169,
181: 170,
255: 255
}
def convert_to_trainID(maskpath, out_mask_dir, is_train):
mask = np.array(Image.open(maskpath))
mask_copy = mask.copy()
for clsID, trID in clsID_to_trID.items():
mask_copy[mask == clsID] = trID
seg_filename = osp.join(
out_mask_dir, 'train2017',
osp.basename(maskpath).split('.')[0] +
'_labelTrainIds.png') if is_train else osp.join(
out_mask_dir, 'val2017',
osp.basename(maskpath).split('.')[0] + '_labelTrainIds.png')
Image.fromarray(mask_copy).save(seg_filename, 'PNG')
def parse_args():
parser = argparse.ArgumentParser(
description=\
'Convert COCO Stuff 164k annotations to mmsegmentation format') # noqa
parser.add_argument('coco_path', help='coco stuff path')
parser.add_argument('-o', '--out_dir', help='output path')
parser.add_argument(
'--nproc', default=16, type=int, help='number of process')
args = parser.parse_args()
return args
def main():
args = parse_args()
coco_path = args.coco_path
nproc = args.nproc
out_dir = args.out_dir or coco_path
out_img_dir = osp.join(out_dir, 'images')
out_mask_dir = osp.join(out_dir, 'annotations')
mmcv.mkdir_or_exist(osp.join(out_mask_dir, 'train2017'))
mmcv.mkdir_or_exist(osp.join(out_mask_dir, 'val2017'))
if out_dir != coco_path:
shutil.copytree(osp.join(coco_path, 'images'), out_img_dir)
train_list = glob(osp.join(coco_path, 'annotations', 'train2017', '*.png'))
train_list = [file for file in train_list if '_labelTrainIds' not in file]
test_list = glob(osp.join(coco_path, 'annotations', 'val2017', '*.png'))
test_list = [file for file in test_list if '_labelTrainIds' not in file]
assert (len(train_list) +
len(test_list)) == COCO_LEN, 'Wrong length of list {} & {}'.format(
len(train_list), len(test_list))
if args.nproc > 1:
mmcv.track_parallel_progress(
partial(
convert_to_trainID, out_mask_dir=out_mask_dir, is_train=True),
train_list,
nproc=nproc)
mmcv.track_parallel_progress(
partial(
convert_to_trainID, out_mask_dir=out_mask_dir, is_train=False),
test_list,
nproc=nproc)
else:
mmcv.track_progress(
partial(
convert_to_trainID, out_mask_dir=out_mask_dir, is_train=True),
train_list)
mmcv.track_progress(
partial(
convert_to_trainID, out_mask_dir=out_mask_dir, is_train=False),
test_list)
print('Done!')
if __name__ == '__main__':
main()
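
The clsID_to_trID table above is simply an enumeration of the 171 valid COCO-Stuff ids (eleven unused "thing" ids are skipped), with 255 kept as the ignore label. A sketch that rebuilds it and can serve as a self-check, assuming the script's clsID_to_trID is in scope:

unused_ids = {11, 25, 28, 29, 44, 65, 67, 68, 70, 82, 90}
valid_ids = [i for i in range(182) if i not in unused_ids]
rebuilt = {cls_id: train_id for train_id, cls_id in enumerate(valid_ids)}
rebuilt[255] = 255              # ignore label is left untouched
assert rebuilt == clsID_to_trID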