add segformer algo

* Added the COCO_Stuff_164k dataset to data_hub.md and prepared_data.md, and added its preparation script under the prepare_data folder

* Added a predictor for segformer

* Improved the segformer information in docs/source/model_zoo_seg.md
pull/191/head
pengyu.lpy 2022-08-18 10:40:18 +08:00 committed by jiangnana.jnn
parent fbdde2a054
commit 0128e881d9
21 changed files with 2060 additions and 2 deletions

.gitattributes
@@ -1,5 +1,5 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.wav filter=lfs diff=lfs merge=lfs -text
*.JPEG filter=lfs diff=lfs merge=lfs -text

configs/segmentation/segformer/segformer_b0_coco.py
@@ -0,0 +1,249 @@
# SegFormer B0
CLASSES = [
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
'hair drier', 'toothbrush', 'banner', 'blanket', 'branch', 'bridge',
'building-other', 'bush', 'cabinet', 'cage', 'cardboard', 'carpet',
'ceiling-other', 'ceiling-tile', 'cloth', 'clothes', 'clouds', 'counter',
'cupboard', 'curtain', 'desk-stuff', 'dirt', 'door-stuff', 'fence',
'floor-marble', 'floor-other', 'floor-stone', 'floor-tile', 'floor-wood',
'flower', 'fog', 'food-other', 'fruit', 'furniture-other', 'grass',
'gravel', 'ground-other', 'hill', 'house', 'leaves', 'light', 'mat',
'metal', 'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net',
'paper', 'pavement', 'pillow', 'plant-other', 'plastic', 'platform',
'playingfield', 'railing', 'railroad', 'river', 'road', 'rock', 'roof',
'rug', 'salad', 'sand', 'sea', 'shelf', 'sky-other', 'skyscraper', 'snow',
'solid-other', 'stairs', 'stone', 'straw', 'structural-other', 'table',
'tent', 'textile-other', 'towel', 'tree', 'vegetable', 'wall-brick',
'wall-concrete', 'wall-other', 'wall-panel', 'wall-stone', 'wall-tile',
'wall-wood', 'water-other', 'waterdrops', 'window-blind', 'window-other',
'wood'
]
PALETTE = [[0, 192, 64], [0, 192, 64], [0, 64, 96],
[128, 192, 192], [0, 64, 64], [0, 192, 224], [0, 192, 192],
[128, 192, 64], [0, 192, 96], [128, 192, 64], [128, 32, 192],
[0, 0, 224], [0, 0, 64], [0, 160, 192], [128, 0, 96], [128, 0, 192],
[0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192],
[128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128],
[64, 128, 32], [0, 160, 0], [0, 0, 0], [192, 128, 160], [0, 32, 0],
[0, 128, 128], [64, 128, 160], [128, 160, 0], [0, 128, 0],
[192, 128, 32], [128, 96, 128], [0, 0, 128], [64, 0, 32],
[0, 224, 128], [128, 0, 0], [192, 0, 160], [0, 96, 128],
[128, 128, 128], [64, 0, 160], [128, 224, 128], [128, 128,
64], [192, 0, 32],
[128, 96, 0], [128, 0, 192], [0, 128, 32], [64, 224, 0], [0, 0, 64],
[128, 128, 160], [64, 96, 0], [0, 128, 192], [0, 128, 160],
[192, 224, 0], [0, 128, 64], [128, 128, 32], [192, 32, 128],
[0, 64, 192], [0, 0, 32], [64, 160, 128], [128, 64, 64],
[128, 0, 160], [64, 32, 128], [128, 192, 192], [0, 0, 160],
[192, 160, 128], [128, 192, 0], [128, 0, 96], [192, 32, 0],
[128, 64, 128], [64, 128, 96], [64, 160, 0], [0, 64, 0],
[192, 128, 224], [64, 32, 0], [0, 192, 128], [64, 128, 224],
[192, 160, 0], [0, 192, 0], [192, 128, 96], [192, 96, 128],
[0, 64, 128], [64, 0, 96], [64, 224, 128], [128, 64, 0],
[192, 0, 224], [64, 96, 128], [128, 192, 128], [64, 0, 224],
[192, 224, 128], [128, 192, 64], [192, 0, 96], [192, 96, 0],
[128, 64, 192], [0, 128, 96], [0, 224, 0], [64, 64, 64],
[128, 128, 224], [0, 96, 0], [64, 192, 192], [0, 128, 224],
[128, 224, 0], [64, 192, 64], [128, 128, 96], [128, 32, 128],
[64, 0, 192], [0, 64, 96], [0, 160, 128], [192, 0, 64],
[128, 64, 224], [0, 32, 128], [192, 128, 192], [0, 64, 224],
[128, 160, 128], [192, 128, 0], [128, 64, 32], [128, 32, 64],
[192, 0, 128], [64, 192, 32], [0, 160, 64], [64, 0, 0],
[192, 192, 160], [0, 32, 64], [64, 128, 128], [64, 192, 160],
[128, 160, 64], [64, 128, 0], [192, 192, 32], [128, 96, 192],
[64, 0, 128], [64, 64, 32], [0, 224, 192], [192, 0, 0],
[192, 64, 160], [0, 96, 192], [192, 128, 128], [64, 64, 160],
[128, 224, 192], [192, 128, 64], [192, 64, 32], [128, 96, 64],
[192, 0, 192], [0, 192, 32], [64, 224, 64], [64, 0, 64],
[128, 192, 160], [64, 96, 64], [64, 128, 192], [0, 192, 160],
[192, 224, 64], [64, 128, 64], [128, 192, 32], [192, 32, 192],
[64, 64, 192], [0, 64, 32], [64, 160, 192], [192, 64, 64],
[128, 64, 160], [64, 32, 192], [192, 192, 192], [0, 64, 160],
[192, 160, 192], [192, 192, 0], [128, 64, 96], [192, 32, 64],
[192, 64, 128], [64, 192, 96], [64, 160, 64], [64, 64, 0]]
num_classes = 172
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='EncoderDecoder',
pretrained=
'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth',
backbone=dict(
type='MixVisionTransformer',
in_channels=3,
embed_dims=32,
num_stages=4,
num_layers=[2, 2, 2, 2],
num_heads=[1, 2, 5, 8],
patch_sizes=[7, 3, 3, 3],
sr_ratios=[8, 4, 2, 1],
out_indices=(0, 1, 2, 3),
mlp_ratio=4,
qkv_bias=True,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.1),
decode_head=dict(
type='SegformerHead',
in_channels=[32, 64, 160, 256],
in_index=[0, 1, 2, 3],
channels=256,
dropout_ratio=0.1,
num_classes=num_classes,
norm_cfg=norm_cfg,
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset settings
dataset_type = 'SegDataset'
data_root = './data/coco_stuff164k/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
dict(type='MMResize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
dict(type='SegRandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='MMRandomFlip', flip_ratio=0.5),
dict(type='MMPhotoMetricDistortion'),
dict(type='MMNormalize', **img_norm_cfg),
dict(type='MMPad', size=crop_size, pad_val=dict(img=0, masks=0, seg=255)),
dict(type='DefaultFormatBundle'),
dict(
type='Collect',
keys=['img', 'gt_semantic_seg'],
meta_keys=('filename', 'ori_filename', 'ori_shape', 'img_shape',
'pad_shape', 'scale_factor', 'flip', 'flip_direction',
'img_norm_cfg')),
]
test_pipeline = [
dict(
type='MMMultiScaleFlipAug',
img_scale=(2048, 512),
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='MMResize', keep_ratio=True),
dict(type='MMRandomFlip'),
dict(type='MMNormalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(
type='Collect',
keys=['img'],
meta_keys=('filename', 'ori_filename', 'ori_shape',
'img_shape', 'pad_shape', 'scale_factor', 'flip',
'flip_direction', 'img_norm_cfg')),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ignore_index=255,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'train2017/',
label_root=data_root + 'annotations/train2017/',
split=data_root + 'train.txt',
classes=CLASSES,
),
pipeline=train_pipeline),
val=dict(
imgs_per_gpu=1,
workers_per_gpu=1,
ignore_index=255,
type=dataset_type,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'val2017/',
label_root=data_root + 'annotations/val2017/',
split=data_root + 'val.txt',
classes=CLASSES,
),
pipeline=test_pipeline),
test=dict(
imgs_per_gpu=1,
workers_per_gpu=1,
type=dataset_type,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'val2017/',
label_root=data_root + 'annotations/val2017/',
split=data_root + 'val.txt',
classes=CLASSES,
),
pipeline=test_pipeline))
optimizer = dict(
type='AdamW',
lr=6e-05,
betas=(0.9, 0.999),
weight_decay=0.01,
paramwise_options=dict(
custom_keys=dict(
pos_block=dict(decay_mult=0.0),
norm=dict(decay_mult=0.0),
head=dict(lr_mult=10.0))))
optimizer_config = dict()
lr_config = dict(
policy='poly',
warmup='linear',
warmup_iters=800,
warmup_ratio=1e-06,
power=1.0,
min_lr=0.0,
by_epoch=False)
# runtime settings
total_epochs = 20
checkpoint_config = dict(interval=1)
eval_config = dict(interval=1, gpu_collect=False)
eval_pipelines = [
dict(
mode='test',
evaluators=[
dict(
type='SegmentationEvaluator',
classes=CLASSES,
metric_names=['mIoU'])
],
)
]
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
dist_params = dict(backend='nccl')
cudnn_benchmark = False
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
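Note: the `decode_head.in_channels=[32, 64, 160, 256]` above is not arbitrary; each MiT stage runs at `embed_dims * num_heads[i]` channels (see the `MixVisionTransformer` backbone added later in this diff). A minimal sketch of that arithmetic with the B0 values from this config:

```python
# Sketch only: reproduces the stage-width rule used by MixVisionTransformer
# (embed_dims_i = embed_dims * num_heads[i]) with the B0 values above.
embed_dims = 32
num_heads = [1, 2, 5, 8]

stage_channels = [embed_dims * h for h in num_heads]
print(stage_channels)  # [32, 64, 160, 256] -> matches decode_head.in_channels
```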

configs/segmentation/segformer/segformer_b5_coco.py
@@ -0,0 +1,249 @@
# SegFormer B5
CLASSES = [
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
'hair drier', 'toothbrush', 'banner', 'blanket', 'branch', 'bridge',
'building-other', 'bush', 'cabinet', 'cage', 'cardboard', 'carpet',
'ceiling-other', 'ceiling-tile', 'cloth', 'clothes', 'clouds', 'counter',
'cupboard', 'curtain', 'desk-stuff', 'dirt', 'door-stuff', 'fence',
'floor-marble', 'floor-other', 'floor-stone', 'floor-tile', 'floor-wood',
'flower', 'fog', 'food-other', 'fruit', 'furniture-other', 'grass',
'gravel', 'ground-other', 'hill', 'house', 'leaves', 'light', 'mat',
'metal', 'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net',
'paper', 'pavement', 'pillow', 'plant-other', 'plastic', 'platform',
'playingfield', 'railing', 'railroad', 'river', 'road', 'rock', 'roof',
'rug', 'salad', 'sand', 'sea', 'shelf', 'sky-other', 'skyscraper', 'snow',
'solid-other', 'stairs', 'stone', 'straw', 'structural-other', 'table',
'tent', 'textile-other', 'towel', 'tree', 'vegetable', 'wall-brick',
'wall-concrete', 'wall-other', 'wall-panel', 'wall-stone', 'wall-tile',
'wall-wood', 'water-other', 'waterdrops', 'window-blind', 'window-other',
'wood'
]
PALETTE = [[0, 192, 64], [0, 192, 64], [0, 64, 96],
[128, 192, 192], [0, 64, 64], [0, 192, 224], [0, 192, 192],
[128, 192, 64], [0, 192, 96], [128, 192, 64], [128, 32, 192],
[0, 0, 224], [0, 0, 64], [0, 160, 192], [128, 0, 96], [128, 0, 192],
[0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192],
[128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128],
[64, 128, 32], [0, 160, 0], [0, 0, 0], [192, 128, 160], [0, 32, 0],
[0, 128, 128], [64, 128, 160], [128, 160, 0], [0, 128, 0],
[192, 128, 32], [128, 96, 128], [0, 0, 128], [64, 0, 32],
[0, 224, 128], [128, 0, 0], [192, 0, 160], [0, 96, 128],
[128, 128, 128], [64, 0, 160], [128, 224, 128], [128, 128,
64], [192, 0, 32],
[128, 96, 0], [128, 0, 192], [0, 128, 32], [64, 224, 0], [0, 0, 64],
[128, 128, 160], [64, 96, 0], [0, 128, 192], [0, 128, 160],
[192, 224, 0], [0, 128, 64], [128, 128, 32], [192, 32, 128],
[0, 64, 192], [0, 0, 32], [64, 160, 128], [128, 64, 64],
[128, 0, 160], [64, 32, 128], [128, 192, 192], [0, 0, 160],
[192, 160, 128], [128, 192, 0], [128, 0, 96], [192, 32, 0],
[128, 64, 128], [64, 128, 96], [64, 160, 0], [0, 64, 0],
[192, 128, 224], [64, 32, 0], [0, 192, 128], [64, 128, 224],
[192, 160, 0], [0, 192, 0], [192, 128, 96], [192, 96, 128],
[0, 64, 128], [64, 0, 96], [64, 224, 128], [128, 64, 0],
[192, 0, 224], [64, 96, 128], [128, 192, 128], [64, 0, 224],
[192, 224, 128], [128, 192, 64], [192, 0, 96], [192, 96, 0],
[128, 64, 192], [0, 128, 96], [0, 224, 0], [64, 64, 64],
[128, 128, 224], [0, 96, 0], [64, 192, 192], [0, 128, 224],
[128, 224, 0], [64, 192, 64], [128, 128, 96], [128, 32, 128],
[64, 0, 192], [0, 64, 96], [0, 160, 128], [192, 0, 64],
[128, 64, 224], [0, 32, 128], [192, 128, 192], [0, 64, 224],
[128, 160, 128], [192, 128, 0], [128, 64, 32], [128, 32, 64],
[192, 0, 128], [64, 192, 32], [0, 160, 64], [64, 0, 0],
[192, 192, 160], [0, 32, 64], [64, 128, 128], [64, 192, 160],
[128, 160, 64], [64, 128, 0], [192, 192, 32], [128, 96, 192],
[64, 0, 128], [64, 64, 32], [0, 224, 192], [192, 0, 0],
[192, 64, 160], [0, 96, 192], [192, 128, 128], [64, 64, 160],
[128, 224, 192], [192, 128, 64], [192, 64, 32], [128, 96, 64],
[192, 0, 192], [0, 192, 32], [64, 224, 64], [64, 0, 64],
[128, 192, 160], [64, 96, 64], [64, 128, 192], [0, 192, 160],
[192, 224, 64], [64, 128, 64], [128, 192, 32], [192, 32, 192],
[64, 64, 192], [0, 64, 32], [64, 160, 192], [192, 64, 64],
[128, 64, 160], [64, 32, 192], [192, 192, 192], [0, 64, 160],
[192, 160, 192], [192, 192, 0], [128, 64, 96], [192, 32, 64],
[192, 64, 128], [64, 192, 96], [64, 160, 64], [64, 64, 0]]
num_classes = 172
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='EncoderDecoder',
pretrained=
'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b5_20220624-658746d9.pth',
backbone=dict(
type='MixVisionTransformer',
in_channels=3,
embed_dims=64,
num_stages=4,
num_layers=[3, 6, 40, 3],
num_heads=[1, 2, 5, 8],
patch_sizes=[7, 3, 3, 3],
sr_ratios=[8, 4, 2, 1],
out_indices=(0, 1, 2, 3),
mlp_ratio=4,
qkv_bias=True,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.1),
decode_head=dict(
type='SegformerHead',
in_channels=[64, 128, 320, 512],
in_index=[0, 1, 2, 3],
channels=768,
dropout_ratio=0.1,
num_classes=num_classes,
norm_cfg=norm_cfg,
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset settings
dataset_type = 'SegDataset'
data_root = 'data/coco_stuff164k/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (640, 640)
train_pipeline = [
dict(type='MMResize', img_scale=(2048, 640), ratio_range=(0.5, 2.0)),
dict(type='SegRandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
dict(type='MMRandomFlip', flip_ratio=0.5),
dict(type='MMPhotoMetricDistortion'),
dict(type='MMNormalize', **img_norm_cfg),
dict(type='MMPad', size=crop_size, pad_val=dict(img=0, masks=0, seg=255)),
dict(type='DefaultFormatBundle'),
dict(
type='Collect',
keys=['img', 'gt_semantic_seg'],
meta_keys=('filename', 'ori_filename', 'ori_shape', 'img_shape',
'pad_shape', 'scale_factor', 'flip', 'flip_direction',
'img_norm_cfg')),
]
test_pipeline = [
dict(
type='MMMultiScaleFlipAug',
img_scale=(2048, 640),
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='MMResize', keep_ratio=True),
dict(type='MMRandomFlip'),
dict(type='MMNormalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(
type='Collect',
keys=['img'],
meta_keys=('filename', 'ori_filename', 'ori_shape',
'img_shape', 'pad_shape', 'scale_factor', 'flip',
'flip_direction', 'img_norm_cfg')),
])
]
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ignore_index=255,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'train2017/',
label_root=data_root + 'annotations/train2017/',
split=data_root + 'train.txt',
classes=CLASSES,
),
pipeline=train_pipeline),
val=dict(
imgs_per_gpu=1,
workers_per_gpu=1,
ignore_index=255,
type=dataset_type,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'val2017/',
label_root=data_root + 'annotations/val2017/',
split=data_root + 'val.txt',
classes=CLASSES,
),
pipeline=test_pipeline),
test=dict(
imgs_per_gpu=1,
workers_per_gpu=1,
type=dataset_type,
data_source=dict(
type='SegSourceRaw',
img_suffix='.jpg',
label_suffix='_labelTrainIds.png',
img_root=data_root + 'val2017/',
label_root=data_root + 'annotations/val2017/',
split=data_root + 'val.txt',
classes=CLASSES,
),
pipeline=test_pipeline))
optimizer = dict(
type='AdamW',
lr=6e-05,
betas=(0.9, 0.999),
weight_decay=0.01,
paramwise_options=dict(
custom_keys=dict(
pos_block=dict(decay_mult=0.0),
norm=dict(decay_mult=0.0),
head=dict(lr_mult=10.0))))
optimizer_config = dict()
lr_config = dict(
policy='poly',
warmup='linear',
warmup_iters=800,
warmup_ratio=1e-06,
power=1.0,
min_lr=0.0,
by_epoch=False)
# runtime settings
total_epochs = 20
checkpoint_config = dict(interval=1)
eval_config = dict(interval=1, gpu_collect=False)
eval_pipelines = [
dict(
mode='test',
evaluators=[
dict(
type='SegmentationEvaluator',
classes=CLASSES,
metric_names=['mIoU'])
],
)
]
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
dist_params = dict(backend='nccl')
cudnn_benchmark = False
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
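Both configs share the same optimization recipe: AdamW at lr=6e-05 with a 10x head learning rate, a poly decay and an 800-iteration linear warmup. A rough, non-authoritative sketch of the schedule that `lr_config` above describes (`max_iters` is a stand-in, since the real iteration count depends on dataset size and `total_epochs`):

```python
# Rough illustration of lr_config above (policy='poly', warmup='linear',
# warmup_iters=800, warmup_ratio=1e-06, power=1.0, min_lr=0.0). It only
# approximates the usual mmcv semantics and is not the LrUpdater hook itself.
def approx_lr(iter_idx, max_iters, base_lr=6e-05, power=1.0, min_lr=0.0,
              warmup_iters=800, warmup_ratio=1e-06):
    if iter_idx < warmup_iters:
        # ramp linearly from base_lr * warmup_ratio towards base_lr
        frac = iter_idx / warmup_iters
        return base_lr * (warmup_ratio + (1 - warmup_ratio) * frac)
    # poly decay from base_lr down to min_lr over the remaining iterations
    coeff = (1 - iter_idx / max_iters) ** power
    return (base_lr - min_lr) * coeff + min_lr

print(approx_lr(0, 100000))       # 6e-11     (warmup start)
print(approx_lr(800, 100000))     # ~5.95e-05 (warmup done, poly decay)
print(approx_lr(100000, 100000))  # 0.0
```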

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af6fa61274e497ecc170de5adc4b8e7ac89eba2bc22a6aa119b08ec7adbe9459
size 146140

Binary file not shown (97 KiB before, 130 B after).

docs/source/data_hub.md
@@ -62,6 +62,7 @@ EasyCV summarized various datasets in different fields. At present, we support p
| **VOC2012**<br/>[url](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html) | Common | From 2009 to 2011, the amount of data is still growing on the basis of the previous year's dataset, and from 2011 to 2012, the amount of data used for classification, detection and person layout tasks does not change. Mainly for segmentation and action recognition, improve the corresponding data subsets and label information. | [VOCtrainval_11-May-2012.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar) (2G) | |
| **Pascal Context**<br/>[url](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/) | Common | This dataset is a set of additional annotations for PASCAL VOC 2010. It goes beyond the original PASCAL semantic segmentation task by providing annotations for the whole scene. The [statistics section](https://www.cs.stanford.edu/~roozbeh/pascal-context/#statistics) has a full list of 400+ labels. | [voc2010/VOCtrainval_03-May-2010.tar](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar) (1.3GB)<br/>[VOC2010test.tar](http://host.robots.ox.ac.uk:8080/eval/downloads/VOC2010test.tar) <br/>[trainval_merged.json](https://codalabuser.blob.core.windows.net/public/trainval_merged.json) (590MB) | |
| **COCO-Stuff 10K**<br/>[url](https://github.com/nightrome/cocostuff10k) | Common | COCO-Stuff augments the popular COCO dataset with pixel-level stuff annotations. These annotations can be used for scene understanding tasks like semantic segmentation, object detection and image captioning. | [cocostuff-10k-v1.1.zip](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.1.zip) (2.0 GB) | |
| **COCO-Stuff 164K**<br/>[url](https://github.com/nightrome/cocostuff) | Common | COCO-Stuff augments the popular COCO dataset with pixel-level stuff annotations. These annotations can be used for scene understanding tasks like semantic segmentation, object detection and image captioning. | [train2017.zip](http://images.cocodataset.org/zips/train2017.zip) (18.0 GB), <br/>[val2017.zip](http://images.cocodataset.org/zips/val2017.zip) (1.0 GB), <br/>[stuffthingmaps_trainval2017.zip](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) (659M)| |
| **Cityscapes**<br/>[url](https://www.cityscapes-dataset.com/) | Street scenes | The Cityscapes contains a diverse set of stereo video sequences recorded in street scenes from 50 different cities, with high quality pixel-level annotations of 5000 frames in addition to a larger set of 20000 weakly annotated frames. The dataset is thus an order of magnitude larger than similar previous attempts. | [leftImg8bit_trainvaltest.zip](https://www.cityscapes-dataset.com/file-handling/?packageID=3) (11GB) | |
| **ADE20K**<br/>[url](http://groups.csail.mit.edu/vision/datasets/ADE20K/) | Scene | The ADE20K dataset is released by MIT and can be used for scene perception, parsing, segmentation, multi-object recognition and semantic understanding. The annotated images cover the scene categories from the SUN and Places database. It contains 25,574 training images and 2,000 validation images. | [ADEChallengeData2016.zip](http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip) (923MB)<br/>[release_test.zip](http://data.csail.mit.edu/places/ADEchallenge/release_test.zip) (202MB) | |

docs/source/model_zoo_seg.md
@@ -11,6 +11,7 @@ Pretrained on **Pascal VOC 2012 + Aug**.
## UperNet
Pretrained on **Pascal VOC 2012 + Aug**.
| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) | mIoU | Download |
| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| upernet_r50 | [upernet_r50_512x512_8xb4_60e_voc12aug](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/upernet/upernet_r50_512x512_8xb4_60e_voc12aug.py) | 23M/66M | 282.9ms | 76.59 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/epoch_60.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/upernet_r50/20220706_114712.log.json) |
@@ -26,3 +27,13 @@ Pretrained on **Pascal VOC 2012 + Aug**.
| Algorithm | Config | PQ | box MAP | Mask mAP | Download |
| ---------- | ---------- | ------------------------------------------------------------ | ------------------------ |----------|---------------------------------------------------------------------------- |
| mask2former_r50 | [mask2former_r50_8xb2_e50_panopatic](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/mask2former/mask2former_r50_8xb2_e50_panopatic.py) | 51.64 | 44.81 | 41.88 |[model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/epoch_50.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/modelzoo/segmentation/mask2former_r50_panoptic/20220629_170721.log.json) |
## SegFormer
Semantic segmentation models trained on **COCO-Stuff 164k**.
| Algorithm | Config | Params<br/>(backbone/total) | inference time(V100)<br/>(ms/img) |mIoU | Download |
| ---------- | ------------------------------------------------------------ | ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
| SegFormer_B0 | [segformer_b0_coco.py](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/segformer/segformer_b0_coco.py) | 3.3M/3.8M | 47.2ms | 34.79 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b0/SegmentationEvaluator_mIoU_best.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b0/20220810_102335.log.json) |
| SegFormer_B5 | [segformer_b5_coco.py](https://github.com/alibaba/EasyCV/tree/master/configs/segmentation/segformer/segformer_b5_coco.py) | 81M/85M | 99.2ms | 46.75 | [model](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b5/SegmentationEvaluator_mIoU_best.pth) - [log](http://pai-vision-data-hz.oss-cn-zhangjiakou.aliyuncs.com/EasyCV/damo/modelzoo/segmentation/segformer/segformer_b5/20220812_144336.log.json) |

docs/source/prepared_data.md
@@ -6,6 +6,8 @@ EasyCV provides various datasets for multi tasks. Please refer to the following
- [Object Detection](#Object-Detection)
- [Self-Supervised Learning](#Self-Supervised-Learning)
- [Pose (Keypoint)](#Pose)
- [Image Segmentation](#Image-Segmentation)
## Image Classification
@@ -354,3 +356,31 @@ data/coco2017
├── 000000000285.jpg
├── ...
```
## Image Segmentation
- [COCO Stuff 164k](#COCO-Stuff-164k)
### COCO Stuff 164k
For the COCO-Stuff 164k dataset, please run the following commands to download and convert the dataset.
```shell
# download
mkdir coco_stuff164k && cd coco_stuff164k
wget http://images.cocodataset.org/zips/train2017.zip
wget http://images.cocodataset.org/zips/val2017.zip
wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip
# unzip
unzip train2017.zip -d images/
unzip val2017.zip -d images/
unzip stuffthingmaps_trainval2017.zip -d annotations/
# --nproc means 8 processes for conversion; it can be omitted.
python tools/prepare_data/coco_stuff164k.py /path/to/coco_stuff164k --nproc 8
```
By convention, mask labels in `/path/to/coco_stuff164k/annotations/*2017/*_labelTrainIds.png` are used for COCO Stuff 164k training and testing.
More details about this dataset can be found [here](https://github.com/nightrome/cocostuff#downloads).
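Once the conversion has produced the `*_labelTrainIds.png` masks, the data is consumed through the `SegSourceRaw` data source, exactly as in the SegFormer configs added in this PR. A minimal sketch of the train entry (the `CLASSES` list is truncated to a placeholder here; the real configs carry the full COCO-Stuff class list and the train pipeline):

```python
# Minimal sketch of an EasyCV dataset entry over the converted COCO-Stuff 164k
# data, mirroring segformer_b0_coco.py / segformer_b5_coco.py in this PR.
CLASSES = ['person', 'bicycle', 'car']  # placeholder; use the full list from the configs
data_root = './data/coco_stuff164k/'
train = dict(
    type='SegDataset',
    ignore_index=255,
    data_source=dict(
        type='SegSourceRaw',
        img_suffix='.jpg',
        label_suffix='_labelTrainIds.png',
        img_root=data_root + 'train2017/',
        label_root=data_root + 'annotations/train2017/',
        split=data_root + 'train.txt',
        classes=CLASSES,
    ),
    pipeline=[],  # see train_pipeline in the SegFormer configs
)
```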

easycv/models/backbones/__init__.py
@@ -9,6 +9,7 @@ from .hrnet import HRNet
from .inceptionv3 import Inception3
from .lighthrnet import LiteHRNet
from .mae_vit_transformer import *
from .mit import MixVisionTransformer
from .mnasnet import MNASNet
from .mobilenetv2 import MobileNetV2
from .pytorch_image_models_wrapper import *

easycv/models/backbones/mit.py
@@ -0,0 +1,443 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Adapt from: https://github.com/open-mmlab/mmsegmentation/blob/2d66179630035097dcae08ee958f60d4b5a7fcae/mmseg/models/backbones/mit.py
import math
import warnings
import torch
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import Conv2d, build_activation_layer, build_norm_layer
from mmcv.cnn.bricks.drop import build_dropout
from mmcv.cnn.bricks.transformer import MultiheadAttention
from mmcv.cnn.utils.weight_init import (constant_init, normal_init,
trunc_normal_init)
from mmcv.runner import BaseModule, ModuleList, Sequential
from easycv.models.registry import BACKBONES
from easycv.models.segmentation.utils import (PatchEmbed, nchw_to_nlc,
nlc_to_nchw)
class MixFFN(BaseModule):
"""An implementation of MixFFN of Segformer.
The differences between MixFFN & FFN:
1. Use 1X1 Conv to replace Linear layer.
2. Introduce 3X3 Conv to encode positional information.
Args:
embed_dims (int): The feature dimension. Same as
`MultiheadAttention`. Defaults: 256.
feedforward_channels (int): The hidden dimension of FFNs.
Defaults: 1024.
act_cfg (dict, optional): The activation config for FFNs.
Default: dict(type='ReLU')
ffn_drop (float, optional): Probability of an element to be
zeroed in FFN. Default 0.0.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
"""
def __init__(self,
embed_dims,
feedforward_channels,
act_cfg=dict(type='GELU'),
ffn_drop=0.,
dropout_layer=None,
init_cfg=None):
super(MixFFN, self).__init__(init_cfg)
self.embed_dims = embed_dims
self.feedforward_channels = feedforward_channels
self.act_cfg = act_cfg
self.activate = build_activation_layer(act_cfg)
in_channels = embed_dims
fc1 = Conv2d(
in_channels=in_channels,
out_channels=feedforward_channels,
kernel_size=1,
stride=1,
bias=True)
# 3x3 depth wise conv to provide positional encode information
pe_conv = Conv2d(
in_channels=feedforward_channels,
out_channels=feedforward_channels,
kernel_size=3,
stride=1,
padding=(3 - 1) // 2,
bias=True,
groups=feedforward_channels)
fc2 = Conv2d(
in_channels=feedforward_channels,
out_channels=in_channels,
kernel_size=1,
stride=1,
bias=True)
drop = nn.Dropout(ffn_drop)
layers = [fc1, pe_conv, self.activate, drop, fc2, drop]
self.layers = Sequential(*layers)
self.dropout_layer = build_dropout(
dropout_layer) if dropout_layer else torch.nn.Identity()
def forward(self, x, hw_shape, identity=None):
out = nlc_to_nchw(x, hw_shape)
out = self.layers(out)
out = nchw_to_nlc(out)
if identity is None:
identity = x
return identity + self.dropout_layer(out)
class EfficientMultiheadAttention(MultiheadAttention):
"""An implementation of Efficient Multi-head Attention of Segformer.
This module is modified from MultiheadAttention which is a module from
mmcv.cnn.bricks.transformer.
Args:
embed_dims (int): The embedding dimension.
num_heads (int): Parallel attention heads.
attn_drop (float): A Dropout layer on attn_output_weights.
Default: 0.0.
proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
Default: 0.0.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut. Default: None.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
batch_first (bool): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim). Default: False.
qkv_bias (bool): enable bias for qkv if True. Default True.
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN').
sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head
Attention of Segformer. Default: 1.
"""
def __init__(self,
embed_dims,
num_heads,
attn_drop=0.,
proj_drop=0.,
dropout_layer=None,
init_cfg=None,
batch_first=True,
qkv_bias=False,
norm_cfg=dict(type='LN'),
sr_ratio=1):
super().__init__(
embed_dims,
num_heads,
attn_drop,
proj_drop,
dropout_layer=dropout_layer,
init_cfg=init_cfg,
batch_first=batch_first,
bias=qkv_bias)
self.sr_ratio = sr_ratio
if sr_ratio > 1:
self.sr = Conv2d(
in_channels=embed_dims,
out_channels=embed_dims,
kernel_size=sr_ratio,
stride=sr_ratio)
# The ret[0] of build_norm_layer is norm name.
self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
def forward(self, x, hw_shape, identity=None):
x_q = x
if self.sr_ratio > 1:
x_kv = nlc_to_nchw(x, hw_shape)
x_kv = self.sr(x_kv)
x_kv = nchw_to_nlc(x_kv)
x_kv = self.norm(x_kv)
else:
x_kv = x
if identity is None:
identity = x_q
# Because the dataflow('key', 'query', 'value') of
# ``torch.nn.MultiheadAttention`` is (num_query, batch,
# embed_dims), We should adjust the shape of dataflow from
# batch_first (batch, num_query, embed_dims) to num_query_first
# (num_query ,batch, embed_dims), and recover ``attn_output``
# from num_query_first to batch_first.
if self.batch_first:
x_q = x_q.transpose(0, 1)
x_kv = x_kv.transpose(0, 1)
out = self.attn(query=x_q, key=x_kv, value=x_kv)[0]
if self.batch_first:
out = out.transpose(0, 1)
return identity + self.dropout_layer(self.proj_drop(out))
def legacy_forward(self, x, hw_shape, identity=None):
"""multi head attention forward in mmcv version < 1.3.17."""
x_q = x
if self.sr_ratio > 1:
x_kv = nlc_to_nchw(x, hw_shape)
x_kv = self.sr(x_kv)
x_kv = nchw_to_nlc(x_kv)
x_kv = self.norm(x_kv)
else:
x_kv = x
if identity is None:
identity = x_q
# `need_weights=True` will let nn.MultiHeadAttention
# `return attn_output, attn_output_weights.sum(dim=1) / num_heads`
# The `attn_output_weights.sum(dim=1)` may cause cuda error. So, we set
# `need_weights=False` to ignore `attn_output_weights.sum(dim=1)`.
# This issue - `https://github.com/pytorch/pytorch/issues/37583` report
# the error that large scale tensor sum operation may cause cuda error.
out = self.attn(query=x_q, key=x_kv, value=x_kv, need_weights=False)[0]
return identity + self.dropout_layer(self.proj_drop(out))
class TransformerEncoderLayer(BaseModule):
"""Implements one encoder layer in Segformer.
Args:
embed_dims (int): The feature dimension.
num_heads (int): Parallel attention heads.
feedforward_channels (int): The hidden dimension for FFNs.
drop_rate (float): Probability of an element to be zeroed
after the feed forward layer. Default 0.0.
attn_drop_rate (float): The drop out rate for attention layer.
Default 0.0.
drop_path_rate (float): stochastic depth rate. Default 0.0.
qkv_bias (bool): enable bias for qkv if True.
Default: True.
act_cfg (dict): The activation config for FFNs.
Default: dict(type='GELU').
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN').
batch_first (bool): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim). Default: False.
init_cfg (dict, optional): Initialization config dict.
Default:None.
sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head
Attention of Segformer. Default: 1.
with_cp (bool): Use checkpoint or not. Using checkpoint will save
some memory while slowing down the training speed. Default: False.
"""
def __init__(self,
embed_dims,
num_heads,
feedforward_channels,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
qkv_bias=True,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN'),
batch_first=True,
sr_ratio=1,
with_cp=False):
super(TransformerEncoderLayer, self).__init__()
# The ret[0] of build_norm_layer is norm name.
self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
self.attn = EfficientMultiheadAttention(
embed_dims=embed_dims,
num_heads=num_heads,
attn_drop=attn_drop_rate,
proj_drop=drop_rate,
dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
batch_first=batch_first,
qkv_bias=qkv_bias,
norm_cfg=norm_cfg,
sr_ratio=sr_ratio)
# The ret[0] of build_norm_layer is norm name.
self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
self.ffn = MixFFN(
embed_dims=embed_dims,
feedforward_channels=feedforward_channels,
ffn_drop=drop_rate,
dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
act_cfg=act_cfg)
self.with_cp = with_cp
def forward(self, x, hw_shape):
def _inner_forward(x):
x = self.attn(self.norm1(x), hw_shape, identity=x)
x = self.ffn(self.norm2(x), hw_shape, identity=x)
return x
if self.with_cp and x.requires_grad:
x = cp.checkpoint(_inner_forward, x)
else:
x = _inner_forward(x)
return x
@BACKBONES.register_module()
class MixVisionTransformer(BaseModule):
"""The backbone of Segformer.
This backbone is the implementation of `SegFormer: Simple and
Efficient Design for Semantic Segmentation with
Transformers <https://arxiv.org/abs/2105.15203>`_.
Args:
in_channels (int): Number of input channels. Default: 3.
embed_dims (int): Embedding dimension. Default: 768.
num_stages (int): The number of stages. Default: 4.
num_layers (Sequence[int]): The layer number of each transformer encode
layer. Default: [3, 4, 6, 3].
num_heads (Sequence[int]): The attention heads of each transformer
encode layer. Default: [1, 2, 4, 8].
patch_sizes (Sequence[int]): The patch_size of each overlapped patch
embedding. Default: [7, 3, 3, 3].
strides (Sequence[int]): The stride of each overlapped patch embedding.
Default: [4, 2, 2, 2].
sr_ratios (Sequence[int]): The spatial reduction rate of each
transformer encode layer. Default: [8, 4, 2, 1].
out_indices (Sequence[int] | int): Output from which stages.
Default: (0, 1, 2, 3).
mlp_ratio (int): ratio of mlp hidden dim to embedding dim.
Default: 4.
qkv_bias (bool): Enable bias for qkv if True. Default: True.
drop_rate (float): Probability of an element to be zeroed.
Default 0.0
attn_drop_rate (float): The drop out rate for attention layer.
Default 0.0
drop_path_rate (float): stochastic depth rate. Default 0.0
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN')
act_cfg (dict): The activation config for FFNs.
Default: dict(type='GELU').
pretrained (str, optional): model pretrained path. Default: None.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
with_cp (bool): Use checkpoint or not. Using checkpoint will save
some memory while slowing down the training speed. Default: False.
"""
def __init__(self,
in_channels=3,
embed_dims=64,
num_stages=4,
num_layers=[3, 4, 6, 3],
num_heads=[1, 2, 4, 8],
patch_sizes=[7, 3, 3, 3],
strides=[4, 2, 2, 2],
sr_ratios=[8, 4, 2, 1],
out_indices=(0, 1, 2, 3),
mlp_ratio=4,
qkv_bias=True,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN', eps=1e-6),
pretrained=None,
init_cfg=None,
with_cp=False):
super(MixVisionTransformer, self).__init__(init_cfg=init_cfg)
assert not (init_cfg and pretrained), \
'init_cfg and pretrained cannot be set at the same time'
if isinstance(pretrained, str):
warnings.warn('DeprecationWarning: pretrained is deprecated, '
'please use "init_cfg" instead')
self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
elif pretrained is not None:
raise TypeError('pretrained must be a str or None')
self.embed_dims = embed_dims
self.num_stages = num_stages
self.num_layers = num_layers
self.num_heads = num_heads
self.patch_sizes = patch_sizes
self.strides = strides
self.sr_ratios = sr_ratios
self.with_cp = with_cp
assert num_stages == len(num_layers) == len(num_heads) \
== len(patch_sizes) == len(strides) == len(sr_ratios)
self.out_indices = out_indices
assert max(out_indices) < self.num_stages
# transformer encoder
dpr = [
x.item()
for x in torch.linspace(0, drop_path_rate, sum(num_layers))
] # stochastic num_layer decay rule
cur = 0
self.layers = ModuleList()
for i, num_layer in enumerate(num_layers):
embed_dims_i = embed_dims * num_heads[i]
patch_embed = PatchEmbed(
in_channels=in_channels,
embed_dims=embed_dims_i,
kernel_size=patch_sizes[i],
stride=strides[i],
padding=patch_sizes[i] // 2,
norm_cfg=norm_cfg)
layer = ModuleList([
TransformerEncoderLayer(
embed_dims=embed_dims_i,
num_heads=num_heads[i],
feedforward_channels=mlp_ratio * embed_dims_i,
drop_rate=drop_rate,
attn_drop_rate=attn_drop_rate,
drop_path_rate=dpr[cur + idx],
qkv_bias=qkv_bias,
act_cfg=act_cfg,
norm_cfg=norm_cfg,
with_cp=with_cp,
sr_ratio=sr_ratios[i]) for idx in range(num_layer)
])
in_channels = embed_dims_i
# The ret[0] of build_norm_layer is norm name.
norm = build_norm_layer(norm_cfg, embed_dims_i)[1]
self.layers.append(ModuleList([patch_embed, layer, norm]))
cur += num_layer
def init_weights(self):
if self.init_cfg is None:
for m in self.modules():
if isinstance(m, nn.Linear):
trunc_normal_init(m, std=.02, bias=0.)
elif isinstance(m, nn.LayerNorm):
constant_init(m, val=1.0, bias=0.)
elif isinstance(m, nn.Conv2d):
fan_out = m.kernel_size[0] * m.kernel_size[
1] * m.out_channels
fan_out //= m.groups
normal_init(
m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0)
else:
super(MixVisionTransformer, self).init_weights()
def forward(self, x):
outs = []
for i, layer in enumerate(self.layers):
x, hw_shape = layer[0](x)
for block in layer[1]:
x = block(x, hw_shape)
x = layer[2](x)
x = nlc_to_nchw(x, hw_shape)
if i in self.out_indices:
outs.append(x)
return outs
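A quick shape check can help when wiring this backbone into a new config: with the B0 settings (`embed_dims=32`, `num_heads=[1, 2, 5, 8]`, default `strides=[4, 2, 2, 2]`) the four stages come out at 1/4, 1/8, 1/16 and 1/32 of the input resolution. A sketch, assuming the import path added to `easycv/models/backbones/__init__.py` in this PR:

```python
# Sketch: instantiate the backbone with the B0 settings from segformer_b0_coco.py
# and inspect the four output feature maps.
import torch
from easycv.models.backbones import MixVisionTransformer  # export added in this PR

backbone = MixVisionTransformer(
    in_channels=3,
    embed_dims=32,
    num_stages=4,
    num_layers=[2, 2, 2, 2],
    num_heads=[1, 2, 5, 8],
    patch_sizes=[7, 3, 3, 3],
    sr_ratios=[8, 4, 2, 1],
    out_indices=(0, 1, 2, 3))
backbone.eval()

with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 512, 512))
for f in feats:
    print(tuple(f.shape))
# Expected strides 4/8/16/32:
# (1, 32, 128, 128), (1, 64, 64, 64), (1, 160, 32, 32), (1, 256, 16, 16)
```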

easycv/models/segmentation/heads/__init__.py
@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .fcn_head import FCNHead
from .mask2former_head import Mask2FormerHead
from .segformer_head import SegformerHead
from .uper_head import UPerHead
__all__ = ['FCNHead', 'UPerHead', 'Mask2FormerHead']
__all__ = ['FCNHead', 'UPerHead', 'Mask2FormerHead', 'SegformerHead']

easycv/models/segmentation/heads/segformer_head.py
@@ -0,0 +1,146 @@
# Modified from
# https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/decode_heads/segformer_head.py
#
# This work is licensed under the NVIDIA Source Code License.
#
# Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
# Augmentation (ADA)
#
# 1. Definitions
# "Licensor" means any person or entity that distributes its Work.
# "Software" means the original work of authorship made available under
# this License.
# "Work" means the Software and any additions to or derivative works of
# the Software that are made available under this License.
# The terms "reproduce," "reproduction," "derivative works," and
# "distribution" have the meaning as provided under U.S. copyright law;
# provided, however, that for the purposes of this License, derivative
# works shall not include works that remain separable from, or merely
# link (or bind by name) to the interfaces of, the Work.
# Works, including the Software, are "made available" under this License
# by including in or with the Work either (a) a copyright notice
# referencing the applicability of this License to the Work, or (b) a
# copy of this License.
# 2. License Grants
# 2.1 Copyright Grant. Subject to the terms and conditions of this
# License, each Licensor grants to you a perpetual, worldwide,
# non-exclusive, royalty-free, copyright license to reproduce,
# prepare derivative works of, publicly display, publicly perform,
# sublicense and distribute its Work and any resulting derivative
# works in any form.
# 3. Limitations
# 3.1 Redistribution. You may reproduce or distribute the Work only
# if (a) you do so under this License, (b) you include a complete
# copy of this License with your distribution, and (c) you retain
# without modification any copyright, patent, trademark, or
# attribution notices that are present in the Work.
# 3.2 Derivative Works. You may specify that additional or different
# terms apply to the use, reproduction, and distribution of your
# derivative works of the Work ("Your Terms") only if (a) Your Terms
# provide that the use limitation in Section 3.3 applies to your
# derivative works, and (b) you identify the specific derivative
# works that are subject to Your Terms. Notwithstanding Your Terms,
# this License (including the redistribution requirements in Section
# 3.1) will continue to apply to the Work itself.
# 3.3 Use Limitation. The Work and any derivative works thereof only
# may be used or intended for use non-commercially. Notwithstanding
# the foregoing, NVIDIA and its affiliates may use the Work and any
# derivative works commercially. As used herein, "non-commercially"
# means for research or evaluation purposes only.
# 3.4 Patent Claims. If you bring or threaten to bring a patent claim
# against any Licensor (including any claim, cross-claim or
# counterclaim in a lawsuit) to enforce any patents that you allege
# are infringed by any Work, then your rights under this License from
# such Licensor (including the grant in Section 2.1) will terminate
# immediately.
# 3.5 Trademarks. This License does not grant any rights to use any
# Licensors or its affiliates names, logos, or trademarks, except
# as necessary to reproduce the notices described in this License.
# 3.6 Termination. If you violate any term of this License, then your
# rights under this License (including the grant in Section 2.1) will
# terminate immediately.
# 4. Disclaimer of Warranty.
# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
# THIS LICENSE.
# 5. Limitation of Liability.
# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGES.
# Copyright (c) OpenMMLab. All rights reserved.
# Adapt from https://github.com/open-mmlab/mmsegmentation/blob/2d66179630035097dcae08ee958f60d4b5a7fcae/mmseg/models/decode_heads/segformer_head.py
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from easycv.models.builder import HEADS
from easycv.models.segmentation.heads.base import BaseDecodeHead
from easycv.models.utils.ops import resize_tensor as resize
@HEADS.register_module()
class SegformerHead(BaseDecodeHead):
"""The all mlp Head of segformer.
This head is the implementation of
`Segformer <https://arxiv.org/abs/2105.15203>` _.
Args:
interpolate_mode: The interpolate mode of MLP head upsample operation.
Default: 'bilinear'.
"""
def __init__(self, interpolate_mode='bilinear', **kwargs):
super().__init__(input_transform='multiple_select', **kwargs)
self.interpolate_mode = interpolate_mode
num_inputs = len(self.in_channels)
assert num_inputs == len(self.in_index)
self.convs = nn.ModuleList()
for i in range(num_inputs):
self.convs.append(
ConvModule(
in_channels=self.in_channels[i],
out_channels=self.channels,
kernel_size=1,
stride=1,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg))
self.fusion_conv = ConvModule(
in_channels=self.channels * num_inputs,
out_channels=self.channels,
kernel_size=1,
norm_cfg=self.norm_cfg)
def forward(self, inputs):
# Receive 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32
inputs = self._transform_inputs(inputs)
outs = []
for idx in range(len(inputs)):
x = inputs[idx]
conv = self.convs[idx]
outs.append(
resize(
input=conv(x),
size=inputs[0].shape[2:],
mode=self.interpolate_mode,
align_corners=self.align_corners))
out = self.fusion_conv(torch.cat(outs, dim=1))
out = self.cls_seg(out)
return out
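The forward above boils down to: 1x1-project each backbone stage to `channels`, upsample everything to the 1/4-scale map, concatenate, fuse with another 1x1 conv, then predict `num_classes` logits. A standalone illustration of that data flow with plain torch ops and the B0 shapes from the config (a sketch only, not the head itself):

```python
# Standalone illustration of the SegformerHead data flow (B0 shapes), using
# plain torch ops instead of the ConvModule / cls_seg layers of the real head.
import torch
import torch.nn as nn
import torch.nn.functional as F

channels, num_classes = 256, 172
in_channels = [32, 64, 160, 256]                      # B0 backbone stage widths
feats = [torch.randn(1, c, 128 // 2**i, 128 // 2**i)  # 1/4, 1/8, 1/16, 1/32 maps
         for i, c in enumerate(in_channels)]

projs = [nn.Conv2d(c, channels, 1) for c in in_channels]
outs = [F.interpolate(p(f), size=feats[0].shape[2:], mode='bilinear',
                      align_corners=False) for p, f in zip(projs, feats)]
fused = nn.Conv2d(channels * len(outs), channels, 1)(torch.cat(outs, dim=1))
logits = nn.Conv2d(channels, num_classes, 1)(fused)   # what cls_seg produces
print(logits.shape)                                   # torch.Size([1, 172, 128, 128])
```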

easycv/models/segmentation/utils/__init__.py
@@ -0,0 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .embed import PatchEmbed
from .shape_convert import (nchw2nlc2nchw, nchw_to_nlc, nlc2nchw2nlc,
nlc_to_nchw)

easycv/models/segmentation/utils/embed.py
@@ -0,0 +1,332 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Adapt from: https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/utils/embed.py
import math
from typing import Sequence
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_conv_layer, build_norm_layer
from mmcv.runner.base_module import BaseModule
from mmcv.utils import to_2tuple
class AdaptivePadding(nn.Module):
"""Applies padding to input (if needed) so that input can get fully covered
by the filter you specified. It supports two modes, "same" and "corner". The
"same" mode is the same as the "SAME" padding mode in TensorFlow and pads zeros around
the input. The "corner" mode pads zeros to the bottom right.
Args:
kernel_size (int | tuple): Size of the kernel:
stride (int | tuple): Stride of the filter. Default: 1:
dilation (int | tuple): Spacing between kernel elements.
Default: 1.
padding (str): Support "same" and "corner", "corner" mode
would pad zero to bottom right, and "same" mode would
pad zero around input. Default: "corner".
Example:
>>> kernel_size = 16
>>> stride = 16
>>> dilation = 1
>>> input = torch.rand(1, 1, 15, 17)
>>> adap_pad = AdaptivePadding(
>>> kernel_size=kernel_size,
>>> stride=stride,
>>> dilation=dilation,
>>> padding="corner")
>>> out = adap_pad(input)
>>> assert (out.shape[2], out.shape[3]) == (16, 32)
>>> input = torch.rand(1, 1, 16, 17)
>>> out = adap_pad(input)
>>> assert (out.shape[2], out.shape[3]) == (16, 32)
"""
def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
super(AdaptivePadding, self).__init__()
assert padding in ('same', 'corner')
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
self.padding = padding
self.kernel_size = kernel_size
self.stride = stride
self.dilation = dilation
def get_pad_shape(self, input_shape):
input_h, input_w = input_shape
kernel_h, kernel_w = self.kernel_size
stride_h, stride_w = self.stride
output_h = math.ceil(input_h / stride_h)
output_w = math.ceil(input_w / stride_w)
pad_h = max((output_h - 1) * stride_h +
(kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
pad_w = max((output_w - 1) * stride_w +
(kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
return pad_h, pad_w
def forward(self, x):
pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
if pad_h > 0 or pad_w > 0:
if self.padding == 'corner':
x = F.pad(x, [0, pad_w, 0, pad_h])
elif self.padding == 'same':
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
pad_h - pad_h // 2
])
return x
class PatchEmbed(BaseModule):
"""Image to Patch Embedding.
We use a conv layer to implement PatchEmbed.
Args:
in_channels (int): The num of input channels. Default: 3
embed_dims (int): The dimensions of embedding. Default: 768
conv_type (str): The config dict for embedding
conv layer type selection. Default: "Conv2d".
kernel_size (int): The kernel_size of embedding conv. Default: 16.
stride (int, optional): The slide stride of embedding conv.
Default: None (Would be set as `kernel_size`).
padding (int | tuple | string ): The padding length of
embedding conv. When it is a string, it means the mode
of adaptive padding, support "same" and "corner" now.
Default: "corner".
dilation (int): The dilation rate of embedding conv. Default: 1.
bias (bool): Bias of embed conv. Default: True.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: None.
input_size (int | tuple | None): The size of input, which will be
used to calculate the out size. Only work when `dynamic_size`
is False. Default: None.
init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
Default: None.
"""
def __init__(self,
in_channels=3,
embed_dims=768,
conv_type='Conv2d',
kernel_size=16,
stride=None,
padding='corner',
dilation=1,
bias=True,
norm_cfg=None,
input_size=None,
init_cfg=None):
super(PatchEmbed, self).__init__(init_cfg=init_cfg)
self.embed_dims = embed_dims
if stride is None:
stride = kernel_size
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
if isinstance(padding, str):
self.adap_padding = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
# disable the padding of conv
padding = 0
else:
self.adap_padding = None
padding = to_2tuple(padding)
self.projection = build_conv_layer(
dict(type=conv_type),
in_channels=in_channels,
out_channels=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
if norm_cfg is not None:
self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
else:
self.norm = None
if input_size:
input_size = to_2tuple(input_size)
# `init_out_size` would be used outside to
# calculate the num_patches
# when `use_abs_pos_embed` outside
self.init_input_size = input_size
if self.adap_padding:
pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
input_h, input_w = input_size
input_h = input_h + pad_h
input_w = input_w + pad_w
input_size = (input_h, input_w)
# https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
(kernel_size[0] - 1) - 1) // stride[0] + 1
w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
(kernel_size[1] - 1) - 1) // stride[1] + 1
self.init_out_size = (h_out, w_out)
else:
self.init_input_size = None
self.init_out_size = None
def forward(self, x):
"""
Args:
x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
Returns:
tuple: Contains merged results and its spatial shape.
- x (Tensor): Has shape (B, out_h * out_w, embed_dims)
- out_size (tuple[int]): Spatial shape of x, arrange as
(out_h, out_w).
"""
if self.adap_padding:
x = self.adap_padding(x)
x = self.projection(x)
out_size = (x.shape[2], x.shape[3])
x = x.flatten(2).transpose(1, 2)
if self.norm is not None:
x = self.norm(x)
return x, out_size
class PatchMerging(BaseModule):
"""Merge patch feature map.
This layer groups feature map by kernel_size, and applies norm and linear
layers to the grouped feature map. Our implementation uses `nn.Unfold` to
merge patches, which is about 25% faster than the original implementation.
However, pretrained models need to be modified for compatibility.
Args:
in_channels (int): The num of input channels.
out_channels (int): The num of output channels.
kernel_size (int | tuple, optional): the kernel size in the unfold
layer. Defaults to 2.
stride (int | tuple, optional): the stride of the sliding blocks in the
unfold layer. Default: None. (Would be set as `kernel_size`)
padding (int | tuple | string ): The padding length of
embedding conv. When it is a string, it means the mode
of adaptive padding, support "same" and "corner" now.
Default: "corner".
dilation (int | tuple, optional): dilation parameter in the unfold
layer. Default: 1.
bias (bool, optional): Whether to add bias in linear layer or not.
Defaults: False.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: dict(type='LN').
init_cfg (dict, optional): The extra config for initialization.
Default: None.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=2,
stride=None,
padding='corner',
dilation=1,
bias=False,
norm_cfg=dict(type='LN'),
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.in_channels = in_channels
self.out_channels = out_channels
if stride:
stride = stride
else:
stride = kernel_size
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
if isinstance(padding, str):
self.adap_padding = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
# disable the padding of unfold
padding = 0
else:
self.adap_padding = None
padding = to_2tuple(padding)
self.sampler = nn.Unfold(
kernel_size=kernel_size,
dilation=dilation,
padding=padding,
stride=stride)
sample_dim = kernel_size[0] * kernel_size[1] * in_channels
if norm_cfg is not None:
self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
else:
self.norm = None
self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
def forward(self, x, input_size):
"""
Args:
x (Tensor): Has shape (B, H*W, C_in).
input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
Default: None.
Returns:
tuple: Contains merged results and its spatial shape.
- x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
- out_size (tuple[int]): Spatial shape of x, arrange as
(Merged_H, Merged_W).
"""
B, L, C = x.shape
assert isinstance(input_size, Sequence), f'Expect ' \
f'input_size is ' \
f'`Sequence` ' \
f'but get {input_size}'
H, W = input_size
assert L == H * W, 'input feature has wrong size'
x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W
# Use nn.Unfold to merge patch. About 25% faster than original method,
# but need to modify pretrained model for compatibility
if self.adap_padding:
x = self.adap_padding(x)
H, W = x.shape[-2:]
x = self.sampler(x)
# if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
(self.sampler.kernel_size[0] - 1) -
1) // self.sampler.stride[0] + 1
out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
(self.sampler.kernel_size[1] - 1) -
1) // self.sampler.stride[1] + 1
output_size = (out_h, out_w)
x = x.transpose(1, 2) # B, H/2*W/2, 4*C
x = self.norm(x) if self.norm else x
x = self.reduction(x)
return x, output_size
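For intuition on `PatchEmbed`: the first MiT stage uses an overlapping patch embedding (kernel 7, stride 4, padding 7//2), so a 512x512 image becomes a 128x128 grid of tokens. A small sketch, assuming the import path exported by the new `easycv/models/segmentation/utils` package:

```python
# Sketch: the overlapping patch embedding used by the first MiT stage.
import torch
from easycv.models.segmentation.utils import PatchEmbed  # export added in this PR

patch_embed = PatchEmbed(
    in_channels=3, embed_dims=32, kernel_size=7, stride=4, padding=7 // 2,
    norm_cfg=dict(type='LN'))
tokens, hw_shape = patch_embed(torch.randn(1, 3, 512, 512))
print(tokens.shape, hw_shape)  # torch.Size([1, 16384, 32]) (128, 128)
```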

easycv/models/segmentation/utils/shape_convert.py
@@ -0,0 +1,110 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Adapt from: https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/utils/shape_convert.py
def nlc_to_nchw(x, hw_shape):
"""Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
Args:
x (Tensor): The input tensor of shape [N, L, C] before conversion.
hw_shape (Sequence[int]): The height and width of output feature map.
Returns:
Tensor: The output tensor of shape [N, C, H, W] after conversion.
"""
H, W = hw_shape
assert len(x.shape) == 3
B, L, C = x.shape
assert L == H * W, 'The seq_len doesn\'t match H, W'
return x.transpose(1, 2).reshape(B, C, H, W)
def nchw_to_nlc(x):
"""Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
Args:
x (Tensor): The input tensor of shape [N, C, H, W] before conversion.
Returns:
Tensor: The output tensor of shape [N, L, C] after conversion.
"""
assert len(x.shape) == 4
return x.flatten(2).transpose(1, 2).contiguous()
def nchw2nlc2nchw(module, x, contiguous=False, **kwargs):
"""Flatten [N, C, H, W] shape tensor `x` to [N, L, C] shape tensor. Use the
reshaped tensor as the input of `module`, and then convert the output of
`module`, whose shape is [N, L, C], to [N, C, H, W].
Args:
module (Callable): A callable object the takes a tensor
with shape [N, L, C] as input.
x (Tensor): The input tensor of shape [N, C, H, W].
contiguous (Bool): Whether to make the tensor contiguous
after each shape transform.
Returns:
Tensor: The output tensor of shape [N, C, H, W].
Example:
>>> import torch
>>> import torch.nn as nn
>>> norm = nn.LayerNorm(4)
>>> feature_map = torch.rand(4, 4, 5, 5)
>>> output = nchw2nlc2nchw(norm, feature_map)
"""
B, C, H, W = x.shape
if not contiguous:
x = x.flatten(2).transpose(1, 2)
x = module(x, **kwargs)
x = x.transpose(1, 2).reshape(B, C, H, W)
else:
x = x.flatten(2).transpose(1, 2).contiguous()
x = module(x, **kwargs)
x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
return x
def nlc2nchw2nlc(module, x, hw_shape, contiguous=False, **kwargs):
"""Convert [N, L, C] shape tensor `x` to [N, C, H, W] shape tensor. Use the
reshaped tensor as the input of `module`, and convert the output of
    `module`, whose shape is [N, C, H, W], back to [N, L, C].
Args:
        module (Callable): A callable object that takes a tensor
with shape [N, C, H, W] as input.
x (Tensor): The input tensor of shape [N, L, C].
        hw_shape (Sequence[int]): The height and width of the
            feature map with shape [N, C, H, W].
        contiguous (bool): Whether to make the tensor contiguous
after each shape transform.
Returns:
Tensor: The output tensor of shape [N, L, C].
Example:
>>> import torch
>>> import torch.nn as nn
>>> conv = nn.Conv2d(16, 16, 3, 1, 1)
>>> feature_map = torch.rand(4, 25, 16)
>>> output = nlc2nchw2nlc(conv, feature_map, (5, 5))
"""
H, W = hw_shape
assert len(x.shape) == 3
B, L, C = x.shape
assert L == H * W, 'The seq_len doesn\'t match H, W'
if not contiguous:
x = x.transpose(1, 2).reshape(B, C, H, W)
x = module(x, **kwargs)
x = x.flatten(2).transpose(1, 2)
else:
x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
x = module(x, **kwargs)
x = x.flatten(2).transpose(1, 2).contiguous()
return x
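
A quick sanity sketch (not part of the diff) with the two helpers above in scope: nchw_to_nlc and nlc_to_nchw are inverses of each other as long as the original (H, W) is passed back in.

import torch

x = torch.rand(2, 8, 5, 7)                       # N, C, H, W
seq = nchw_to_nlc(x)                             # shape (2, 35, 8)
assert torch.equal(nlc_to_nchw(seq, (5, 7)), x)  # round trip recovers x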


@ -7,3 +7,4 @@ from .feature_extractor import (TorchFaceAttrExtractor,
TorchFeatureExtractor)
from .pose_predictor import (TorchPoseTopDownPredictor,
TorchPoseTopDownPredictorWithDetector)
from .segmentation import Mask2formerPredictor, SegFormerPredictor


@ -1,5 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import cv2
import mmcv
import numpy as np
import torch
from matplotlib.collections import PatchCollection
@ -13,6 +14,7 @@ from easycv.models import build_model
from easycv.predictors.builder import PREDICTORS
from easycv.predictors.interface import PredictorInterface
from easycv.utils.checkpoint import load_checkpoint
from easycv.utils.config_tools import mmcv_config_fromfile
from easycv.utils.registry import build_from_cfg
@ -108,6 +110,147 @@ class Mask2formerPredictor(PredictorInterface):
return instance_result
@PREDICTORS.register_module()
class SegFormerPredictor(PredictorInterface):
def __init__(self, model_path, model_config):
"""init model
Args:
            model_path (str): Path of the model checkpoint.
            model_config (str): Path of the config file used to build the model.
"""
self.model_path = model_path
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.model = None
with io.open(self.model_path, 'rb') as infile:
checkpoint = torch.load(infile, map_location='cpu')
self.cfg = mmcv_config_fromfile(model_config)
self.CLASSES = self.cfg.CLASSES
self.PALETTE = self.cfg.PALETTE
# build model
self.model = build_model(self.cfg.model)
self.ckpt = load_checkpoint(
self.model, self.model_path, map_location=self.device)
self.model.to(self.device)
self.model.eval()
# build pipeline
test_pipeline = self.cfg.test_pipeline
pipeline = [build_from_cfg(p, PIPELINES) for p in test_pipeline]
self.pipeline = Compose(pipeline)
def predict(self, input_data_list):
"""
        Predict semantic segmentation results for a list of input samples.
        Args:
            input_data_list: a list of numpy arrays (in RGB order); each array
                is a sample to be predicted.
"""
output_list = []
for idx, img in enumerate(input_data_list):
            if not isinstance(img, np.ndarray):
img = np.asarray(img)
ori_img_shape = img.shape[:2]
data_dict = {'img': img}
data_dict['ori_shape'] = ori_img_shape
data_dict = self.pipeline(data_dict)
img = data_dict['img']
img = torch.unsqueeze(img[0], 0).to(self.device)
data_dict.pop('img')
with torch.no_grad():
out = self.model([img],
mode='test',
img_metas=[[data_dict['img_metas'][0]._data]])
output_list.append(out)
return output_list
def show_result(self,
img,
result,
palette=None,
win_name='',
show=False,
wait_time=0,
out_file=None,
opacity=0.5):
"""Draw `result` over `img`.
Args:
img (str or Tensor): The image to be displayed.
result (Tensor): The semantic segmentation results to draw over
`img`.
palette (list[list[int]]] | np.ndarray | None): The palette of
segmentation map. If None is given, random palette will be
generated. Default: None
win_name (str): The window name.
wait_time (int): Value of waitKey param.
Default: 0.
show (bool): Whether to show the image.
Default: False.
out_file (str or None): The filename to write the image.
Default: None.
opacity(float): Opacity of painted segmentation map.
Default 0.5.
Must be in (0, 1] range.
Returns:
img (Tensor): Only if not `show` or `out_file`
"""
img = mmcv.imread(img)
img = img.copy()
seg = result[0]
if palette is None:
if self.PALETTE is None:
# Get random state before set seed,
# and restore random state later.
# It will prevent loss of randomness, as the palette
# may be different in each iteration if not specified.
# See: https://github.com/open-mmlab/mmdetection/issues/5844
state = np.random.get_state()
np.random.seed(42)
# random palette
palette = np.random.randint(
0, 255, size=(len(self.CLASSES), 3))
np.random.set_state(state)
else:
palette = self.PALETTE
palette = np.array(palette)
assert palette.shape[0] == len(self.CLASSES)
assert palette.shape[1] == 3
assert len(palette.shape) == 2
assert 0 < opacity <= 1.0
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
for label, color in enumerate(palette):
color_seg[seg == label, :] = color
# convert to BGR
color_seg = color_seg[..., ::-1]
img = img * (1 - opacity) + color_seg * opacity
img = img.astype(np.uint8)
# if out_file specified, do not show image in window
if out_file is not None:
show = False
if show:
mmcv.imshow(img, win_name, wait_time)
if out_file is not None:
mmcv.imwrite(img, out_file)
if not (show or out_file):
return img
def _get_bias_color(base, max_dist=30):
"""Get different colors for each masks.

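A minimal usage sketch for the SegFormerPredictor added above; the checkpoint and image paths are hypothetical placeholders, and the config path is the one used by the unit test below. The 'seg_pred' key and the show_result call follow the behaviour exercised in that test.

import numpy as np
from PIL import Image
from easycv.predictors.segmentation import SegFormerPredictor

predictor = SegFormerPredictor(
    model_path='path/to/SegmentationEvaluator_mIoU_best.pth',  # placeholder
    model_config='configs/segmentation/segformer/segformer_b0_coco.py')

img_path = 'path/to/demo.jpg'                      # placeholder RGB image
img = np.asarray(Image.open(img_path))
result = predictor.predict([img])[0]               # dict containing 'seg_pred'
predictor.show_result(img_path, result['seg_pred'],
                      out_file='result.jpg', opacity=0.5)
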
result.jpg 100644

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fca74c76fea1589f9479b3261fd281bdb5673041f4ab7a70dbe14663873a19d3
size 189945


@ -0,0 +1,57 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
"""
isort:skip_file
"""
import os
import numpy as np
from PIL import Image
from easycv.predictors.segmentation import SegFormerPredictor
from tests.ut_config import (DET_DATA_SMALL_COCO_LOCAL,
                             MODEL_CONFIG_SEGFORMER,
                             PRETRAINED_MODEL_SEGFORMER)
class SegmentorTest(unittest.TestCase):
def setUp(self):
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
def test_segformer_detector(self):
segmentation_model_path = PRETRAINED_MODEL_SEGFORMER
segmentation_model_config = MODEL_CONFIG_SEGFORMER
img = os.path.join(DET_DATA_SMALL_COCO_LOCAL,
'val2017/000000289059.jpg')
        if not os.path.exists(img):
img = './data/test/segmentation/coco_stuff_164k/val2017/000000289059.jpg'
input_data_list = [np.asarray(Image.open(img))]
predictor = SegFormerPredictor(
model_path=segmentation_model_path,
model_config=segmentation_model_config)
output = predictor.predict(input_data_list)[0]
self.assertIn('seg_pred', output)
self.assertListEqual(
list(input_data_list[0].shape)[:2],
list(output['seg_pred'][0].shape))
self.assertListEqual(output['seg_pred'][0][1, :10].tolist(),
[161 for i in range(10)])
self.assertListEqual(output['seg_pred'][0][-1, -10:].tolist(),
[133 for i in range(10)])
if __name__ == '__main__':
unittest.main()


@ -111,3 +111,10 @@ PRETRAINED_MODEL_MAE = os.path.join(
PRETRAINED_MODEL_MASK2FORMER = os.path.join(
BASE_LOCAL_PATH,
'pretrained_models/segmentation/mask2former/mask2former_r50_instance.pth')
PRETRAINED_MODEL_SEGFORMER = os.path.join(
BASE_LOCAL_PATH,
'pretrained_models/segformer/segformer_b0/SegmentationEvaluator_mIoU_best.pth'
)
MODEL_CONFIG_SEGFORMER = (
'./configs/segmentation/segformer/segformer_b0_coco.py')


@ -0,0 +1,266 @@
# Copyright (c) OpenMMLab. All rights reserved.
# Adapted from: https://github.com/open-mmlab/mmsegmentation/blob/master/tools/convert_datasets/coco_stuff164k.py
import argparse
import os.path as osp
import shutil
from functools import partial
from glob import glob
import mmcv
import numpy as np
from PIL import Image
COCO_LEN = 123287
clsID_to_trID = {
0: 0,
1: 1,
2: 2,
3: 3,
4: 4,
5: 5,
6: 6,
7: 7,
8: 8,
9: 9,
10: 10,
12: 11,
13: 12,
14: 13,
15: 14,
16: 15,
17: 16,
18: 17,
19: 18,
20: 19,
21: 20,
22: 21,
23: 22,
24: 23,
26: 24,
27: 25,
30: 26,
31: 27,
32: 28,
33: 29,
34: 30,
35: 31,
36: 32,
37: 33,
38: 34,
39: 35,
40: 36,
41: 37,
42: 38,
43: 39,
45: 40,
46: 41,
47: 42,
48: 43,
49: 44,
50: 45,
51: 46,
52: 47,
53: 48,
54: 49,
55: 50,
56: 51,
57: 52,
58: 53,
59: 54,
60: 55,
61: 56,
62: 57,
63: 58,
64: 59,
66: 60,
69: 61,
71: 62,
72: 63,
73: 64,
74: 65,
75: 66,
76: 67,
77: 68,
78: 69,
79: 70,
80: 71,
81: 72,
83: 73,
84: 74,
85: 75,
86: 76,
87: 77,
88: 78,
89: 79,
91: 80,
92: 81,
93: 82,
94: 83,
95: 84,
96: 85,
97: 86,
98: 87,
99: 88,
100: 89,
101: 90,
102: 91,
103: 92,
104: 93,
105: 94,
106: 95,
107: 96,
108: 97,
109: 98,
110: 99,
111: 100,
112: 101,
113: 102,
114: 103,
115: 104,
116: 105,
117: 106,
118: 107,
119: 108,
120: 109,
121: 110,
122: 111,
123: 112,
124: 113,
125: 114,
126: 115,
127: 116,
128: 117,
129: 118,
130: 119,
131: 120,
132: 121,
133: 122,
134: 123,
135: 124,
136: 125,
137: 126,
138: 127,
139: 128,
140: 129,
141: 130,
142: 131,
143: 132,
144: 133,
145: 134,
146: 135,
147: 136,
148: 137,
149: 138,
150: 139,
151: 140,
152: 141,
153: 142,
154: 143,
155: 144,
156: 145,
157: 146,
158: 147,
159: 148,
160: 149,
161: 150,
162: 151,
163: 152,
164: 153,
165: 154,
166: 155,
167: 156,
168: 157,
169: 158,
170: 159,
171: 160,
172: 161,
173: 162,
174: 163,
175: 164,
176: 165,
177: 166,
178: 167,
179: 168,
180: 169,
181: 170,
255: 255
}
def convert_to_trainID(maskpath, out_mask_dir, is_train):
mask = np.array(Image.open(maskpath))
mask_copy = mask.copy()
for clsID, trID in clsID_to_trID.items():
mask_copy[mask == clsID] = trID
seg_filename = osp.join(
out_mask_dir, 'train2017',
osp.basename(maskpath).split('.')[0] +
'_labelTrainIds.png') if is_train else osp.join(
out_mask_dir, 'val2017',
osp.basename(maskpath).split('.')[0] + '_labelTrainIds.png')
Image.fromarray(mask_copy).save(seg_filename, 'PNG')
def parse_args():
parser = argparse.ArgumentParser(
description=\
'Convert COCO Stuff 164k annotations to mmsegmentation format') # noqa
parser.add_argument('coco_path', help='coco stuff path')
parser.add_argument('-o', '--out_dir', help='output path')
parser.add_argument(
'--nproc', default=16, type=int, help='number of process')
args = parser.parse_args()
return args
def main():
args = parse_args()
coco_path = args.coco_path
nproc = args.nproc
out_dir = args.out_dir or coco_path
out_img_dir = osp.join(out_dir, 'images')
out_mask_dir = osp.join(out_dir, 'annotations')
mmcv.mkdir_or_exist(osp.join(out_mask_dir, 'train2017'))
mmcv.mkdir_or_exist(osp.join(out_mask_dir, 'val2017'))
if out_dir != coco_path:
shutil.copytree(osp.join(coco_path, 'images'), out_img_dir)
train_list = glob(osp.join(coco_path, 'annotations', 'train2017', '*.png'))
train_list = [file for file in train_list if '_labelTrainIds' not in file]
test_list = glob(osp.join(coco_path, 'annotations', 'val2017', '*.png'))
test_list = [file for file in test_list if '_labelTrainIds' not in file]
assert (len(train_list) +
len(test_list)) == COCO_LEN, 'Wrong length of list {} & {}'.format(
len(train_list), len(test_list))
if args.nproc > 1:
mmcv.track_parallel_progress(
partial(
convert_to_trainID, out_mask_dir=out_mask_dir, is_train=True),
train_list,
nproc=nproc)
mmcv.track_parallel_progress(
partial(
convert_to_trainID, out_mask_dir=out_mask_dir, is_train=False),
test_list,
nproc=nproc)
else:
mmcv.track_progress(
partial(
convert_to_trainID, out_mask_dir=out_mask_dir, is_train=True),
train_list)
mmcv.track_progress(
partial(
convert_to_trainID, out_mask_dir=out_mask_dir, is_train=False),
test_list)
print('Done!')
if __name__ == '__main__':
main()
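
The clsID_to_trID table above is simply an enumeration of the 171 valid COCO-Stuff ids (eleven unused "thing" ids are skipped), with 255 kept as the ignore label. A sketch that rebuilds it and can serve as a self-check, assuming the script's clsID_to_trID is in scope:

unused_ids = {11, 25, 28, 29, 44, 65, 67, 68, 70, 82, 90}
valid_ids = [i for i in range(182) if i not in unused_ids]
rebuilt = {cls_id: train_id for train_id, cls_id in enumerate(valid_ids)}
rebuilt[255] = 255              # ignore label is left untouched
assert rebuilt == clsID_to_trID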