diff --git a/configs/_base_/det_dataset/toy_dataset.py b/configs/_base_/det_dataset/toy_dataset.py deleted file mode 100644 index 9789c511..00000000 --- a/configs/_base_/det_dataset/toy_dataset.py +++ /dev/null @@ -1,97 +0,0 @@ -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) - -train_cfg = None -test_cfg = None - -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='ScaleAspectJitter', - img_scale=[(3000, 640)], - ratio_range=(0.7, 1.3), - aspect_ratio_range=(0.9, 1.1), - multiscale_mode='value', - keep_ratio=False), - # shrink_ratio is from big to small. The 1st must be 1.0 - dict(type='PANetTargets', shrink_ratio=(1.0, 0.7)), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='RandomRotateTextDet'), - dict( - type='RandomCropInstances', - target_size=(640, 640), - instance_key='gt_kernels'), - dict(type='Pad', size_divisor=32), - dict( - type='CustomFormatBundle', - keys=['gt_kernels', 'gt_mask'], - visualize=dict(flag=False, boundary_key='gt_kernels')), - dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(3000, 640), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(3000, 640), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] - -dataset_type = 'TextDetDataset' -img_prefix = 'tests/data/toy_dataset/imgs' -train_anno_file = 'tests/data/toy_dataset/instances_test.txt' -train1 = dict( - type=dataset_type, - img_prefix=img_prefix, - ann_file=train_anno_file, - loader=dict( - type='HardDiskLoader', - repeat=4, - parser=dict( - type='LineJsonParser', - keys=['file_name', 'height', 'width', 'annotations'])), - pipeline=train_pipeline, - test_mode=False) - -data_root = 'tests/data/toy_dataset' -train2 = dict( - type='IcdarDataset', - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=train_pipeline) - -test_anno_file = 'tests/data/toy_dataset/instances_test.txt' -test = dict( - type=dataset_type, - img_prefix=img_prefix, - ann_file=test_anno_file, - loader=dict( - type='HardDiskLoader', - repeat=1, - parser=dict( - type='LineJsonParser', - keys=['file_name', 'height', 'width', 'annotations'])), - pipeline=test_pipeline, - test_mode=True) - -data = dict( - samples_per_gpu=2, - workers_per_gpu=2, - train=dict(type='ConcatDataset', datasets=[train1, train2]), - val=dict(type='ConcatDataset', datasets=[test]), - test=dict(type='ConcatDataset', datasets=[test])) - -evaluation = dict(interval=1, metric='hmean-iou') diff --git a/configs/_base_/det_datasets/ctw1500.py b/configs/_base_/det_datasets/ctw1500.py new file mode 100644 index 00000000..466ea7e1 --- /dev/null +++ b/configs/_base_/det_datasets/ctw1500.py @@ -0,0 +1,18 @@ +dataset_type = 'IcdarDataset' +data_root = 'data/ctw1500' + +train = dict( + type=dataset_type, + ann_file=f'{data_root}/instances_training.json', + img_prefix=f'{data_root}/imgs', + pipeline=None) + +test = dict( + type=dataset_type, + ann_file=f'{data_root}/instances_test.json', + img_prefix=f'{data_root}/imgs', + pipeline=None) + +train_list = [train] + +test_list = [test] diff --git a/configs/_base_/det_datasets/icdar2015.py b/configs/_base_/det_datasets/icdar2015.py new file mode 100644 index 00000000..f711c06d --- /dev/null +++ b/configs/_base_/det_datasets/icdar2015.py @@ -0,0 +1,18 @@ +dataset_type = 'IcdarDataset' +data_root = 'data/icdar2015' + +train = dict( + type=dataset_type, + ann_file=f'{data_root}/instances_training.json', + img_prefix=f'{data_root}/imgs', + pipeline=None) + +test = dict( + type=dataset_type, + ann_file=f'{data_root}/instances_test.json', + img_prefix=f'{data_root}/imgs', + pipeline=None) + +train_list = [train] + +test_list = [test] diff --git a/configs/_base_/det_datasets/icdar2017.py b/configs/_base_/det_datasets/icdar2017.py new file mode 100644 index 00000000..446ea7ef --- /dev/null +++ b/configs/_base_/det_datasets/icdar2017.py @@ -0,0 +1,18 @@ +dataset_type = 'IcdarDataset' +data_root = 'data/icdar2017' + +train = dict( + type=dataset_type, + ann_file=f'{data_root}/instances_training.json', + img_prefix=f'{data_root}/imgs', + pipeline=None) + +test = dict( + type=dataset_type, + ann_file=f'{data_root}/instances_val.json', + img_prefix=f'{data_root}/imgs', + pipeline=None) + +train_list = [train] + +test_list = [test] diff --git a/configs/_base_/det_datasets/toy_data.py b/configs/_base_/det_datasets/toy_data.py new file mode 100644 index 00000000..11c55591 --- /dev/null +++ b/configs/_base_/det_datasets/toy_data.py @@ -0,0 +1,39 @@ +root = 'tests/data/toy_dataset' + +# dataset with type='TextDetDataset' +train1 = dict( + type='TextDetDataset', + img_prefix=f'{root}/imgs', + ann_file=f'{root}/instances_test.txt', + loader=dict( + type='HardDiskLoader', + repeat=4, + parser=dict( + type='LineJsonParser', + keys=['file_name', 'height', 'width', 'annotations'])), + pipeline=None, + test_mode=False) + +# dataset with type='IcdarDataset' +train2 = dict( + type='IcdarDataset', + ann_file=f'{root}/instances_test.json', + img_prefix=f'{root}/imgs', + pipeline=None) + +test = dict( + type='TextDetDataset', + img_prefix=f'{root}/imgs', + ann_file=f'{root}/instances_test.txt', + loader=dict( + type='HardDiskLoader', + repeat=1, + parser=dict( + type='LineJsonParser', + keys=['file_name', 'height', 'width', 'annotations'])), + pipeline=None, + test_mode=True) + +train_list = [train1, train2] + +test_list = [test] diff --git a/configs/_base_/det_models/dbnet_r18_fpnc.py b/configs/_base_/det_models/dbnet_r18_fpnc.py new file mode 100644 index 00000000..d06db5c2 --- /dev/null +++ b/configs/_base_/det_models/dbnet_r18_fpnc.py @@ -0,0 +1,21 @@ +model = dict( + type='DBNet', + backbone=dict( + type='mmdet.ResNet', + depth=18, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=True), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'), + norm_eval=False, + style='caffe'), + neck=dict( + type='FPNC', in_channels=[64, 128, 256, 512], lateral_channels=256), + bbox_head=dict( + type='DBHead', + text_repr_type='quad', + in_channels=256, + loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True)), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py b/configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py new file mode 100644 index 00000000..452ab320 --- /dev/null +++ b/configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py @@ -0,0 +1,23 @@ +model = dict( + type='DBNet', + backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + style='pytorch', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + stage_with_dcn=(False, True, True, True)), + neck=dict( + type='FPNC', in_channels=[256, 512, 1024, 2048], lateral_channels=256), + bbox_head=dict( + type='DBHead', + text_repr_type='quad', + in_channels=256, + loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True)), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/det_models/drrg_r50_fpn_unet.py b/configs/_base_/det_models/drrg_r50_fpn_unet.py new file mode 100644 index 00000000..1c10c61a --- /dev/null +++ b/configs/_base_/det_models/drrg_r50_fpn_unet.py @@ -0,0 +1,21 @@ +model = dict( + type='DRRG', + backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=True), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32), + bbox_head=dict( + type='DRRGHead', + in_channels=32, + text_region_thr=0.3, + center_region_thr=0.4, + link_thr=0.80, + loss=dict(type='DRRGLoss'))) diff --git a/configs/_base_/det_models/fcenet_r50_fpn.py b/configs/_base_/det_models/fcenet_r50_fpn.py new file mode 100644 index 00000000..90e92999 --- /dev/null +++ b/configs/_base_/det_models/fcenet_r50_fpn.py @@ -0,0 +1,30 @@ +model = dict( + type='FCENet', + backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=True), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + norm_eval=False, + style='pytorch'), + neck=dict( + type='mmdet.FPN', + in_channels=[512, 1024, 2048], + out_channels=256, + add_extra_convs='on_output', + num_outs=3, + relu_before_extra_convs=True, + act_cfg=None), + bbox_head=dict( + type='FCEHead', + in_channels=256, + scales=(8, 16, 32), + loss=dict(type='FCELoss'), + alpha=1.2, + beta=1.0, + text_repr_type='quad', + fourier_degree=5, + )) diff --git a/configs/_base_/det_models/fcenet_r50dcnv2_fpn.py b/configs/_base_/det_models/fcenet_r50dcnv2_fpn.py new file mode 100644 index 00000000..27bb95e5 --- /dev/null +++ b/configs/_base_/det_models/fcenet_r50dcnv2_fpn.py @@ -0,0 +1,29 @@ +model = dict( + type='FCENet', + backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCNv2', deform_groups=2, fallback_on_stride=False), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + stage_with_dcn=(False, True, True, True)), + neck=dict( + type='mmdet.FPN', + in_channels=[512, 1024, 2048], + out_channels=256, + add_extra_convs='on_output', + num_outs=3, + relu_before_extra_convs=True, + act_cfg=None), + bbox_head=dict( + type='FCEHead', + in_channels=256, + scales=(8, 16, 32), + loss=dict(type='FCELoss'), + fourier_degree=5, + )) diff --git a/configs/_base_/models/ocr_mask_rcnn_r50_fpn_ohem.py b/configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py similarity index 100% rename from configs/_base_/models/ocr_mask_rcnn_r50_fpn_ohem.py rename to configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py diff --git a/configs/_base_/models/ocr_mask_rcnn_r50_fpn_ohem_poly.py b/configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py similarity index 100% rename from configs/_base_/models/ocr_mask_rcnn_r50_fpn_ohem_poly.py rename to configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py diff --git a/configs/_base_/det_models/panet_r18_fpem_ffm.py b/configs/_base_/det_models/panet_r18_fpem_ffm.py new file mode 100644 index 00000000..56465126 --- /dev/null +++ b/configs/_base_/det_models/panet_r18_fpem_ffm.py @@ -0,0 +1,43 @@ +model_poly = dict( + type='PANet', + backbone=dict( + type='mmdet.ResNet', + depth=18, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'), + norm_eval=True, + style='caffe'), + neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]), + bbox_head=dict( + type='PANHead', + text_repr_type='poly', + in_channels=[128, 128, 128, 128], + out_channels=6, + loss=dict(type='PANLoss')), + train_cfg=None, + test_cfg=None) + +model_quad = dict( + type='PANet', + backbone=dict( + type='mmdet.ResNet', + depth=18, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'), + norm_eval=True, + style='caffe'), + neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]), + bbox_head=dict( + type='PANHead', + text_repr_type='quad', + in_channels=[128, 128, 128, 128], + out_channels=6, + loss=dict(type='PANLoss')), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/det_models/panet_r50_fpem_ffm.py b/configs/_base_/det_models/panet_r50_fpem_ffm.py new file mode 100644 index 00000000..99a115e0 --- /dev/null +++ b/configs/_base_/det_models/panet_r50_fpem_ffm.py @@ -0,0 +1,20 @@ +model = dict( + type='PANet', + pretrained='torchvision://resnet50', + backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='caffe'), + neck=dict(type='FPEM_FFM', in_channels=[256, 512, 1024, 2048]), + bbox_head=dict( + type='PANHead', + in_channels=[128, 128, 128, 128], + out_channels=6, + loss=dict(type='PANLoss', speedup_bbox_thr=32)), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/det_models/psenet_r50_fpnf.py b/configs/_base_/det_models/psenet_r50_fpnf.py new file mode 100644 index 00000000..c9669d41 --- /dev/null +++ b/configs/_base_/det_models/psenet_r50_fpnf.py @@ -0,0 +1,51 @@ +model_poly = dict( + type='PSENet', + backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPNF', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + fusion_type='concat'), + bbox_head=dict( + type='PSEHead', + text_repr_type='poly', + in_channels=[256], + out_channels=7, + loss=dict(type='PSELoss')), + train_cfg=None, + test_cfg=None) + +model_quad = dict( + type='PSENet', + backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN', requires_grad=True), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPNF', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + fusion_type='concat'), + bbox_head=dict( + type='PSEHead', + text_repr_type='quad', + in_channels=[256], + out_channels=7, + loss=dict(type='PSELoss')), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/det_models/textsnake_r50_fpn_unet.py b/configs/_base_/det_models/textsnake_r50_fpn_unet.py new file mode 100644 index 00000000..81c3b6b2 --- /dev/null +++ b/configs/_base_/det_models/textsnake_r50_fpn_unet.py @@ -0,0 +1,21 @@ +model = dict( + type='TextSnake', + backbone=dict( + type='mmdet.ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=True), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32), + bbox_head=dict( + type='TextSnakeHead', + in_channels=32, + text_repr_type='poly', + loss=dict(type='TextSnakeLoss')), + train_cfg=None, + test_cfg=None) diff --git a/configs/_base_/det_pipelines/dbnet_pipeline.py b/configs/_base_/det_pipelines/dbnet_pipeline.py new file mode 100644 index 00000000..f243b91d --- /dev/null +++ b/configs/_base_/det_pipelines/dbnet_pipeline.py @@ -0,0 +1,88 @@ +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +train_pipeline_r18 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='LoadTextAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), + dict(type='Normalize', **img_norm_cfg), + dict( + type='ImgAug', + args=[['Fliplr', 0.5], + dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]), + dict(type='EastRandomCrop', target_size=(640, 640)), + dict(type='DBNetTargets', shrink_ratio=0.4), + dict(type='Pad', size_divisor=32), + dict( + type='CustomFormatBundle', + keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'], + visualize=dict(flag=False, boundary_key='gt_shrink')), + dict( + type='Collect', + keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask']) +] + +test_pipeline_1333_736 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 736), + flip=False, + transforms=[ + dict(type='Resize', img_scale=(2944, 736), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# for dbnet_r50dcnv2_fpnc +img_norm_cfg_r50dcnv2 = dict( + mean=[122.67891434, 116.66876762, 104.00698793], + std=[58.395, 57.12, 57.375], + to_rgb=True) + +train_pipeline_r50dcnv2 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='LoadTextAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), + dict(type='Normalize', **img_norm_cfg_r50dcnv2), + dict( + type='ImgAug', + args=[['Fliplr', 0.5], + dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]), + dict(type='EastRandomCrop', target_size=(640, 640)), + dict(type='DBNetTargets', shrink_ratio=0.4), + dict(type='Pad', size_divisor=32), + dict( + type='CustomFormatBundle', + keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'], + visualize=dict(flag=False, boundary_key='gt_shrink')), + dict( + type='Collect', + keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask']) +] + +test_pipeline_4068_1024 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=(4068, 1024), + flip=False, + transforms=[ + dict(type='Resize', img_scale=(2944, 736), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg_r50dcnv2), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] diff --git a/configs/_base_/det_pipelines/drrg_pipeline.py b/configs/_base_/det_pipelines/drrg_pipeline.py new file mode 100644 index 00000000..2a169149 --- /dev/null +++ b/configs/_base_/det_pipelines/drrg_pipeline.py @@ -0,0 +1,60 @@ +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +train_pipeline = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='LoadTextAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='RandomScaling', size=800, scale=(0.75, 2.5)), + dict( + type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2), + dict( + type='RandomCropPolyInstances', + instance_key='gt_masks', + crop_ratio=0.8, + min_side_ratio=0.3), + dict( + type='RandomRotatePolyInstances', + rotate_ratio=0.5, + max_angle=60, + pad_with_fixed_color=False), + dict(type='SquareResizePad', target_size=800, pad_ratio=0.6), + dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), + dict(type='DRRGTargets'), + dict(type='Pad', size_divisor=32), + dict( + type='CustomFormatBundle', + keys=[ + 'gt_text_mask', 'gt_center_region_mask', 'gt_mask', + 'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map', + 'gt_cos_map', 'gt_comp_attribs' + ], + visualize=dict(flag=False, boundary_key='gt_text_mask')), + dict( + type='Collect', + keys=[ + 'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask', + 'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map', + 'gt_cos_map', 'gt_comp_attribs' + ]) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=(1024, 640), + flip=False, + transforms=[ + dict(type='Resize', img_scale=(1024, 640), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] diff --git a/configs/_base_/det_pipelines/fcenet_pipeline.py b/configs/_base_/det_pipelines/fcenet_pipeline.py new file mode 100644 index 00000000..b1be6b22 --- /dev/null +++ b/configs/_base_/det_pipelines/fcenet_pipeline.py @@ -0,0 +1,118 @@ +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# for icdar2015 +leval_prop_range_icdar2015 = ((0, 0.4), (0.3, 0.7), (0.6, 1.0)) +train_pipeline_icdar2015 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='LoadTextAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='ColorJitter', + brightness=32.0 / 255, + saturation=0.5, + contrast=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)), + dict( + type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2), + dict( + type='RandomCropPolyInstances', + instance_key='gt_masks', + crop_ratio=0.8, + min_side_ratio=0.3), + dict( + type='RandomRotatePolyInstances', + rotate_ratio=0.5, + max_angle=30, + pad_with_fixed_color=False), + dict(type='SquareResizePad', target_size=800, pad_ratio=0.6), + dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), + dict(type='Pad', size_divisor=32), + dict( + type='FCENetTargets', + fourier_degree=5, + level_proportion_range=leval_prop_range_icdar2015), + dict( + type='CustomFormatBundle', + keys=['p3_maps', 'p4_maps', 'p5_maps'], + visualize=dict(flag=False, boundary_key=None)), + dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps']) +] + +img_scale_icdar2015 = (2260, 2260) +test_pipeline_icdar2015 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale_icdar2015, + flip=False, + transforms=[ + dict(type='Resize', img_scale=(1280, 800), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# for ctw1500 +leval_prop_range_ctw1500 = ((0, 0.25), (0.2, 0.65), (0.55, 1.0)) +train_pipeline_ctw1500 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='LoadTextAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='ColorJitter', + brightness=32.0 / 255, + saturation=0.5, + contrast=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)), + dict( + type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2), + dict( + type='RandomCropPolyInstances', + instance_key='gt_masks', + crop_ratio=0.8, + min_side_ratio=0.3), + dict( + type='RandomRotatePolyInstances', + rotate_ratio=0.5, + max_angle=30, + pad_with_fixed_color=False), + dict(type='SquareResizePad', target_size=800, pad_ratio=0.6), + dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), + dict(type='Pad', size_divisor=32), + dict( + type='FCENetTargets', + fourier_degree=5, + level_proportion_range=leval_prop_range_ctw1500), + dict( + type='CustomFormatBundle', + keys=['p3_maps', 'p4_maps', 'p5_maps'], + visualize=dict(flag=False, boundary_key=None)), + dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps']) +] + +img_scale_ctw1500 = (1080, 736) +test_pipeline_ctw1500 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale_ctw1500, + flip=False, + transforms=[ + dict(type='Resize', img_scale=(1280, 800), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] diff --git a/configs/_base_/det_pipelines/maskrcnn_pipeline.py b/configs/_base_/det_pipelines/maskrcnn_pipeline.py new file mode 100644 index 00000000..f9301025 --- /dev/null +++ b/configs/_base_/det_pipelines/maskrcnn_pipeline.py @@ -0,0 +1,57 @@ +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +train_pipeline = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='ScaleAspectJitter', + img_scale=None, + keep_ratio=False, + resize_type='indep_sample_in_range', + scale_range=(640, 2560)), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict( + type='RandomCropInstances', + target_size=(640, 640), + mask_type='union_all', + instance_key='gt_masks'), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] + +# for ctw1500 +img_scale_ctw1500 = (1600, 1600) +test_pipeline_ctw1500 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale_ctw1500, + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# for icdar2015 +img_scale_icdar2015 = (1920, 1920) +test_pipeline_icdar2015 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale_icdar2015, + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] diff --git a/configs/_base_/det_pipelines/panet_pipeline.py b/configs/_base_/det_pipelines/panet_pipeline.py new file mode 100644 index 00000000..36d239b4 --- /dev/null +++ b/configs/_base_/det_pipelines/panet_pipeline.py @@ -0,0 +1,156 @@ +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# for ctw1500 +img_scale_train_ctw1500 = [(3000, 640)] +shrink_ratio_train_ctw1500 = (1.0, 0.7) +target_size_train_ctw1500 = (640, 640) +train_pipeline_ctw1500 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='LoadTextAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), + dict(type='Normalize', **img_norm_cfg), + dict( + type='ScaleAspectJitter', + img_scale=img_scale_train_ctw1500, + ratio_range=(0.7, 1.3), + aspect_ratio_range=(0.9, 1.1), + multiscale_mode='value', + keep_ratio=False), + # shrink_ratio is from big to small. The 1st must be 1.0 + dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_ctw1500), + dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), + dict(type='RandomRotateTextDet'), + dict( + type='RandomCropInstances', + target_size=target_size_train_ctw1500, + instance_key='gt_kernels'), + dict(type='Pad', size_divisor=32), + dict( + type='CustomFormatBundle', + keys=['gt_kernels', 'gt_mask'], + visualize=dict(flag=False, boundary_key='gt_kernels')), + dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) +] + +img_scale_test_ctw1500 = (3000, 640) +test_pipeline_ctw1500 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale_test_ctw1500, + flip=False, + transforms=[ + dict(type='Resize', img_scale=(3000, 640), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# for icdar2015 +img_scale_train_icdar2015 = [(3000, 736)] +shrink_ratio_train_icdar2015 = (1.0, 0.5) +target_size_train_icdar2015 = (736, 736) +train_pipeline_icdar2015 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='LoadTextAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), + dict(type='Normalize', **img_norm_cfg), + dict( + type='ScaleAspectJitter', + img_scale=img_scale_train_icdar2015, + ratio_range=(0.7, 1.3), + aspect_ratio_range=(0.9, 1.1), + multiscale_mode='value', + keep_ratio=False), + dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_icdar2015), + dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), + dict(type='RandomRotateTextDet'), + dict( + type='RandomCropInstances', + target_size=target_size_train_icdar2015, + instance_key='gt_kernels'), + dict(type='Pad', size_divisor=32), + dict( + type='CustomFormatBundle', + keys=['gt_kernels', 'gt_mask'], + visualize=dict(flag=False, boundary_key='gt_kernels')), + dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) +] + +img_scale_test_icdar2015 = (1333, 736) +test_pipeline_icdar2015 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale_test_icdar2015, + flip=False, + transforms=[ + dict(type='Resize', img_scale=(3000, 640), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# for icdar2017 +img_scale_train_icdar2017 = [(3000, 800)] +shrink_ratio_train_icdar2017 = (1.0, 0.5) +target_size_train_icdar2017 = (800, 800) +train_pipeline_icdar2017 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='LoadTextAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), + dict(type='Normalize', **img_norm_cfg), + dict( + type='ScaleAspectJitter', + img_scale=img_scale_train_icdar2017, + ratio_range=(0.7, 1.3), + aspect_ratio_range=(0.9, 1.1), + multiscale_mode='value', + keep_ratio=False), + dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_icdar2017), + dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), + dict(type='RandomRotateTextDet'), + dict( + type='RandomCropInstances', + target_size=target_size_train_icdar2017, + instance_key='gt_kernels'), + dict(type='Pad', size_divisor=32), + dict( + type='CustomFormatBundle', + keys=['gt_kernels', 'gt_mask'], + visualize=dict(flag=False, boundary_key='gt_kernels')), + dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) +] + +img_scale_test_icdar2017 = (1333, 800) +test_pipeline_icdar2017 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale_test_icdar2017, + flip=False, + transforms=[ + dict(type='Resize', img_scale=(3000, 640), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] diff --git a/configs/_base_/det_pipelines/psenet_pipeline.py b/configs/_base_/det_pipelines/psenet_pipeline.py new file mode 100644 index 00000000..004dd63a --- /dev/null +++ b/configs/_base_/det_pipelines/psenet_pipeline.py @@ -0,0 +1,70 @@ +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +train_pipeline = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='LoadTextAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), + dict(type='Normalize', **img_norm_cfg), + dict( + type='ScaleAspectJitter', + img_scale=[(3000, 736)], + ratio_range=(0.5, 3), + aspect_ratio_range=(1, 1), + multiscale_mode='value', + long_size_bound=1280, + short_size_bound=640, + resize_type='long_short_bound', + keep_ratio=False), + dict(type='PSENetTargets'), + dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), + dict(type='RandomRotateTextDet'), + dict( + type='RandomCropInstances', + target_size=(640, 640), + instance_key='gt_kernels'), + dict(type='Pad', size_divisor=32), + dict( + type='CustomFormatBundle', + keys=['gt_kernels', 'gt_mask'], + visualize=dict(flag=False, boundary_key='gt_kernels')), + dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) +] + +# for ctw1500 +img_scale_test_ctw1500 = (1280, 1280) +test_pipeline_ctw1500 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale_test_ctw1500, + flip=False, + transforms=[ + dict(type='Resize', img_scale=(1280, 1280), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] + +# for icdar2015 +img_scale_test_icdar2015 = (2240, 2240) +test_pipeline_icdar2015 = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=img_scale_test_icdar2015, + flip=False, + transforms=[ + dict(type='Resize', img_scale=(1280, 1280), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] diff --git a/configs/_base_/det_pipelines/textsnake_pipeline.py b/configs/_base_/det_pipelines/textsnake_pipeline.py new file mode 100644 index 00000000..583abec2 --- /dev/null +++ b/configs/_base_/det_pipelines/textsnake_pipeline.py @@ -0,0 +1,65 @@ +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +train_pipeline = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='LoadTextAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), + dict(type='Normalize', **img_norm_cfg), + dict( + type='RandomCropPolyInstances', + instance_key='gt_masks', + crop_ratio=0.65, + min_side_ratio=0.3), + dict( + type='RandomRotatePolyInstances', + rotate_ratio=0.5, + max_angle=20, + pad_with_fixed_color=False), + dict( + type='ScaleAspectJitter', + img_scale=[(3000, 736)], # unused + ratio_range=(0.7, 1.3), + aspect_ratio_range=(0.9, 1.1), + multiscale_mode='value', + long_size_bound=800, + short_size_bound=480, + resize_type='long_short_bound', + keep_ratio=False), + dict(type='SquareResizePad', target_size=800, pad_ratio=0.6), + dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), + dict(type='TextSnakeTargets'), + dict(type='Pad', size_divisor=32), + dict( + type='CustomFormatBundle', + keys=[ + 'gt_text_mask', 'gt_center_region_mask', 'gt_mask', + 'gt_radius_map', 'gt_sin_map', 'gt_cos_map' + ], + visualize=dict(flag=False, boundary_key='gt_text_mask')), + dict( + type='Collect', + keys=[ + 'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask', + 'gt_radius_map', 'gt_sin_map', 'gt_cos_map' + ]) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 736), + flip=False, + transforms=[ + dict(type='Resize', img_scale=(1333, 736), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] diff --git a/configs/_base_/schedules/schedule_adam_step_600e.py b/configs/_base_/schedules/schedule_adam_step_600e.py new file mode 100644 index 00000000..a861e821 --- /dev/null +++ b/configs/_base_/schedules/schedule_adam_step_600e.py @@ -0,0 +1,6 @@ +# optimizer +optimizer = dict(type='Adam', lr=1e-4) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict(policy='step', step=[200, 400]) +total_epochs = 600 diff --git a/configs/_base_/schedules/schedule_1200e.py b/configs/_base_/schedules/schedule_sgd_1200e.py similarity index 100% rename from configs/_base_/schedules/schedule_1200e.py rename to configs/_base_/schedules/schedule_sgd_1200e.py diff --git a/configs/_base_/schedules/schedule_sgd_1500e.py b/configs/_base_/schedules/schedule_sgd_1500e.py new file mode 100644 index 00000000..63a1e2dd --- /dev/null +++ b/configs/_base_/schedules/schedule_sgd_1500e.py @@ -0,0 +1,5 @@ +# optimizer +optimizer = dict(type='SGD', lr=1e-3, momentum=0.90, weight_decay=5e-4) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='poly', power=0.9, min_lr=1e-7, by_epoch=True) +total_epochs = 1500 diff --git a/configs/_base_/schedules/schedule_160e.py b/configs/_base_/schedules/schedule_sgd_160e.py similarity index 100% rename from configs/_base_/schedules/schedule_160e.py rename to configs/_base_/schedules/schedule_sgd_160e.py diff --git a/configs/textdet/dbnet/dbnet_r18_fpnc_1200e_icdar2015.py b/configs/textdet/dbnet/dbnet_r18_fpnc_1200e_icdar2015.py index 4c9ed225..997668f2 100644 --- a/configs/textdet/dbnet/dbnet_r18_fpnc_1200e_icdar2015.py +++ b/configs/textdet/dbnet/dbnet_r18_fpnc_1200e_icdar2015.py @@ -1,98 +1,33 @@ _base_ = [ - '../../_base_/schedules/schedule_1200e.py', '../../_base_/runtime_10e.py' + '../../_base_/runtime_10e.py', + '../../_base_/schedules/schedule_sgd_1200e.py', + '../../_base_/det_models/dbnet_r18_fpnc.py', + '../../_base_/det_datasets/icdar2015.py', + '../../_base_/det_pipelines/dbnet_pipeline.py' ] -model = dict( - type='DBNet', - backbone=dict( - type='mmdet.ResNet', - depth=18, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=True), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'), - norm_eval=False, - style='caffe'), - neck=dict( - type='FPNC', in_channels=[64, 128, 256, 512], lateral_channels=256), - bbox_head=dict( - type='DBHead', - text_repr_type='quad', - in_channels=256, - loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True)), - train_cfg=None, - test_cfg=None) -dataset_type = 'IcdarDataset' -data_root = 'data/icdar2015' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -# for visualizing img, pls uncomment it. -# img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True) +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline_r18 = {{_base_.train_pipeline_r18}} +test_pipeline_1333_736 = {{_base_.test_pipeline_1333_736}} -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - # img aug - dict( - type='ImgAug', - args=[['Fliplr', 0.5], - dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]), - # random crop - dict(type='EastRandomCrop', target_size=(640, 640)), - dict(type='DBNetTargets', shrink_ratio=0.4), - dict(type='Pad', size_divisor=32), - # for visualizing img and gts, pls set visualize = True - dict( - type='CustomFormatBundle', - keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'], - visualize=dict(flag=False, boundary_key='gt_shrink')), - dict( - type='Collect', - keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(1333, 736), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(2944, 736), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] data = dict( samples_per_gpu=16, workers_per_gpu=8, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - # for debugging top k imgs - # select_first_k=200, - img_prefix=data_root + '/imgs', - pipeline=train_pipeline), + type='UniformConcatDataset', + datasets=train_list, + pipeline=train_pipeline_r18), val=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - # select_first_k=100, - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_1333_736), test=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - # select_first_k=100, - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_1333_736)) + evaluation = dict(interval=100, metric='hmean-iou') diff --git a/configs/textdet/dbnet/dbnet_r50dcnv2_fpnc_1200e_icdar2015.py b/configs/textdet/dbnet/dbnet_r50dcnv2_fpnc_1200e_icdar2015.py index 4ce70397..bd0b8c84 100644 --- a/configs/textdet/dbnet/dbnet_r50dcnv2_fpnc_1200e_icdar2015.py +++ b/configs/textdet/dbnet/dbnet_r50dcnv2_fpnc_1200e_icdar2015.py @@ -1,104 +1,35 @@ _base_ = [ - '../../_base_/schedules/schedule_1200e.py', '../../_base_/runtime_10e.py' + '../../_base_/runtime_10e.py', + '../../_base_/schedules/schedule_sgd_1200e.py', + '../../_base_/det_models/dbnet_r50dcnv2_fpnc.py', + '../../_base_/det_datasets/icdar2015.py', + '../../_base_/det_pipelines/dbnet_pipeline.py' ] + +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline_r50dcnv2 = {{_base_.train_pipeline_r50dcnv2}} +test_pipeline_4068_1024 = {{_base_.test_pipeline_4068_1024}} + load_from = 'checkpoints/textdet/dbnet/res50dcnv2_synthtext.pth' -model = dict( - type='DBNet', - backbone=dict( - type='mmdet.ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=False, - style='pytorch', - dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - stage_with_dcn=(False, True, True, True)), - neck=dict( - type='FPNC', in_channels=[256, 512, 1024, 2048], lateral_channels=256), - bbox_head=dict( - type='DBHead', - text_repr_type='quad', - in_channels=256, - loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True)), - train_cfg=None, - test_cfg=None) - -dataset_type = 'IcdarDataset' -data_root = 'data/icdar2015/' -img_norm_cfg = dict( - mean=[122.67891434, 116.66876762, 104.00698793], - std=[58.395, 57.12, 57.375], - to_rgb=True) -# for visualizing img, pls uncomment it. -# img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True) - -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - # img aug - dict( - type='ImgAug', - args=[['Fliplr', 0.5], - dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]), - # random crop - dict(type='EastRandomCrop', target_size=(640, 640)), - dict(type='DBNetTargets', shrink_ratio=0.4), - dict(type='Pad', size_divisor=32), - # for visualizing img and gts, pls set visualize = True - dict( - type='CustomFormatBundle', - keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'], - visualize=dict(flag=False, boundary_key='gt_shrink')), - dict( - type='Collect', - keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(4068, 1024), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(4068, 1024), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] data = dict( samples_per_gpu=8, workers_per_gpu=4, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - # for debugging top k imgs - # select_first_k=200, - img_prefix=data_root + '/imgs', - pipeline=train_pipeline), + type='UniformConcatDataset', + datasets=train_list, + pipeline=train_pipeline_r50dcnv2), val=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - # select_first_k=100, - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_4068_1024), test=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - # select_first_k=100, - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_4068_1024)) + evaluation = dict(interval=100, metric='hmean-iou') diff --git a/configs/textdet/drrg/drrg_r50_fpn_unet_1200e_ctw1500.py b/configs/textdet/drrg/drrg_r50_fpn_unet_1200e_ctw1500.py index 2c1671d0..e30b1a74 100644 --- a/configs/textdet/drrg/drrg_r50_fpn_unet_1200e_ctw1500.py +++ b/configs/textdet/drrg/drrg_r50_fpn_unet_1200e_ctw1500.py @@ -1,112 +1,33 @@ _base_ = [ - '../../_base_/schedules/schedule_1200e.py', - '../../_base_/default_runtime.py' + '../../_base_/schedules/schedule_sgd_1200e.py', + '../../_base_/default_runtime.py', + '../../_base_/det_models/drrg_r50_fpn_unet.py', + '../../_base_/det_datasets/ctw1500.py', + '../../_base_/det_pipelines/drrg_pipeline.py' ] -model = dict( - type='DRRG', - backbone=dict( - type='mmdet.ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=True), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - norm_eval=True, - style='caffe'), - neck=dict( - type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32), - bbox_head=dict( - type='DRRGHead', - in_channels=32, - text_region_thr=0.3, - center_region_thr=0.4, - link_thr=0.80, - loss=dict(type='DRRGLoss'))) -train_cfg = None -test_cfg = None -dataset_type = 'IcdarDataset' -data_root = 'data/ctw1500/' +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = {{_base_.train_pipeline}} +test_pipeline = {{_base_.test_pipeline}} -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='RandomScaling', size=800, scale=(0.75, 2.5)), - dict( - type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2), - dict( - type='RandomCropPolyInstances', - instance_key='gt_masks', - crop_ratio=0.8, - min_side_ratio=0.3), - dict( - type='RandomRotatePolyInstances', - rotate_ratio=0.5, - max_angle=60, - pad_with_fixed_color=False), - dict(type='SquareResizePad', target_size=800, pad_ratio=0.6), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='DRRGTargets'), - dict(type='Pad', size_divisor=32), - dict( - type='CustomFormatBundle', - keys=[ - 'gt_text_mask', 'gt_center_region_mask', 'gt_mask', - 'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map', - 'gt_cos_map', 'gt_comp_attribs' - ], - visualize=dict(flag=False, boundary_key='gt_text_mask')), - dict( - type='Collect', - keys=[ - 'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask', - 'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map', - 'gt_cos_map', 'gt_comp_attribs' - ]) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(1024, 640), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(1024, 640), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] data = dict( samples_per_gpu=4, workers_per_gpu=4, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=f'{data_root}/instances_training.json', - img_prefix=f'{data_root}/imgs', + type='UniformConcatDataset', + datasets=train_list, pipeline=train_pipeline), val=dict( - type=dataset_type, - ann_file=f'{data_root}/instances_test.json', - img_prefix=f'{data_root}/imgs', + type='UniformConcatDataset', + datasets=test_list, pipeline=test_pipeline), test=dict( - type=dataset_type, - ann_file=f'{data_root}/instances_test.json', - img_prefix=f'{data_root}/imgs', + type='UniformConcatDataset', + datasets=test_list, pipeline=test_pipeline)) evaluation = dict(interval=20, metric='hmean-iou') diff --git a/configs/textdet/fcenet/fcenet_r50_fpn_1500e_icdar2015.py b/configs/textdet/fcenet/fcenet_r50_fpn_1500e_icdar2015.py index 42864aba..c17f892c 100644 --- a/configs/textdet/fcenet/fcenet_r50_fpn_1500e_icdar2015.py +++ b/configs/textdet/fcenet/fcenet_r50_fpn_1500e_icdar2015.py @@ -1,136 +1,33 @@ -fourier_degree = 5 -model = dict( - type='FCENet', - backbone=dict( - type='mmdet.ResNet', - depth=50, - num_stages=4, - out_indices=(1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=True), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - norm_eval=False, - style='pytorch'), - neck=dict( - type='mmdet.FPN', - in_channels=[512, 1024, 2048], - out_channels=256, - add_extra_convs='on_output', - num_outs=3, - relu_before_extra_convs=True, - act_cfg=None), - bbox_head=dict( - type='FCEHead', - in_channels=256, - scales=(8, 16, 32), - loss=dict(type='FCELoss'), - alpha=1.2, - beta=1.0, - text_repr_type='quad', - fourier_degree=fourier_degree, - )) - -train_cfg = None -test_cfg = None - -dataset_type = 'IcdarDataset' -data_root = 'data/icdar2015/' - -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) - -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict( - type='ColorJitter', - brightness=32.0 / 255, - saturation=0.5, - contrast=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)), - dict( - type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2), - dict( - type='RandomCropPolyInstances', - instance_key='gt_masks', - crop_ratio=0.8, - min_side_ratio=0.3), - dict( - type='RandomRotatePolyInstances', - rotate_ratio=0.5, - max_angle=30, - pad_with_fixed_color=False), - dict(type='SquareResizePad', target_size=800, pad_ratio=0.6), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='Pad', size_divisor=32), - dict( - type='FCENetTargets', - fourier_degree=fourier_degree, - level_proportion_range=((0, 0.4), (0.3, 0.7), (0.6, 1.0))), - dict( - type='CustomFormatBundle', - keys=['p3_maps', 'p4_maps', 'p5_maps'], - visualize=dict(flag=False, boundary_key=None)), - dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(2260, 2260), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(1280, 800), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) +_base_ = [ + '../../_base_/runtime_10e.py', + '../../_base_/schedules/schedule_sgd_1500e.py', + '../../_base_/det_models/fcenet_r50_fpn.py', + '../../_base_/det_datasets/icdar2015.py', + '../../_base_/det_pipelines/fcenet_pipeline.py' ] + +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline_icdar2015 = {{_base_.train_pipeline_icdar2015}} +test_pipeline_icdar2015 = {{_base_.test_pipeline_icdar2015}} + data = dict( samples_per_gpu=8, workers_per_gpu=2, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - img_prefix=data_root + '/imgs', - pipeline=train_pipeline), + type='UniformConcatDataset', + datasets=train_list, + pipeline=train_pipeline_icdar2015), val=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015), test=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline)) -evaluation = dict(interval=5, metric='hmean-iou') + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015)) -# optimizer -optimizer = dict(type='SGD', lr=1e-3, momentum=0.90, weight_decay=5e-4) -optimizer_config = dict(grad_clip=None) -lr_config = dict(policy='poly', power=0.9, min_lr=1e-7, by_epoch=True) -total_epochs = 1500 - -checkpoint_config = dict(interval=5) -# yapf:disable -log_config = dict( - interval=20, - hooks=[ - dict(type='TextLoggerHook') - - ]) -# yapf:enable -dist_params = dict(backend='nccl') -log_level = 'INFO' -load_from = None -resume_from = None -workflow = [('train', 1)] +evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/fcenet/fcenet_r50dcnv2_fpn_1500e_ctw1500.py b/configs/textdet/fcenet/fcenet_r50dcnv2_fpn_1500e_ctw1500.py index d6ceae2d..56ee4999 100644 --- a/configs/textdet/fcenet/fcenet_r50dcnv2_fpn_1500e_ctw1500.py +++ b/configs/textdet/fcenet/fcenet_r50dcnv2_fpn_1500e_ctw1500.py @@ -1,135 +1,33 @@ -fourier_degree = 5 -model = dict( - type='FCENet', - backbone=dict( - type='mmdet.ResNet', - depth=50, - num_stages=4, - out_indices=(1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=True, - style='pytorch', - dcn=dict(type='DCNv2', deform_groups=2, fallback_on_stride=False), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - stage_with_dcn=(False, True, True, True)), - neck=dict( - type='mmdet.FPN', - in_channels=[512, 1024, 2048], - out_channels=256, - add_extra_convs='on_output', - num_outs=3, - relu_before_extra_convs=True, - act_cfg=None), - bbox_head=dict( - type='FCEHead', - in_channels=256, - scales=(8, 16, 32), - loss=dict(type='FCELoss'), - fourier_degree=fourier_degree, - )) - -train_cfg = None -test_cfg = None - -dataset_type = 'IcdarDataset' -data_root = 'data/ctw1500/' - -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) - -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict( - type='ColorJitter', - brightness=32.0 / 255, - saturation=0.5, - contrast=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)), - dict( - type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2), - dict( - type='RandomCropPolyInstances', - instance_key='gt_masks', - crop_ratio=0.8, - min_side_ratio=0.3), - dict( - type='RandomRotatePolyInstances', - rotate_ratio=0.5, - max_angle=30, - pad_with_fixed_color=False), - dict(type='SquareResizePad', target_size=800, pad_ratio=0.6), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='Pad', size_divisor=32), - dict( - type='FCENetTargets', - fourier_degree=fourier_degree, - level_proportion_range=((0, 0.25), (0.2, 0.65), (0.55, 1.0))), - dict( - type='CustomFormatBundle', - keys=['p3_maps', 'p4_maps', 'p5_maps'], - visualize=dict(flag=False, boundary_key=None)), - dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(1080, 736), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(1280, 800), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) +_base_ = [ + '../../_base_/runtime_10e.py', + '../../_base_/schedules/schedule_sgd_1500e.py', + '../../_base_/det_models/fcenet_r50dcnv2_fpn.py', + '../../_base_/det_datasets/ctw1500.py', + '../../_base_/det_pipelines/fcenet_pipeline.py' ] + +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline_ctw1500 = {{_base_.train_pipeline_ctw1500}} +test_pipeline_ctw1500 = {{_base_.test_pipeline_ctw1500}} + data = dict( samples_per_gpu=6, workers_per_gpu=2, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - img_prefix=data_root + '/imgs', - pipeline=train_pipeline), + type='UniformConcatDataset', + datasets=train_list, + pipeline=train_pipeline_ctw1500), val=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_ctw1500), test=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline)) -evaluation = dict(interval=5, metric='hmean-iou') + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_ctw1500)) -# optimizer -optimizer = dict(type='SGD', lr=1e-3, momentum=0.90, weight_decay=5e-4) -optimizer_config = dict(grad_clip=None) -lr_config = dict(policy='poly', power=0.9, min_lr=1e-7, by_epoch=True) -total_epochs = 1500 - -checkpoint_config = dict(interval=5) -# yapf:disable -log_config = dict( - interval=20, - hooks=[ - dict(type='TextLoggerHook') - - ]) -# yapf:enable -dist_params = dict(backend='nccl') -log_level = 'INFO' -load_from = None -resume_from = None -workflow = [('train', 1)] +evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_ctw1500.py b/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_ctw1500.py index ace32ad4..42b7e7b8 100644 --- a/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_ctw1500.py +++ b/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_ctw1500.py @@ -1,69 +1,33 @@ _base_ = [ - '../../_base_/models/ocr_mask_rcnn_r50_fpn_ohem_poly.py', - '../../_base_/schedules/schedule_160e.py', '../../_base_/runtime_10e.py' + '../../_base_/runtime_10e.py', + '../../_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py', + '../../_base_/schedules/schedule_sgd_160e.py', + '../../_base_/det_datasets/ctw1500.py', + '../../_base_/det_pipelines/maskrcnn_pipeline.py' ] -dataset_type = 'IcdarDataset' -data_root = 'data/ctw1500/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -# img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True) -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='ScaleAspectJitter', - img_scale=None, - keep_ratio=False, - resize_type='indep_sample_in_range', - scale_range=(640, 2560)), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='RandomCropInstances', - target_size=(640, 640), - mask_type='union_all', - instance_key='gt_masks'), - dict(type='Pad', size_divisor=32), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - # resize the long size to 1600 - img_scale=(1600, 1600), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - # no flip - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline = {{_base_.train_pipeline}} +test_pipeline_ctw1500 = {{_base_.test_pipeline_ctw1500}} + data = dict( samples_per_gpu=8, workers_per_gpu=4, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - img_prefix=data_root + '/imgs', + type='UniformConcatDataset', + datasets=train_list, pipeline=train_pipeline), val=dict( - type=dataset_type, - # select_first_k=1, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_ctw1500), test=dict( - type=dataset_type, - # select_first_k=1, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_ctw1500)) + evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2015.py b/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2015.py index a1c31f8d..efffa12b 100644 --- a/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2015.py +++ b/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2015.py @@ -1,68 +1,33 @@ _base_ = [ - '../../_base_/models/ocr_mask_rcnn_r50_fpn_ohem.py', - '../../_base_/schedules/schedule_160e.py', '../../_base_/runtime_10e.py' -] -dataset_type = 'IcdarDataset' -data_root = 'data/icdar2015/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -# img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True) -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='ScaleAspectJitter', - img_scale=None, - keep_ratio=False, - resize_type='indep_sample_in_range', - scale_range=(640, 2560)), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='RandomCropInstances', - target_size=(640, 640), - mask_type='union_all', - instance_key='gt_masks'), - dict(type='Pad', size_divisor=32), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - # resize the long size to 1600 - img_scale=(1920, 1920), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - # no flip - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) + '../../_base_/runtime_10e.py', + '../../_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py', + '../../_base_/schedules/schedule_sgd_160e.py', + '../../_base_/det_datasets/icdar2015.py', + '../../_base_/det_pipelines/maskrcnn_pipeline.py' ] + +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline = {{_base_.train_pipeline}} +test_pipeline_icdar2015 = {{_base_.test_pipeline_icdar2015}} + data = dict( samples_per_gpu=8, workers_per_gpu=4, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - img_prefix=data_root + '/imgs', + type='UniformConcatDataset', + datasets=train_list, pipeline=train_pipeline), val=dict( - type=dataset_type, - # select_first_k=1, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015), test=dict( - type=dataset_type, - # select_first_k=1, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015)) + evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2017.py b/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2017.py index 088461af..b6b46ba4 100644 --- a/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2017.py +++ b/configs/textdet/maskrcnn/mask_rcnn_r50_fpn_160e_icdar2017.py @@ -1,69 +1,33 @@ _base_ = [ - '../../_base_/models/ocr_mask_rcnn_r50_fpn_ohem.py', - '../../_base_/schedules/schedule_160e.py', '../../_base_/runtime_10e.py' + '../../_base_/runtime_10e.py', + '../../_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py', + '../../_base_/schedules/schedule_sgd_160e.py', + '../../_base_/det_datasets/icdar2017.py', + '../../_base_/det_pipelines/maskrcnn_pipeline.py' ] -dataset_type = 'IcdarDataset' -data_root = 'data/icdar2017/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -# img_norm_cfg = dict(mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True) -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict(type='LoadAnnotations', with_bbox=True, with_mask=True), - dict( - type='ScaleAspectJitter', - img_scale=None, - keep_ratio=False, - resize_type='indep_sample_in_range', - scale_range=(640, 2560)), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='RandomCropInstances', - target_size=(640, 640), - mask_type='union_all', - instance_key='gt_masks'), - dict(type='Pad', size_divisor=32), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - # resize the long size to 1600 - img_scale=(1600, 1600), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - # no flip - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline = {{_base_.train_pipeline}} +test_pipeline_icdar2015 = {{_base_.test_pipeline_icdar2015}} + data = dict( samples_per_gpu=8, workers_per_gpu=4, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - img_prefix=data_root + '/imgs', + type='UniformConcatDataset', + datasets=train_list, pipeline=train_pipeline), val=dict( - type=dataset_type, - # select_first_k=1, - ann_file=data_root + '/instances_val.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015), test=dict( - type=dataset_type, - # select_first_k=1, - ann_file=data_root + '/instances_val.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015)) + evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/panet/panet_r18_fpem_ffm_600e_ctw1500.py b/configs/textdet/panet/panet_r18_fpem_ffm_600e_ctw1500.py index 01573df1..b564a1aa 100644 --- a/configs/textdet/panet/panet_r18_fpem_ffm_600e_ctw1500.py +++ b/configs/textdet/panet/panet_r18_fpem_ffm_600e_ctw1500.py @@ -1,106 +1,35 @@ _base_ = [ '../../_base_/schedules/schedule_adam_600e.py', - '../../_base_/runtime_10e.py' + '../../_base_/runtime_10e.py', + '../../_base_/det_models/panet_r18_fpem_ffm.py', + '../../_base_/det_datasets/ctw1500.py', + '../../_base_/det_pipelines/panet_pipeline.py' ] -model = dict( - type='PANet', - backbone=dict( - type='mmdet.ResNet', - depth=18, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='SyncBN', requires_grad=True), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'), - norm_eval=True, - style='caffe'), - neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]), - bbox_head=dict( - type='PANHead', - text_repr_type='poly', - in_channels=[128, 128, 128, 128], - out_channels=6, - loss=dict(type='PANLoss')), - train_cfg=None, - test_cfg=None) -dataset_type = 'IcdarDataset' -data_root = 'data/ctw1500/' +model = {{_base_.model_poly}} -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -# for visualizing img, pls uncomment it. -# img_norm_cfg = dict( -# mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True) +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline_ctw1500 = {{_base_.train_pipeline_ctw1500}} +test_pipeline_ctw1500 = {{_base_.test_pipeline_ctw1500}} -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='ScaleAspectJitter', - img_scale=[(3000, 640)], - ratio_range=(0.7, 1.3), - aspect_ratio_range=(0.9, 1.1), - multiscale_mode='value', - keep_ratio=False), - # shrink_ratio is from big to small. The 1st must be 1.0 - dict(type='PANetTargets', shrink_ratio=(1.0, 0.7)), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='RandomRotateTextDet'), - dict( - type='RandomCropInstances', - target_size=(640, 640), - instance_key='gt_kernels'), - dict(type='Pad', size_divisor=32), - # for visualizing img and gts, pls set visualize = True - dict( - type='CustomFormatBundle', - keys=['gt_kernels', 'gt_mask'], - visualize=dict(flag=False, boundary_key='gt_kernels')), - dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(3000, 640), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(3000, 640), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] data = dict( samples_per_gpu=2, workers_per_gpu=2, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - # for debugging top k imgs - # select_first_k=200, - img_prefix=data_root + '/imgs', - pipeline=train_pipeline), + type='UniformConcatDataset', + datasets=train_list, + pipeline=train_pipeline_ctw1500), val=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - # select_first_k=100, - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_ctw1500), test=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - # select_first_k=100, - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_ctw1500)) + evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/panet/panet_r18_fpem_ffm_600e_icdar2015.py b/configs/textdet/panet/panet_r18_fpem_ffm_600e_icdar2015.py index 02000fa9..e06fcd85 100644 --- a/configs/textdet/panet/panet_r18_fpem_ffm_600e_icdar2015.py +++ b/configs/textdet/panet/panet_r18_fpem_ffm_600e_icdar2015.py @@ -1,104 +1,35 @@ _base_ = [ '../../_base_/schedules/schedule_adam_600e.py', - '../../_base_/runtime_10e.py' + '../../_base_/runtime_10e.py', + '../../_base_/det_models/panet_r18_fpem_ffm.py', + '../../_base_/det_datasets/icdar2015.py', + '../../_base_/det_pipelines/panet_pipeline.py' ] -model = dict( - type='PANet', - backbone=dict( - type='mmdet.ResNet', - depth=18, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='SyncBN', requires_grad=True), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'), - norm_eval=True, - style='caffe'), - neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]), - bbox_head=dict( - type='PANHead', - text_repr_type='quad', - in_channels=[128, 128, 128, 128], - out_channels=6, - loss=dict(type='PANLoss')), - train_cfg=None, - test_cfg=None) -dataset_type = 'IcdarDataset' -data_root = 'data/icdar2015/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -# for visualizing img, pls uncomment it. -# img_norm_cfg = dict( -# mean=[0, 0, 0], std=[1, 1, 1], to_rgb=True) +model = {{_base_.model_quad}} + +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline_icdar2015 = {{_base_.train_pipeline_icdar2015}} +test_pipeline_icdar2015 = {{_base_.test_pipeline_icdar2015}} -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='ScaleAspectJitter', - img_scale=[(3000, 736)], - ratio_range=(0.7, 1.3), - aspect_ratio_range=(0.9, 1.1), - multiscale_mode='value', - keep_ratio=False), - dict(type='PANetTargets', shrink_ratio=(1.0, 0.5), max_shrink=20), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='RandomRotateTextDet'), - dict( - type='RandomCropInstances', - target_size=(736, 736), - instance_key='gt_kernels'), - dict(type='Pad', size_divisor=32), - # for visualizing img and gts, pls set visualize = True - dict( - type='CustomFormatBundle', - keys=['gt_kernels', 'gt_mask'], - visualize=dict(flag=False, boundary_key='gt_kernels')), - dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(1333, 736), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(1333, 736), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] data = dict( samples_per_gpu=8, workers_per_gpu=2, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - # for debugging top k imgs - # select_first_k=200, - img_prefix=data_root + '/imgs', - pipeline=train_pipeline), + type='UniformConcatDataset', + datasets=train_list, + pipeline=train_pipeline_icdar2015), val=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - # select_first_k=100, - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015), test=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - # select_first_k=100, - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015)) + evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/panet/panet_r50_fpem_ffm_600e_icdar2017.py b/configs/textdet/panet/panet_r50_fpem_ffm_600e_icdar2017.py index 01486f42..9cb31143 100644 --- a/configs/textdet/panet/panet_r50_fpem_ffm_600e_icdar2017.py +++ b/configs/textdet/panet/panet_r50_fpem_ffm_600e_icdar2017.py @@ -1,95 +1,33 @@ _base_ = [ '../../_base_/schedules/schedule_adam_600e.py', - '../../_base_/runtime_10e.py' + '../../_base_/runtime_10e.py', + '../../_base_/det_models/panet_r50_fpem_ffm.py', + '../../_base_/det_datasets/icdar2017.py', + '../../_base_/det_pipelines/panet_pipeline.py' ] -model = dict( - type='PANet', - pretrained='torchvision://resnet50', - backbone=dict( - type='mmdet.ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=True, - style='caffe'), - neck=dict(type='FPEM_FFM', in_channels=[256, 512, 1024, 2048]), - bbox_head=dict( - type='PANHead', - in_channels=[128, 128, 128, 128], - out_channels=6, - loss=dict(type='PANLoss', speedup_bbox_thr=32)), - train_cfg=None, - test_cfg=None) -dataset_type = 'IcdarDataset' -data_root = 'data/icdar2017/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='ScaleAspectJitter', - img_scale=[(3000, 800)], - ratio_range=(0.7, 1.3), - aspect_ratio_range=(0.9, 1.1), - multiscale_mode='value', - keep_ratio=False), - dict(type='PANetTargets', shrink_ratio=(1.0, 0.5)), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='RandomRotateTextDet'), - dict( - type='RandomCropInstances', - target_size=(800, 800), - instance_key='gt_kernels'), - dict(type='Pad', size_divisor=32), - # for visualizing img and gts, pls set visualize = True - dict( - type='CustomFormatBundle', - keys=['gt_kernels', 'gt_mask'], - visualize=dict(flag=False, boundary_key='gt_kernels')), - dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(1333, 800), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline_icdar2017 = {{_base_.train_pipeline_icdar2017}} +test_pipeline_icdar2017 = {{_base_.test_pipeline_icdar2017}} + data = dict( samples_per_gpu=4, workers_per_gpu=4, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - img_prefix=data_root + '/imgs', - pipeline=train_pipeline), + type='UniformConcatDataset', + datasets=train_list, + pipeline=train_pipeline_icdar2017), val=dict( - type=dataset_type, - ann_file=data_root + '/instances_val.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2017), test=dict( - type=dataset_type, - ann_file=data_root + '/instances_val.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2017)) + evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/psenet/psenet_r50_fpnf_600e_ctw1500.py b/configs/textdet/psenet/psenet_r50_fpnf_600e_ctw1500.py index 12357fd9..483a2b2e 100644 --- a/configs/textdet/psenet/psenet_r50_fpnf_600e_ctw1500.py +++ b/configs/textdet/psenet/psenet_r50_fpnf_600e_ctw1500.py @@ -1,110 +1,35 @@ -_base_ = ['../../_base_/default_runtime.py'] - -# optimizer -optimizer = dict(type='Adam', lr=1e-4) -optimizer_config = dict(grad_clip=None) -# learning policy -lr_config = dict(policy='step', step=[200, 400]) -total_epochs = 600 - -model = dict( - type='PSENet', - backbone=dict( - type='mmdet.ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='SyncBN', requires_grad=True), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - norm_eval=True, - style='caffe'), - neck=dict( - type='FPNF', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - fusion_type='concat'), - bbox_head=dict( - type='PSEHead', - text_repr_type='poly', - in_channels=[256], - out_channels=7, - loss=dict(type='PSELoss')), - train_cfg=None, - test_cfg=None) - -dataset_type = 'IcdarDataset' -data_root = 'data/ctw1500/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) - -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='ScaleAspectJitter', - img_scale=[(3000, 736)], - ratio_range=(0.5, 3), - aspect_ratio_range=(1, 1), - multiscale_mode='value', - long_size_bound=1280, - short_size_bound=640, - resize_type='long_short_bound', - keep_ratio=False), - dict(type='PSENetTargets'), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='RandomRotateTextDet'), - dict( - type='RandomCropInstances', - target_size=(640, 640), - instance_key='gt_kernels'), - dict(type='Pad', size_divisor=32), - dict( - type='CustomFormatBundle', - keys=['gt_kernels', 'gt_mask'], - visualize=dict(flag=False, boundary_key='gt_kernels')), - dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(1280, 1280), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(1280, 1280), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) +_base_ = [ + '../../_base_/default_runtime.py', + '../../_base_/schedules/schedule_adam_step_600e.py', + '../../_base_/det_models/psenet_r50_fpnf.py', + '../../_base_/det_datasets/ctw1500.py', + '../../_base_/det_pipelines/psenet_pipeline.py' ] +model = {{_base_.model_poly}} + +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline = {{_base_.train_pipeline}} +test_pipeline_ctw1500 = {{_base_.test_pipeline_ctw1500}} + data = dict( samples_per_gpu=2, workers_per_gpu=2, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - img_prefix=data_root + '/imgs', + type='UniformConcatDataset', + datasets=train_list, pipeline=train_pipeline), val=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_ctw1500), test=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_ctw1500)) evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2015.py b/configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2015.py index c5b495cc..f96d8a5d 100644 --- a/configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2015.py +++ b/configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2015.py @@ -1,110 +1,35 @@ -_base_ = ['../../_base_/runtime_10e.py'] - -# optimizer -optimizer = dict(type='Adam', lr=1e-4) -optimizer_config = dict(grad_clip=None) -# learning policy -lr_config = dict(policy='step', step=[200, 400]) -total_epochs = 600 - -model = dict( - type='PSENet', - backbone=dict( - type='mmdet.ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='SyncBN', requires_grad=True), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - norm_eval=True, - style='caffe'), - neck=dict( - type='FPNF', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - fusion_type='concat'), - bbox_head=dict( - type='PSEHead', - text_repr_type='quad', - in_channels=[256], - out_channels=7, - loss=dict(type='PSELoss')), - train_cfg=None, - test_cfg=None) - -dataset_type = 'IcdarDataset' -data_root = 'data/icdar2015/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) - -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='ScaleAspectJitter', - img_scale=[(3000, 736)], # unused - ratio_range=(0.5, 3), - aspect_ratio_range=(1, 1), - multiscale_mode='value', - long_size_bound=1280, - short_size_bound=640, - resize_type='long_short_bound', - keep_ratio=False), - dict(type='PSENetTargets'), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='RandomRotateTextDet'), - dict( - type='RandomCropInstances', - target_size=(640, 640), - instance_key='gt_kernels'), - dict(type='Pad', size_divisor=32), - dict( - type='CustomFormatBundle', - keys=['gt_kernels', 'gt_mask'], - visualize=dict(flag=False, boundary_key='gt_kernels')), - dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(2240, 2200), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(2240, 2200), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) +_base_ = [ + '../../_base_/runtime_10e.py', + '../../_base_/schedules/schedule_adam_step_600e.py', + '../../_base_/det_models/psenet_r50_fpnf.py', + '../../_base_/det_datasets/icdar2015.py', + '../../_base_/det_pipelines/psenet_pipeline.py' ] +model = {{_base_.model_quad}} + +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline = {{_base_.train_pipeline}} +test_pipeline_icdar2015 = {{_base_.test_pipeline_icdar2015}} + data = dict( samples_per_gpu=8, workers_per_gpu=2, val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - img_prefix=data_root + '/imgs', + type='UniformConcatDataset', + datasets=train_list, pipeline=train_pipeline), val=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015), test=dict( - type=dataset_type, - ann_file=data_root + '/instances_test.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015)) evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2017.py b/configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2017.py index 6e0e158d..acd40684 100644 --- a/configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2017.py +++ b/configs/textdet/psenet/psenet_r50_fpnf_600e_icdar2017.py @@ -1,85 +1,18 @@ _base_ = [ '../../_base_/schedules/schedule_sgd_600e.py', - '../../_base_/runtime_10e.py' + '../../_base_/runtime_10e.py', + '../../_base_/det_models/psenet_r50_fpnf.py', + '../../_base_/det_datasets/icdar2017.py', + '../../_base_/det_pipelines/psenet_pipeline.py' ] -model = dict( - type='PSENet', - backbone=dict( - type='mmdet.ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=True), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - norm_eval=True, - style='caffe'), - neck=dict( - type='FPNF', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - fusion_type='concat'), - bbox_head=dict( - type='PSEHead', - text_repr_type='quad', - in_channels=[256], - out_channels=7, - loss=dict(type='PSELoss')), - train_cfg=None, - test_cfg=None) -dataset_type = 'IcdarDataset' -data_root = 'data/icdar2017/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +model = {{_base_.model_quad}} -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='ScaleAspectJitter', - img_scale=[(3000, 736)], - ratio_range=(0.5, 3), - aspect_ratio_range=(1, 1), - multiscale_mode='value', - long_size_bound=1280, - short_size_bound=640, - resize_type='long_short_bound', - keep_ratio=False), - dict(type='PSENetTargets'), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='RandomRotateTextDet'), - dict( - type='RandomCropInstances', - target_size=(640, 640), - instance_key='gt_kernels'), - dict(type='Pad', size_divisor=32), - dict( - type='CustomFormatBundle', - keys=['gt_kernels', 'gt_mask'], - visualize=dict(flag=False, boundary_key='gt_kernels')), - dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask']) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(2240, 2200), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(2240, 2200), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} + +train_pipeline = {{_base_.train_pipeline}} +test_pipeline_icdar2015 = {{_base_.test_pipeline_icdar2015}} data = dict( samples_per_gpu=8, @@ -87,19 +20,16 @@ data = dict( val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=data_root + '/instances_training.json', - img_prefix=data_root + '/imgs', + type='UniformConcatDataset', + datasets=train_list, pipeline=train_pipeline), val=dict( - type=dataset_type, - ann_file=data_root + '/instances_val.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline), + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015), test=dict( - type=dataset_type, - ann_file=data_root + '/instances_val.json', - img_prefix=data_root + '/imgs', - pipeline=test_pipeline)) + type='UniformConcatDataset', + datasets=test_list, + pipeline=test_pipeline_icdar2015)) evaluation = dict(interval=10, metric='hmean-iou') diff --git a/configs/textdet/textsnake/textsnake_r50_fpn_unet_1200e_ctw1500.py b/configs/textdet/textsnake/textsnake_r50_fpn_unet_1200e_ctw1500.py index 87983cad..0270b059 100644 --- a/configs/textdet/textsnake/textsnake_r50_fpn_unet_1200e_ctw1500.py +++ b/configs/textdet/textsnake/textsnake_r50_fpn_unet_1200e_ctw1500.py @@ -1,95 +1,16 @@ _base_ = [ - '../../_base_/schedules/schedule_1200e.py', - '../../_base_/default_runtime.py' + '../../_base_/schedules/schedule_sgd_1200e.py', + '../../_base_/default_runtime.py', + '../../_base_/det_models/textsnake_r50_fpn_unet.py', + '../../_base_/det_datasets/ctw1500.py', + '../../_base_/det_pipelines/textsnake_pipeline.py' ] -model = dict( - type='TextSnake', - backbone=dict( - type='mmdet.ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - norm_cfg=dict(type='BN', requires_grad=True), - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - norm_eval=True, - style='caffe'), - neck=dict( - type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32), - bbox_head=dict( - type='TextSnakeHead', - in_channels=32, - text_repr_type='poly', - loss=dict(type='TextSnakeLoss')), - train_cfg=None, - test_cfg=None) -dataset_type = 'IcdarDataset' -data_root = 'data/ctw1500/' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_list = {{_base_.train_list}} +test_list = {{_base_.test_list}} -train_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='LoadTextAnnotations', - with_bbox=True, - with_mask=True, - poly2mask=False), - dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5), - dict(type='Normalize', **img_norm_cfg), - dict( - type='RandomCropPolyInstances', - instance_key='gt_masks', - crop_ratio=0.65, - min_side_ratio=0.3), - dict( - type='RandomRotatePolyInstances', - rotate_ratio=0.5, - max_angle=20, - pad_with_fixed_color=False), - dict( - type='ScaleAspectJitter', - img_scale=[(3000, 736)], # unused - ratio_range=(0.7, 1.3), - aspect_ratio_range=(0.9, 1.1), - multiscale_mode='value', - long_size_bound=800, - short_size_bound=480, - resize_type='long_short_bound', - keep_ratio=False), - dict(type='SquareResizePad', target_size=800, pad_ratio=0.6), - dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'), - dict(type='TextSnakeTargets'), - dict(type='Pad', size_divisor=32), - dict( - type='CustomFormatBundle', - keys=[ - 'gt_text_mask', 'gt_center_region_mask', 'gt_mask', - 'gt_radius_map', 'gt_sin_map', 'gt_cos_map' - ], - visualize=dict(flag=False, boundary_key='gt_text_mask')), - dict( - type='Collect', - keys=[ - 'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask', - 'gt_radius_map', 'gt_sin_map', 'gt_cos_map' - ]) -] -test_pipeline = [ - dict(type='LoadImageFromFile', color_type='color_ignore_orientation'), - dict( - type='MultiScaleFlipAug', - img_scale=(1333, 736), - flip=False, - transforms=[ - dict(type='Resize', img_scale=(1333, 736), keep_ratio=True), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] +train_pipeline = {{_base_.train_pipeline}} +test_pipeline = {{_base_.test_pipeline}} data = dict( samples_per_gpu=4, @@ -97,19 +18,16 @@ data = dict( val_dataloader=dict(samples_per_gpu=1), test_dataloader=dict(samples_per_gpu=1), train=dict( - type=dataset_type, - ann_file=f'{data_root}/instances_training.json', - img_prefix=f'{data_root}/imgs', + type='UniformConcatDataset', + datasets=train_list, pipeline=train_pipeline), val=dict( - type=dataset_type, - ann_file=f'{data_root}/instances_test.json', - img_prefix=f'{data_root}/imgs', + type='UniformConcatDataset', + datasets=test_list, pipeline=test_pipeline), test=dict( - type=dataset_type, - ann_file=f'{data_root}/instances_test.json', - img_prefix=f'{data_root}/imgs', + type='UniformConcatDataset', + datasets=test_list, pipeline=test_pipeline)) evaluation = dict(interval=10, metric='hmean-iou') diff --git a/tests/test_apis/test_model_inference.py b/tests/test_apis/test_model_inference.py index 3cb2e864..4476b35e 100644 --- a/tests/test_apis/test_model_inference.py +++ b/tests/test_apis/test_model_inference.py @@ -7,11 +7,13 @@ from mmcv.image import imread from mmocr.apis.inference import init_detector, model_inference from mmocr.datasets import build_dataset # noqa: F401 from mmocr.models import build_detector # noqa: F401 +from mmocr.utils import revert_sync_batchnorm def build_model(config_file): device = 'cpu' model = init_detector(config_file, checkpoint=None, device=device) + model = revert_sync_batchnorm(model) if model.cfg.data.test['type'] == 'ConcatDataset': model.cfg.data.test.pipeline = model.cfg.data.test['datasets'][