diff --git a/configs/textdet/maskrcnn/_base_mask-rcnn_resnet50_fpn.py b/configs/textdet/maskrcnn/_base_mask-rcnn_resnet50_fpn.py index b99ab187..aae385f6 100644 --- a/configs/textdet/maskrcnn/_base_mask-rcnn_resnet50_fpn.py +++ b/configs/textdet/maskrcnn/_base_mask-rcnn_resnet50_fpn.py @@ -1,138 +1,21 @@ +_base_ = ['mmdet::_base_/models/mask-rcnn_r50_fpn.py'] + file_client_args = dict(backend='disk') -model = dict( - type='MMDetWrapper', - text_repr_type='poly', - cfg=dict( - type='MaskRCNN', - data_preprocessor=dict( - type='DetDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=True, - style='pytorch', - init_cfg=dict( - type='Pretrained', checkpoint='torchvision://resnet50')), - neck=dict( - type='FPN', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - num_outs=5), +mask_rcnn = _base_.pop('model') +# Adapt Mask R-CNN model to OCR task +mask_rcnn.update( + dict( + data_preprocessor=dict(pad_mask=False), rpn_head=dict( - type='RPNHead', - in_channels=256, - feat_channels=256, anchor_generator=dict( - type='AnchorGenerator', - scales=[4], - ratios=[0.17, 0.44, 1.13, 2.90, 7.46], - strides=[4, 8, 16, 32, 64]), - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[.0, .0, .0, .0], - target_stds=[1.0, 1.0, 1.0, 1.0]), - loss_cls=dict( - type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), - loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + scales=[4], ratios=[0.17, 0.44, 1.13, 2.90, 7.46])), roi_head=dict( - type='StandardRoIHead', - bbox_roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict( - type='RoIAlign', output_size=7, sampling_ratio=0.), - out_channels=256, - featmap_strides=[4, 8, 16, 32]), - bbox_head=dict( - type='Shared2FCBBoxHead', - in_channels=256, - fc_out_channels=1024, - roi_feat_size=7, - num_classes=1, - bbox_coder=dict( - type='DeltaXYWHBBoxCoder', - target_means=[0., 0., 0., 0.], - target_stds=[0.1, 0.1, 0.2, 0.2]), - reg_class_agnostic=False, - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - loss_weight=1.0), - loss_bbox=dict(type='L1Loss', loss_weight=1.0)), - mask_roi_extractor=dict( - type='SingleRoIExtractor', - roi_layer=dict( - type='RoIAlign', output_size=14, sampling_ratio=0.), - out_channels=256, - featmap_strides=[4, 8, 16, 32]), - mask_head=dict( - type='FCNMaskHead', - num_convs=4, - in_channels=256, - conv_out_channels=256, - num_classes=1, - loss_mask=dict( - type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), - # model training and testing settings - train_cfg=dict( - rpn=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.7, - neg_iou_thr=0.3, - min_pos_iou=0.3, - match_low_quality=True, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=256, - pos_fraction=0.5, - neg_pos_ub=-1, - add_gt_as_proposals=False), - allowed_border=-1, - pos_weight=-1, - debug=False), - rpn_proposal=dict( - nms_pre=2000, - max_per_img=1000, - nms=dict(type='nms', iou_threshold=0.7), - min_bbox_size=0), - rcnn=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.5, - neg_iou_thr=0.5, - min_pos_iou=0.5, - match_low_quality=True, - ignore_iof_thr=-1), - sampler=dict( - type='RandomSampler', - num=512, - pos_fraction=0.25, - neg_pos_ub=-1, - add_gt_as_proposals=True), - mask_size=28, - pos_weight=-1, - debug=False)), - test_cfg=dict( - rpn=dict( - nms_pre=1000, - max_per_img=1000, - nms=dict(type='nms', iou_threshold=0.7), - min_bbox_size=0), - rcnn=dict( - score_thr=0.05, - nms=dict(type='nms', iou_threshold=0.5), - max_per_img=100, - mask_thr_binary=0.5)))) + bbox_head=dict(num_classes=1), + mask_head=dict(num_classes=1), + ))) + +model = dict(type='MMDetWrapper', text_repr_type='poly', cfg=mask_rcnn) train_pipeline = [ dict(