From 4b887676a3a7dd85e95fe5161045596afc1e3cdc Mon Sep 17 00:00:00 2001 From: Tong Gao Date: Tue, 28 Mar 2023 16:20:24 +0800 Subject: [PATCH] [Fix] MJSynth & SynthText Dataset Preparer config (#1805) * [Fix] MJSynth * update * fix * fix --- configs/textrecog/_base_/datasets/mjsynth.py | 6 +++ dataset_zoo/mjsynth/textrecog.py | 19 +++++-- dataset_zoo/synthtext/textrecog.py | 3 +- mmocr/datasets/preparers/parsers/__init__.py | 3 +- .../preparers/parsers/mjsynth_parser.py | 50 +++++++++++++++++++ 5 files changed, 76 insertions(+), 5 deletions(-) create mode 100644 mmocr/datasets/preparers/parsers/mjsynth_parser.py diff --git a/configs/textrecog/_base_/datasets/mjsynth.py b/configs/textrecog/_base_/datasets/mjsynth.py index 5f26e937..defe84a8 100644 --- a/configs/textrecog/_base_/datasets/mjsynth.py +++ b/configs/textrecog/_base_/datasets/mjsynth.py @@ -5,3 +5,9 @@ mjsynth_textrecog_train = dict( data_root=mjsynth_textrecog_data_root, ann_file='textrecog_train.json', pipeline=None) + +mjsynth_sub_textrecog_train = dict( + type='OCRDataset', + data_root=mjsynth_textrecog_data_root, + ann_file='subset_textrecog_train.json', + pipeline=None) diff --git a/dataset_zoo/mjsynth/textrecog.py b/dataset_zoo/mjsynth/textrecog.py index f54b7044..19751ec9 100644 --- a/dataset_zoo/mjsynth/textrecog.py +++ b/dataset_zoo/mjsynth/textrecog.py @@ -23,12 +23,19 @@ train_preparer = dict( 'annotations/annotation.txt' ] ]), + dict( + url='https://download.openmmlab.com/mmocr/data/1.x/recog/' + 'Syn90k/subset_textrecog_train.json', + save_name='subset_textrecog_train.json', + md5='ba958d87bb170980f39e194180c15b9e', + split=['train'], + content=['annotation']) ]), gatherer=dict(type='MonoGatherer', ann_name='annotation.txt'), parser=dict( - type='ICDARTxtTextRecogAnnParser', + type='MJSynthAnnParser', separator=' ', - format='img text', + format='img num', remove_strs=None), packer=dict(type='TextRecogPacker'), dumper=dict(type='JsonDumper'), @@ -37,4 +44,10 @@ train_preparer = dict( delete = 
['mjsynth', 'annotations'] config_generator = dict( - type='TextRecogConfigGenerator', data_root=data_root, test_anns=None) + type='TextRecogConfigGenerator', + data_root=data_root, + train_anns=[ + dict(ann_file='textrecog_train.json', dataset_postfix=''), + dict(ann_file='subset_textrecog_train.json', dataset_postfix='sub'), + ], + test_anns=None) diff --git a/dataset_zoo/synthtext/textrecog.py b/dataset_zoo/synthtext/textrecog.py index 2ab2bdd1..2c16a32a 100644 --- a/dataset_zoo/synthtext/textrecog.py +++ b/dataset_zoo/synthtext/textrecog.py @@ -27,4 +27,5 @@ config_generator = dict( dict( ann_file='alphanumeric_textrecog_train.json', dataset_postfix='an'), - ]) + ], + test_anns=None) diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index 58d6d9bd..fd379471 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -5,6 +5,7 @@ from .ctw1500_parser import CTW1500AnnParser from .funsd_parser import FUNSDTextDetAnnParser from .icdar_txt_parser import (ICDARTxtTextDetAnnParser, ICDARTxtTextRecogAnnParser) +from .mjsynth_parser import MJSynthAnnParser from .naf_parser import NAFAnnParser from .sroie_parser import SROIETextDetAnnParser from .svt_parser import SVTTextDetAnnParser @@ -17,5 +18,5 @@ __all__ = [ 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser', 'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser', 'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser', - 'SynthTextAnnParser' + 'SynthTextAnnParser', 'MJSynthAnnParser' ] diff --git a/mmocr/datasets/preparers/parsers/mjsynth_parser.py b/mmocr/datasets/preparers/parsers/mjsynth_parser.py new file mode 100644 index 00000000..3eee6e29 --- /dev/null +++ b/mmocr/datasets/preparers/parsers/mjsynth_parser.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import os.path as osp
from typing import List

from mmocr.registry import DATA_PARSERS
from .icdar_txt_parser import ICDARTxtTextRecogAnnParser


@DATA_PARSERS.register_module()
class MJSynthAnnParser(ICDARTxtTextRecogAnnParser):
    """MJSynth Text Recognition Annotation Parser.

    The annotation of this dataset is a txt file with one sample per line.
    Only the ``img`` field of each line is used by this parser; every other
    field named in ``format`` is ignored. The transcription is not read from
    the annotation file: it is taken from the image file name, as the second
    ``_``-separated piece of its basename. For example, a line whose ``img``
    field is ``./a/b/1_hello_123.jpg`` yields the text ``hello``.

    The constructor is inherited unchanged from
    :class:`ICDARTxtTextRecogAnnParser`.

    Args:
        separator (str): The separator between each element in a line. Defaults
            to ','.
        ignore (str): The text to be ignored. Samples whose extracted text
            equals this value are skipped. Defaults to '#'.
        format (str): The format of the annotation. It must contain an
            ``img`` field. Defaults to 'img, text'.
        encoding (str): The encoding of the annotation file. Defaults to
            'utf-8-sig'.
        nproc (int): The number of processes to parse the annotation. Defaults
            to 1.
        base_name (bool): Whether to use the basename of the image path as the
            image name. Defaults to False.
        remove_strs (List[str], Optional): Used to remove redundant strings in
            the transcription. Defaults to ['"'].
    """

    def parse_files(self, img_dir: str, ann_path: str) -> List:
        """Parse the annotation file into ``(image path, text)`` samples.

        Args:
            img_dir (str): Directory that the ``img`` field of every
                annotation line is joined onto.
            ann_path (str): Path to the annotation txt file.

        Returns:
            List: A list of ``(img_path, text)`` tuples, where ``img_path`` is
            ``osp.join(img_dir, <img field>)`` and ``text`` is the
            transcription extracted from the image file name.

        Raises:
            ValueError: If an image file name contains no ``_`` and therefore
                carries no transcription.
        """
        assert isinstance(ann_path, str)
        samples = []
        for anno in self.loader(
                file_path=ann_path,
                format=self.format,
                encoding=self.encoding,
                separator=self.sep):
            img_name = anno['img']
            # The label lives in the file name, not in the annotation columns.
            name_parts = osp.basename(img_name).split('_')
            if len(name_parts) < 2:
                # Fail with the offending entry instead of a bare IndexError.
                raise ValueError(
                    f'Cannot extract the transcription from "{img_name}" '
                    f'in {ann_path}: the file name is expected to contain '
                    'the text as its second "_"-separated part.')
            text = name_parts[1]
            if self.remove_strs is not None:
                for redundant in self.remove_strs:
                    text = text.replace(redundant, '')
            if text == self.ignore:
                continue
            samples.append((osp.join(img_dir, img_name), text))

        return samples