[Fix] MJSynth & SynthText Dataset Preparer config (#1805)

* [Fix] MJSynth * update * fix * fix
2023-03-28 16:20:24 +08:00 · 2023-03-28 16:20:24 +08:00 · 4b887676a3
parent bb591d2b1b
commit 4b887676a3
5 changed files with 76 additions and 5 deletions
--- a/configs/textrecog/_base_/datasets/mjsynth.py
+++ b/configs/textrecog/_base_/datasets/mjsynth.py
@@ -5,3 +5,9 @@ mjsynth_textrecog_train = dict(
    data_root=mjsynth_textrecog_data_root,
    ann_file='textrecog_train.json',
    pipeline=None)
+
+mjsynth_sub_textrecog_train = dict(
+    type='OCRDataset',
+    data_root=mjsynth_textrecog_data_root,
+    ann_file='subset_textrecog_train.json',
+    pipeline=None)
--- a/dataset_zoo/mjsynth/textrecog.py
+++ b/dataset_zoo/mjsynth/textrecog.py
@@ -23,12 +23,19 @@ train_preparer = dict(
                        'annotations/annotation.txt'
                    ]
                ]),
+            dict(
+                url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
+                'Syn90k/subset_textrecog_train.json',
+                save_name='subset_textrecog_train.json',
+                md5='ba958d87bb170980f39e194180c15b9e',
+                split=['train'],
+                content=['annotation'])
        ]),
    gatherer=dict(type='MonoGatherer', ann_name='annotation.txt'),
    parser=dict(
-        type='ICDARTxtTextRecogAnnParser',
+        type='MJSynthAnnParser',
        separator=' ',
-        format='img text',
+        format='img num',
        remove_strs=None),
    packer=dict(type='TextRecogPacker'),
    dumper=dict(type='JsonDumper'),
@@ -37,4 +44,10 @@ train_preparer = dict(
 delete = ['mjsynth', 'annotations']

 config_generator = dict(
-    type='TextRecogConfigGenerator', data_root=data_root, test_anns=None)
+    type='TextRecogConfigGenerator',
+    data_root=data_root,
+    train_anns=[
+        dict(ann_file='textrecog_train.json', dataset_postfix=''),
+        dict(ann_file='subset_textrecog_train.json', dataset_postfix='sub'),
+    ],
+    test_anns=None)
--- a/dataset_zoo/synthtext/textrecog.py
+++ b/dataset_zoo/synthtext/textrecog.py
@@ -27,4 +27,5 @@ config_generator = dict(
        dict(
            ann_file='alphanumeric_textrecog_train.json',
            dataset_postfix='an'),
-    ])
+    ],
+    test_anns=None)
--- a/mmocr/datasets/preparers/parsers/__init__.py
+++ b/mmocr/datasets/preparers/parsers/__init__.py
@@ -5,6 +5,7 @@ from .ctw1500_parser import CTW1500AnnParser
 from .funsd_parser import FUNSDTextDetAnnParser
 from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                               ICDARTxtTextRecogAnnParser)
+from .mjsynth_parser import MJSynthAnnParser
 from .naf_parser import NAFAnnParser
 from .sroie_parser import SROIETextDetAnnParser
 from .svt_parser import SVTTextDetAnnParser
@@ -17,5 +18,5 @@ __all__ = [
    'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
    'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser',
-    'SynthTextAnnParser'
+    'SynthTextAnnParser', 'MJSynthAnnParser'
 ]
--- a/mmocr/datasets/preparers/parsers/mjsynth_parser.py
+++ b/mmocr/datasets/preparers/parsers/mjsynth_parser.py
@@ -0,0 +1,50 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import List
+
+from mmocr.registry import DATA_PARSERS
+from .icdar_txt_parser import ICDARTxtTextRecogAnnParser
+
+
+@DATA_PARSERS.register_module()
+class MJSynthAnnParser(ICDARTxtTextRecogAnnParser):
+    """MJSynth Text Recognition Annotation Parser.
+
+    The original annotation file is a txt file whose lines hold the image
+    path (the MJSynth preparer config uses the format ``img num``). The
+    transcription is the second '_'-separated field of the image file name.
+
+    Args:
+        separator (str): The separator between each element in a line. Defaults
+            to ','.
+        ignore (str): The text to be ignored. Defaults to '#'.
+        format (str): The format of the annotation. Defaults to 'img, text'.
+        encoding (str): The encoding of the annotation file. Defaults to
+            'utf-8-sig'.
+        nproc (int): The number of processes to parse the annotation. Defaults
+            to 1.
+        base_name (bool): Whether to use the basename of the image path as the
+            image name. Defaults to False.
+        remove_strs (List[str], Optional): Used to remove redundant strings in
+            the transcription. Defaults to ['"'].
+    """
+
+    def parse_files(self, img_dir: str, ann_path: str) -> List:
+        """Parse annotations, reading each text from its image file name."""
+        assert isinstance(ann_path, str)
+        samples = list()
+        for anno in self.loader(
+                file_path=ann_path,
+                format=self.format,
+                encoding=self.encoding,
+                separator=self.sep):
+            text = osp.basename(anno['img']).split('_')[1]
+            if self.remove_strs is not None:
+                for strs in self.remove_strs:
+                    text = text.replace(strs, '')
+            if text == self.ignore:
+                continue
+            img_name = anno['img']
+            samples.append((osp.join(img_dir, img_name), text))
+
+        return samples