From 4b887676a3a7dd85e95fe5161045596afc1e3cdc Mon Sep 17 00:00:00 2001
From: Tong Gao <gaotongxiao@gmail.com>
Date: Tue, 28 Mar 2023 16:20:24 +0800
Subject: [PATCH] [Fix] MJSynth & SynthText Dataset Preparer config (#1805)

* [Fix] MJSynth

* update

* fix

* fix
---
 configs/textrecog/_base_/datasets/mjsynth.py  |  6 +++
 dataset_zoo/mjsynth/textrecog.py              | 19 +++++--
 dataset_zoo/synthtext/textrecog.py            |  3 +-
 mmocr/datasets/preparers/parsers/__init__.py  |  3 +-
 .../preparers/parsers/mjsynth_parser.py       | 50 +++++++++++++++++++
 5 files changed, 76 insertions(+), 5 deletions(-)
 create mode 100644 mmocr/datasets/preparers/parsers/mjsynth_parser.py

diff --git a/configs/textrecog/_base_/datasets/mjsynth.py b/configs/textrecog/_base_/datasets/mjsynth.py
index 5f26e937..defe84a8 100644
--- a/configs/textrecog/_base_/datasets/mjsynth.py
+++ b/configs/textrecog/_base_/datasets/mjsynth.py
@@ -5,3 +5,9 @@ mjsynth_textrecog_train = dict(
     data_root=mjsynth_textrecog_data_root,
     ann_file='textrecog_train.json',
     pipeline=None)
+
+mjsynth_sub_textrecog_train = dict(
+    type='OCRDataset',
+    data_root=mjsynth_textrecog_data_root,
+    ann_file='subset_textrecog_train.json',
+    pipeline=None)
diff --git a/dataset_zoo/mjsynth/textrecog.py b/dataset_zoo/mjsynth/textrecog.py
index f54b7044..19751ec9 100644
--- a/dataset_zoo/mjsynth/textrecog.py
+++ b/dataset_zoo/mjsynth/textrecog.py
@@ -23,12 +23,19 @@ train_preparer = dict(
                         'annotations/annotation.txt'
                     ]
                 ]),
+            dict(
+                url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
+                'Syn90k/subset_textrecog_train.json',
+                save_name='subset_textrecog_train.json',
+                md5='ba958d87bb170980f39e194180c15b9e',
+                split=['train'],
+                content=['annotation'])
         ]),
     gatherer=dict(type='MonoGatherer', ann_name='annotation.txt'),
     parser=dict(
-        type='ICDARTxtTextRecogAnnParser',
+        type='MJSynthAnnParser',
         separator=' ',
-        format='img text',
+        format='img num',
         remove_strs=None),
     packer=dict(type='TextRecogPacker'),
     dumper=dict(type='JsonDumper'),
@@ -37,4 +44,10 @@ train_preparer = dict(
 delete = ['mjsynth', 'annotations']
 
 config_generator = dict(
-    type='TextRecogConfigGenerator', data_root=data_root, test_anns=None)
+    type='TextRecogConfigGenerator',
+    data_root=data_root,
+    train_anns=[
+        dict(ann_file='textrecog_train.json', dataset_postfix=''),
+        dict(ann_file='subset_textrecog_train.json', dataset_postfix='sub'),
+    ],
+    test_anns=None)
diff --git a/dataset_zoo/synthtext/textrecog.py b/dataset_zoo/synthtext/textrecog.py
index 2ab2bdd1..2c16a32a 100644
--- a/dataset_zoo/synthtext/textrecog.py
+++ b/dataset_zoo/synthtext/textrecog.py
@@ -27,4 +27,5 @@ config_generator = dict(
         dict(
             ann_file='alphanumeric_textrecog_train.json',
             dataset_postfix='an'),
-    ])
+    ],
+    test_anns=None)
diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py
index 58d6d9bd..fd379471 100644
--- a/mmocr/datasets/preparers/parsers/__init__.py
+++ b/mmocr/datasets/preparers/parsers/__init__.py
@@ -5,6 +5,7 @@ from .ctw1500_parser import CTW1500AnnParser
 from .funsd_parser import FUNSDTextDetAnnParser
 from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                                ICDARTxtTextRecogAnnParser)
+from .mjsynth_parser import MJSynthAnnParser
 from .naf_parser import NAFAnnParser
 from .sroie_parser import SROIETextDetAnnParser
 from .svt_parser import SVTTextDetAnnParser
@@ -17,5 +18,5 @@ __all__ = [
     'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
     'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
     'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser',
-    'SynthTextAnnParser'
+    'SynthTextAnnParser', 'MJSynthAnnParser'
 ]
diff --git a/mmocr/datasets/preparers/parsers/mjsynth_parser.py b/mmocr/datasets/preparers/parsers/mjsynth_parser.py
new file mode 100644
index 00000000..3eee6e29
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/mjsynth_parser.py
@@ -0,0 +1,50 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import List
+
+from mmocr.registry import DATA_PARSERS
+from .icdar_txt_parser import ICDARTxtTextRecogAnnParser
+
+
+@DATA_PARSERS.register_module()
+class MJSynthAnnParser(ICDARTxtTextRecogAnnParser):
+    """MJSynth Text Recognition Annotation Parser.
+
+    Annotation entries are loaded from a txt file. Only the ``img`` field of
+    each entry is used: the transcription is taken from the image file name
+    (the second ``_``-separated token of its basename), not from the entry.
+
+    Args:
+        separator (str): The separator between each element in a line. Defaults
+            to ','.
+        ignore (str): The text to be ignored. Defaults to '#'.
+        format (str): The format of the annotation. Defaults to 'img, text'.
+        encoding (str): Annotation file encoding. Defaults to 'utf-8-sig'.
+        nproc (int): Number of processes used for parsing. Defaults to 1.
+        base_name (bool): Whether to use the basename of the image path as the
+            image name. Defaults to False.
+        remove_strs (List[str], Optional): Used to remove redundant strings in
+            the transcription. Defaults to ['"'].
+    """
+
+    def parse_files(self, img_dir: str, ann_path: str) -> List:
+        """Parse ``ann_path`` into a list of ``(img_path, text)`` tuples."""
+        assert isinstance(ann_path, str)
+        samples = list()
+        for anno in self.loader(
+                file_path=ann_path,
+                format=self.format,
+                encoding=self.encoding,
+                separator=self.sep):
+            # The text comes from the file name, not the annotation entry;
+            # raises IndexError if the basename contains no ``_``.
+            text = osp.basename(anno['img']).split('_')[1]
+            if self.remove_strs is not None:
+                for strs in self.remove_strs:
+                    text = text.replace(strs, '')
+            if text == self.ignore:
+                continue
+            img_name = anno['img']
+            samples.append((osp.join(img_dir, img_name), text))
+
+        return samples