[Enhancement] Simplify mono_gather

* [Enhancement] Simplify mono_gather

* remove mono gather split

Co-authored-by: liukuikun <641417025@qq.com>
pull/1567/head
Tong Gao 2022-12-06 16:03:12 +08:00 committed by GitHub
parent 3a0aa05d9c
commit d9ea92191e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 41 additions and 23 deletions
dataset_zoo
mmocr/datasets/preparers

View File

@ -41,7 +41,7 @@ data_obtainer = dict(
# type='TextRecogDataConverter',
# splits=['test'],
# data_root=data_root,
# gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
# gatherer=dict(type='mono_gather', test_ann='test.txt'),
# parser=dict(
# type='ICDARTxtTextRecogAnnParser',
# separator=' ',

View File

@ -79,9 +79,11 @@ data_obtainer = dict(
# type='TextRecogDataConverter',
# splits=['train', 'test'],
# data_root=data_root,
# gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
# gatherer=dict(
# type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
# parser=dict(
# type='ICDARTxtTextRecogAnnParser', separator=', ', format='img, text'), # noqa
# type='ICDARTxtTextRecogAnnParser', separator=', ',
# format='img, text'), # noqa
# dumper=dict(type='JsonDumper'))
config_generator = dict(

View File

@ -51,7 +51,8 @@ data_converter = dict(
type='TextRecogDataConverter',
splits=['train', 'test'],
data_root=data_root,
gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
gatherer=dict(
type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
parser=dict(type='ICDARTxtTextRecogAnnParser', encoding='utf-8-sig'),
dumper=dict(type='JsonDumper'))

View File

@ -37,7 +37,8 @@ data_converter = dict(
type='TextRecogDataConverter',
splits=['train', 'test'],
data_root=data_root,
gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
gatherer=dict(
type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
parser=dict(
type='ICDARTxtTextRecogAnnParser',
encoding='utf-8',

View File

@ -21,7 +21,8 @@ data_converter = dict(
type='TextDetDataConverter',
splits=['train', 'test'],
data_root=data_root,
gatherer=dict(type='mono_gather', mapping="f'{split}.xml'"),
gatherer=dict(
type='mono_gather', train_ann='train.xml', test_ann='test.xml'),
parser=dict(type='SVTTextDetAnnParser', data_root=data_root),
dumper=dict(type='JsonDumper'),
delete=['annotations', 'svt'])

View File

@ -20,7 +20,7 @@ data_converter = dict(
type='TextRecogDataConverter',
splits=['test'],
data_root=data_root,
gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
gatherer=dict(type='mono_gather', test_ann='test.txt'),
parser=dict(
type='ICDARTxtTextRecogAnnParser', separator=' ', format='img text'),
dumper=dict(type='JsonDumper'),

View File

@ -26,7 +26,10 @@ data_converter = dict(
splits=['train', 'test'],
data_root=data_root,
gatherer=dict(
type='mono_gather', mapping="f'{split}.txt'", ann_path=data_root),
type='mono_gather',
train_ann='train.txt',
test_ann='test.txt',
ann_path=data_root),
parser=dict(type='WildreceiptKIEAnnParser', data_root=data_root),
dumper=dict(type='WildreceiptOpensetDumper'),
delete=['wildreceipt'])

View File

@ -72,23 +72,23 @@ class BaseDataConverter:
files.
"""
# Convert and dump annotations to MMOCR format
for split in self.splits:
print(f'Parsing {split} split...')
for self.current_split in self.splits:
print(f'Parsing {self.current_split} split...')
# Gather the info such as file names required by parser
img_path = osp.join(self.data_root, self.img_dir, split)
img_path = osp.join(self.data_root, self.img_dir,
self.current_split)
ann_path = osp.join(self.data_root, 'annotations')
gatherer_args = dict(
img_path=img_path, ann_path=ann_path, split=split)
gatherer_args = dict(img_path=img_path, ann_path=ann_path)
gatherer_args.update(self.gatherer_args)
files = self.gatherer(**gatherer_args)
# Convert dataset annotations to MMOCR format
samples = self.parser.parse_files(files, split)
print(f'Packing {split} annotations...')
func = partial(self.pack_instance, split=split)
samples = self.parser.parse_files(files, self.current_split)
print(f'Packing {self.current_split} annotations...')
func = partial(self.pack_instance, split=self.current_split)
samples = track_parallel_progress(func, samples, nproc=self.nproc)
samples = self.add_meta(samples)
# Dump annotation files
self.dumper.dump(samples, self.data_root, split)
self.dumper.dump(samples, self.data_root, self.current_split)
self.clean()
@abstractmethod
@ -116,7 +116,11 @@ class BaseDataConverter:
Dict: A dict contains the meta information and samples.
"""
def mono_gather(self, ann_path: str, mapping: str, split: str,
def mono_gather(self,
ann_path: str,
train_ann: Optional[str] = None,
val_ann: Optional[str] = None,
test_ann: Optional[str] = None,
**kwargs) -> str:
"""Gather the dataset file. Specifically for the case that only one
annotation file is needed. For example,
@ -127,16 +131,22 @@ class BaseDataConverter:
Args:
ann_path (str): Path to the annotations.
mapping (str): Mapping rule of the annotation names. For example,
"f'{split}.json'" will return 'train.json' when the split is
'train'.
split (str): The current split.
train_ann (str, optional): The annotation file name of the train
split in the original dataset. Defaults to None.
val_ann (str, optional): The annotation file name of the val split
in the original dataset. Defaults to None.
test_ann (str, optional): The annotation file name of the test
split in the original dataset. Defaults to None.
Returns:
str: Path to the annotation file.
"""
return osp.join(ann_path, eval(mapping))
ann_file = eval(f'{self.current_split}_ann')
if ann_file is None:
raise ValueError(
f'{self.current_split}_ann must be specified in gatherer!')
return osp.join(ann_path, ann_file)
def pair_gather(self, img_path: str, suffixes: List, rule: Sequence,
**kwargs) -> List[Tuple]: