[Refactor] Refactor data converter and gather (#1707)

* Refactor data preparer; abstract gatherer and packer

* update ic13 ic15 naf iiit5k cute80 funsd

* update dataset zoo config

* add ut

* finish docstring

* fix coco

* fix comment
liukuikun 2023-03-03 15:27:19 +08:00 committed by GitHub
parent 3aa9572a64
commit 82f81ff67c
94 changed files with 3321 additions and 2069 deletions
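
The refactor replaces the monolithic data_converter with per-split preparers: every dataset config below now declares a train_preparer / val_preparer / test_preparer, each wiring together five pluggable stages (obtainer, gatherer, parser, packer, dumper). A minimal sketch of the new layout, with placeholder values rather than a real dataset entry:

data_root = 'data/<dataset>'
cache_path = 'data/cache'
train_preparer = dict(
    # Downloads archives, verifies their md5, and moves files into place.
    obtainer=dict(type='NaiveDataObtainer', cache_path=cache_path, files=[]),
    # Collects samples, e.g. pairing each image with its annotation file.
    gatherer=dict(type='PairGatherer', img_suffixes=['.jpg'], rule=[]),
    # Parses the raw annotations into an intermediate representation.
    parser=dict(type='ICDARTxtTextDetAnnParser'),
    # Packs parsed samples into the target task's annotation structure.
    packer=dict(type='TextDetPacker'),
    # Serializes the packed annotations to disk.
    dumper=dict(type='JsonDumper'))
delete = ['annotations']  # intermediate artifacts removed after preparation
config_generator = dict(type='TextDetConfigGenerator')

Assuming the standard MMOCR 1.x entry point (which is not part of this diff), such a config is driven by tools/dataset_converters/prepare_dataset.py with the dataset name and a --task flag.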


@@ -1,41 +1,39 @@
data_root = 'data/cocotextv2'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='http://images.cocodataset.org/zips/train2014.zip',
save_name='cocotextv2_train_img.zip',
md5='0da8c0bd3d6becc4dcb32757491aca88',
split=['train', 'val'],
content=['image'],
mapping=[['cocotextv2_train_img/train2014',
'textdet_imgs/train']]),
dict(
url='https://github.com/bgshih/cocotext/releases/download/dl/'
'cocotext.v2.zip',
save_name='cocotextv2_annotation.zip',
md5='5e39f7d6f2f11324c6451e63523c440c',
split=['train', 'val'],
content=['annotation'],
mapping=[[
'cocotextv2_annotation/cocotext.v2.json',
'annotations/train.json'
]]),
])
data_converter = dict(
type='TextDetDataConverter',
splits=['train'],
data_root=data_root,
gatherer=dict(type='mono_gather', train_ann='train.json'),
parser=dict(
type='COCOTextDetAnnParser',
variant='cocotext',
data_root=data_root + '/textdet_imgs/train'),
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='http://images.cocodataset.org/zips/train2014.zip',
save_name='cocotextv2_train_img.zip',
md5='0da8c0bd3d6becc4dcb32757491aca88',
content=['image'],
mapping=[[
'cocotextv2_train_img/train2014', 'textdet_imgs/imgs'
]]),
dict(
url='https://github.com/bgshih/cocotext/releases/download/dl/'
'cocotext.v2.zip',
save_name='cocotextv2_annotation.zip',
md5='5e39f7d6f2f11324c6451e63523c440c',
content=['annotation'],
mapping=[[
'cocotextv2_annotation/cocotext.v2.json',
'annotations/train.json'
]]),
]),
gatherer=dict(
type='MonoGatherer',
ann_name='train.json',
img_dir='textdet_imgs/imgs'),
parser=dict(type='COCOTextDetAnnParser', variant='cocotext'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'))
config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
val_preparer = train_preparer
delete = ['annotations', 'cocotextv2_annotation', 'cocotextv2_train_img']
config_generator = dict(type='TextDetConfigGenerator')


@@ -1,5 +1,6 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextRecogCropConverter')
_base_.train_preparer.packer.type = 'TextRecogCropPacker'
_base_.val_preparer.packer.type = 'TextRecogCropPacker'
config_generator = dict(type='TextRecogConfigGenerator')
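
The _base_.train_preparer.packer.type assignments rely on mmengine's Python-style config inheritance: this file inherits everything from textdet.py and patches only the packer, so detection and recognition share a single obtainer/gatherer/parser definition. A sketch of the equivalent dict-merge override (assuming mmengine's default merge semantics), shown as an alternative form rather than what the file actually uses:

_base_ = ['textdet.py']
# Child dicts merge into the inherited ones, so only packer.type changes.
train_preparer = dict(packer=dict(type='TextRecogCropPacker'))
val_preparer = dict(packer=dict(type='TextRecogCropPacker'))
config_generator = dict(type='TextRecogConfigGenerator')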


@@ -1,5 +1,6 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextSpottingDataConverter')
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'
config_generator = dict(type='TextSpottingConfigGenerator')


@@ -2,51 +2,65 @@
# the fixed version as done in
# https://github.com/clovaai/deep-text-recognition-benchmark by default.
# If you want to use the original version, please comment out the following
# lines: L31-L38, and uncomment L23-L30, L40-L49.
# lines: L10-L31, and uncomment L33-L63.
data_root = 'data/cute80'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://download.openmmlab.com/mmocr/data/mixture/ct80/'
'timage.tar.gz',
save_name='ct80.tar.gz',
md5='9f3b1fe0e76f1fdfc70de3a365603d5e',
split=['test'],
content=['image'],
mapping=[['ct80/timage', 'textrecog_imgs/test']]),
# dict(
# url='https://download.openmmlab.com/mmocr/data/mixture/ct80/'
# 'test_label.txt',
# save_name='ct80_test.txt',
# md5='f679dec62916d3268aff9cd81990d260',
# split=['test'],
# content=['annotation'],
# mapping=[['ct80_test.txt', 'annotations/test.txt']])
dict(
url='https://download.openmmlab.com/mmocr/data/1.x/recog/ct80/'
'textrecog_test.json',
save_name='textrecog_test.json',
md5='9c5c79d843b900325e7fd453b318cad9',
split=['test'],
content=['annotation'])
])
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://download.openmmlab.com/mmocr/data/mixture/ct80/'
'timage.tar.gz',
save_name='ct80.tar.gz',
md5='9f3b1fe0e76f1fdfc70de3a365603d5e',
split=['test'],
content=['image'],
mapping=[['ct80/timage', 'textrecog_imgs/test']]),
dict(
url='https://download.openmmlab.com/mmocr/data/1.x/recog/ct80/'
'textrecog_test.json',
save_name='textrecog_test.json',
md5='9c5c79d843b900325e7fd453b318cad9',
split=['test'],
content=['annotation'])
]))
# data_converter = dict(
# type='TextRecogDataConverter',
# splits=['test'],
# data_root=data_root,
# gatherer=dict(type='mono_gather', test_ann='test.txt'),
# test_preparer = dict(
# obtainer=dict(
# type='NaiveDataObtainer',
# cache_path=cache_path,
# data_root=data_root,
# files=[
# dict(
# url='https://download.openmmlab.com/mmocr/data/mixture/ct80/'
# 'timage.tar.gz',
# save_name='ct80.tar.gz',
# md5='9f3b1fe0e76f1fdfc70de3a365603d5e',
# split=['test'],
# content=['image'],
# mapping=[['ct80/timage', 'textrecog_imgs/test']]),
# dict(
# url='https://download.openmmlab.com/mmocr/data/mixture/ct80/'
# 'test_label.txt',
# save_name='ct80_test.txt',
# md5='f679dec62916d3268aff9cd81990d260',
# split=['test'],
# content=['annotation'],
# mapping=[['ct80_test.txt', 'annotations/test.txt']])
# ]),
# gatherer=dict(type='MonoGatherer', ann_name='test.txt'),
# parser=dict(
# type='ICDARTxtTextRecogAnnParser',
# separator=' ',
# format='img text ignore1 ignore2'),
# dumper=dict(type='JsonDumper'))
# packer=dict(type='TextRecogPacker'),
# dumper=dict(type='JsonDumper'),
# )
delete = ['ct80']
config_generator = dict(
type='TextRecogConfigGenerator', data_root=data_root, train_anns=None)
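
The commented-out preparer above declares format='img text ignore1 ignore2' for ICDARTxtTextRecogAnnParser: positional names for each separator-delimited token, where the ignore* fields are presumably discarded. A rough sketch of that field binding, using a hypothetical annotation line:

# Hypothetical line from ct80_test.txt; only img and text are kept.
line = 'timage/001.jpg word 1 80'
fields = dict(zip('img text ignore1 ignore2'.split(), line.split(' ')))
img_path, transcription = fields['img'], fields['text']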


@@ -1,38 +1,62 @@
data_root = 'data/funsd'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://guillaumejaume.github.io/FUNSD/dataset.zip',
save_name='funsd.zip',
md5='e05de47de238aa343bf55d8807d659a9',
split=['train', 'test'],
content=['image', 'annotation'],
mapping=[
['funsd/dataset/training_data/images', 'textdet_imgs/train'],
['funsd/dataset/testing_data/images', 'textdet_imgs/test'],
[
'funsd/dataset/training_data/annotations',
'annotations/train'
],
['funsd/dataset/testing_data/annotations', 'annotations/test'],
]),
])
data_converter = dict(
type='TextDetDataConverter',
splits=['train', 'test'],
data_root=data_root,
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://guillaumejaume.github.io/FUNSD/dataset.zip',
save_name='funsd.zip',
md5='e05de47de238aa343bf55d8807d659a9',
content=['image', 'annotation'],
mapping=[
[
'funsd/dataset/training_data/images',
'textdet_imgs/train'
],
[
'funsd/dataset/training_data/annotations',
'annotations/train'
],
]),
]),
gatherer=dict(
type='pair_gather',
suffixes=['.png'],
type='PairGatherer',
img_suffixes=['.png'],
rule=[r'(\w+)\.png', r'\1.json']),
parser=dict(type='FUNSDTextDetAnnParser'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
delete=['annotations', 'funsd'])
)
config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://guillaumejaume.github.io/FUNSD/dataset.zip',
save_name='funsd.zip',
md5='e05de47de238aa343bf55d8807d659a9',
content=['image', 'annotation'],
mapping=[
['funsd/dataset/testing_data/images', 'textdet_imgs/test'],
[
'funsd/dataset/testing_data/annotations',
'annotations/test'
],
]),
]),
gatherer=dict(
type='PairGatherer',
img_suffixes=['.png'],
rule=[r'(\w+)\.png', r'\1.json']),
parser=dict(type='FUNSDTextDetAnnParser'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
)
delete = ['annotations', 'funsd']
config_generator = dict(type='TextDetConfigGenerator')
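
The gatherer's rule is a regex pair: the first pattern matches an image filename and the second is the substitution template that yields the corresponding annotation filename. A runnable illustration (the filename is hypothetical):

import re

pattern, template = r'(\w+)\.png', r'\1.json'
print(re.sub(pattern, template, '0000971160.png'))  # -> 0000971160.json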


@@ -1,5 +1,9 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextRecogCropConverter')
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'
_base_.train_preparer.packer.type = 'TextRecogCropPacker'
_base_.test_preparer.packer.type = 'TextRecogCropPacker'
config_generator = dict(type='TextRecogConfigGenerator')


@@ -1,5 +1,8 @@
_base_ = ['textdet.py']
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'
data_converter = dict(type='TextSpottingDataConverter')
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'
config_generator = dict(type='TextSpottingConfigGenerator')


@@ -1,52 +1,29 @@
data_root = 'data/icdar2013'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Training_Task12_Images.zip',
save_name='ic13_textdet_train_img.zip',
md5='a443b9649fda4229c9bc52751bad08fb',
split=['train'],
content=['image'],
mapping=[['ic13_textdet_train_img', 'textdet_imgs/train']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Test_Task12_Images.zip',
save_name='ic13_textdet_test_img.zip',
md5='af2e9f070c4c6a1c7bdb7b36bacf23e3',
split=['test'],
content=['image'],
mapping=[['ic13_textdet_test_img', 'textdet_imgs/test']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Training_Task1_GT.zip',
save_name='ic13_textdet_train_gt.zip',
md5='f3a425284a66cd67f455d389c972cce4',
split=['train'],
content=['annotation'],
mapping=[['ic13_textdet_train_gt', 'annotations/train']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Test_Task1_GT.zip',
save_name='ic13_textdet_test_gt.zip',
md5='3191c34cd6ac28b60f5a7db7030190fb',
split=['test'],
content=['annotation'],
mapping=[['ic13_textdet_test_gt', 'annotations/test']]),
])
data_converter = dict(
type='TextDetDataConverter',
splits=['train', 'test'],
data_root=data_root,
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Training_Task12_Images.zip',
save_name='ic13_textdet_train_img.zip',
md5='a443b9649fda4229c9bc52751bad08fb',
content=['image'],
mapping=[['ic13_textdet_train_img', 'textdet_imgs/train']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Training_Task1_GT.zip',
save_name='ic13_textdet_train_gt.zip',
md5='f3a425284a66cd67f455d389c972cce4',
content=['annotation'],
mapping=[['ic13_textdet_train_gt', 'annotations/train']]),
]),
gatherer=dict(
type='pair_gather',
suffixes=['.jpg'],
type='PairGatherer',
img_suffixes=['.jpg'],
rule=[r'(\w+)\.jpg', r'gt_\1.txt']),
parser=dict(
type='ICDARTxtTextDetAnnParser',
@@ -54,6 +31,45 @@ data_converter = dict(
format='x1 y1 x2 y2 trans',
separator=' ',
mode='xyxy'),
dumper=dict(type='JsonDumper'))
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
)
config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Test_Task12_Images.zip',
save_name='ic13_textdet_test_img.zip',
md5='af2e9f070c4c6a1c7bdb7b36bacf23e3',
content=['image'],
mapping=[['ic13_textdet_test_img', 'textdet_imgs/test']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Test_Task1_GT.zip',
save_name='ic13_textdet_test_gt.zip',
md5='3191c34cd6ac28b60f5a7db7030190fb',
content=['annotation'],
mapping=[['ic13_textdet_test_gt', 'annotations/test']]),
]),
gatherer=dict(
type='PairGatherer',
img_suffixes=['.jpg'],
rule=[r'(\w+)\.jpg', r'gt_\1.txt']),
parser=dict(
type='ICDARTxtTextDetAnnParser',
remove_strs=[',', '"'],
format='x1 y1 x2 y2 trans',
separator=' ',
mode='xyxy'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
)
delete = [
'annotations', 'ic13_textdet_train_img', 'ic13_textdet_train_gt',
'ic13_textdet_test_img', 'ic13_textdet_test_gt'
]
config_generator = dict(type='TextDetConfigGenerator')
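
The ICDARTxtTextDetAnnParser settings above describe one box per line: characters in remove_strs are stripped, the line is split on separator, and the tokens bind to 'x1 y1 x2 y2 trans', with mode='xyxy' meaning two corner points rather than width/height. A sketch on a hypothetical gt line:

line = '158 128 411 181 "Footpath"'
for ch in [',', '"']:  # remove_strs
    line = line.replace(ch, '')
x1, y1, x2, y2, trans = line.split(' ')  # format='x1 y1 x2 y2 trans'
bbox = [int(x1), int(y1), int(x2), int(y2)]  # xyxy: top-left, bottom-right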


@@ -8,87 +8,118 @@
data_root = 'data/icdar2013'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Training_Task3_Images_GT.zip',
save_name='ic13_textrecog_train_img_gt.zip',
md5='6f0dbc823645968030878df7543f40a4',
split=['train'],
content=['image'],
mapping=[[
'ic13_textrecog_train_img_gt/gt.txt', 'annotations/train.txt'
], ['ic13_textrecog_train_img_gt', 'textrecog_imgs/train']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Test_Task3_Images.zip',
save_name='ic13_textrecog_test_img.zip',
md5='3206778eebb3a5c5cc15c249010bf77f',
split=['test'],
content=['image'],
mapping=[['ic13_textrecog_test_img', 'textrecog_imgs/test']]),
dict(
url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
'icdar_2013/train_labels.json',
save_name='ic13_train_labels.json',
md5='008fcd0056e72c4cf3064fb4d1fce81b',
split=['train'],
content=['annotation'],
mapping=[['ic13_train_labels.json', 'textrecog_train.json']]),
# Note that we offer two versions of test set annotations as follows.
# Please choose one of them to download and comment the other. By
# default, we use the second one.
# 1. The original official annotation, which contains 1095 test
# samples.
# dict(
# url='https://rrc.cvc.uab.es/downloads/'
# 'Challenge2_Test_Task3_GT.txt',
# save_name='ic13_textrecog_test_gt.txt',
# md5='2634060ed8fe6e7a4a9b8d68785835a1',
# split=['test'],
# content=['annotation'],
# mapping=[['ic13_textrecog_test_gt.txt', 'annotations/test.txt']]), # noqa
# 2. The widely-used version for academic purpose, which filters out
# words with non-alphanumeric characters. This version contains 1015
# test samples.
dict(
url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
'icdar_2013/textrecog_test_1015.json',
save_name='textrecog_test.json',
md5='68fdd818f63df8b93dc952478952009a',
split=['test'],
content=['annotation'],
),
# 3. The 857 version further pruned words shorter than 3 characters.
dict(
url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
'icdar_2013/textrecog_test_857.json',
save_name='textrecog_test_857.json',
md5='3bed3985b0c51a989ad4006f6de8352b',
split=['test'],
content=['annotation'],
),
])
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Training_Task3_Images_GT.zip',
save_name='ic13_textrecog_train_img_gt.zip',
md5='6f0dbc823645968030878df7543f40a4',
content=['image'],
mapping=[
# ['ic13_textrecog_train_img_gt/gt.txt',
# 'annotations/train.txt'],
['ic13_textrecog_train_img_gt', 'textrecog_imgs/train']
]),
dict(
url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
'icdar_2013/train_labels.json',
save_name='ic13_train_labels.json',
md5='008fcd0056e72c4cf3064fb4d1fce81b',
content=['annotation'],
mapping=[['ic13_train_labels.json', 'textrecog_train.json']]),
]))
# Uncomment the data converter if you want to use the original 1095 version.
# data_converter = dict(
# type='TextRecogDataConverter',
# splits=['train', 'test'],
# data_root=data_root,
# gatherer=dict(
# type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
# Note that we offer two versions of test set annotations as follows. Please
# choose one of them to download and comment the other. By default, we use the
# second one.
# 1. The original official annotation, which contains 1095 test
# samples.
# Uncomment the test_preparer if you want to use the original 1095 version.
# test_preparer = dict(
# obtainer=dict(
# type='NaiveDataObtainer',
# cache_path=cache_path,
# files=[
# dict(
# url='https://rrc.cvc.uab.es/downloads/'
# 'Challenge2_Test_Task3_Images.zip',
# save_name='ic13_textrecog_test_img.zip',
# md5='3206778eebb3a5c5cc15c249010bf77f',
# split=['test'],
# content=['image'],
# mapping=[['ic13_textrecog_test_img',
# 'textrecog_imgs/test']]),
# dict(
# url='https://rrc.cvc.uab.es/downloads/'
# 'Challenge2_Test_Task3_GT.txt',
# save_name='ic13_textrecog_test_gt.txt',
# md5='2634060ed8fe6e7a4a9b8d68785835a1',
# split=['test'],
# content=['annotation'],
# mapping=[[
# 'ic13_textrecog_test_gt.txt', 'annotations/test.txt'
# ]]), # noqa
# # The 857 version further pruned words shorter than 3 characters.
# dict(
# url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
# 'icdar_2013/textrecog_test_857.json',
# save_name='textrecog_test_857.json',
# md5='3bed3985b0c51a989ad4006f6de8352b',
# split=['test'],
# content=['annotation'],
# ),
# ]),
# gatherer=dict(type='MonoGatherer', ann_name='test.txt'),
# parser=dict(
# type='ICDARTxtTextRecogAnnParser', separator=', ',
# format='img, text'), # noqa
# dumper=dict(type='JsonDumper'))
# packer=dict(type='TextRecogPacker'),
# dumper=dict(type='JsonDumper'),
# )
# 2. The widely-used version for academic purpose, which filters
# out words with non-alphanumeric characters. This version contains
# 1015 test samples.
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge2_Test_Task3_Images.zip',
save_name='ic13_textrecog_test_img.zip',
md5='3206778eebb3a5c5cc15c249010bf77f',
split=['test'],
content=['image'],
mapping=[['ic13_textrecog_test_img', 'textrecog_imgs/test']]),
dict(
url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
'icdar_2013/textrecog_test_1015.json',
save_name='textrecog_test.json',
md5='68fdd818f63df8b93dc952478952009a',
split=['test'],
content=['annotation'],
),
# The 857 version further pruned words shorter than 3 characters.
dict(
url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
'icdar_2013/textrecog_test_857.json',
save_name='textrecog_test_857.json',
md5='3bed3985b0c51a989ad4006f6de8352b',
split=['test'],
content=['annotation'],
),
]))
config_generator = dict(
type='TextRecogConfigGenerator',
data_root=data_root,
test_anns=[
dict(ann_file='textrecog_test.json'),
dict(dataset_postfix='857', ann_file='textrecog_test_857.json')


@@ -1,5 +1,8 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextSpottingDataConverter')
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'
config_generator = dict(type='TextSpottingConfigGenerator')


@@ -1,53 +1,60 @@
data_root = 'data/icdar2015'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/ch4_training_images.zip',
save_name='ic15_textdet_train_img.zip',
md5='c51cbace155dcc4d98c8dd19d378f30d',
split=['train'],
content=['image'],
mapping=[['ic15_textdet_train_img', 'textdet_imgs/train']]),
dict(
url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip',
save_name='ic15_textdet_test_img.zip',
md5='97e4c1ddcf074ffcc75feff2b63c35dd',
split=['test'],
content=['image'],
mapping=[['ic15_textdet_test_img', 'textdet_imgs/test']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'ch4_training_localization_transcription_gt.zip',
save_name='ic15_textdet_train_gt.zip',
md5='3bfaf1988960909014f7987d2343060b',
split=['train'],
content=['annotation'],
mapping=[['ic15_textdet_train_gt', 'annotations/train']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge4_Test_Task4_GT.zip',
save_name='ic15_textdet_test_gt.zip',
md5='8bce173b06d164b98c357b0eb96ef430',
split=['test'],
content=['annotation'],
mapping=[['ic15_textdet_test_gt', 'annotations/test']]),
])
data_converter = dict(
type='TextDetDataConverter',
splits=['train', 'test'],
data_root=data_root,
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/ch4_training_images.zip',
save_name='ic15_textdet_train_img.zip',
md5='c51cbace155dcc4d98c8dd19d378f30d',
content=['image'],
mapping=[['ic15_textdet_train_img', 'textdet_imgs/train']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'ch4_training_localization_transcription_gt.zip',
save_name='ic15_textdet_train_gt.zip',
md5='3bfaf1988960909014f7987d2343060b',
content=['annotation'],
mapping=[['ic15_textdet_train_gt', 'annotations/train']]),
]),
gatherer=dict(
type='pair_gather',
suffixes=['.jpg', '.JPG'],
type='PairGatherer',
img_suffixes=['.jpg', '.JPG'],
rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
parser=dict(type='ICDARTxtTextDetAnnParser', encoding='utf-8-sig'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img'])
)
config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip',
save_name='ic15_textdet_test_img.zip',
md5='97e4c1ddcf074ffcc75feff2b63c35dd',
content=['image'],
mapping=[['ic15_textdet_test_img', 'textdet_imgs/test']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge4_Test_Task4_GT.zip',
save_name='ic15_textdet_test_gt.zip',
md5='8bce173b06d164b98c357b0eb96ef430',
content=['annotation'],
mapping=[['ic15_textdet_test_gt', 'annotations/test']]),
]),
gatherer=dict(
type='PairGatherer',
img_suffixes=['.jpg', '.JPG'],
rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
parser=dict(type='ICDARTxtTextDetAnnParser', encoding='utf-8-sig'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
)
config_generator = dict(type='TextDetConfigGenerator')
delete = ['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img']


@@ -4,61 +4,66 @@
data_root = 'data/icdar2015'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/'
'ch4_training_word_images_gt.zip',
save_name='ic15_textrecog_train_img_gt.zip',
md5='600caf8c6a64a3dcf638839820edcca9',
split=['train'],
content=['image', 'annotation'],
mapping=[[
'ic15_textrecog_train_img_gt/gt.txt', 'annotations/train.txt'
], ['ic15_textrecog_train_img_gt', 'textrecog_imgs/train']]),
dict(
url='https://rrc.cvc.uab.es/downloads/ch4_test_word_images_gt.zip',
save_name='ic15_textrecog_test_img.zip',
md5='d7a71585f4cc69f89edbe534e7706d5d',
split=['test'],
content=['image'],
mapping=[['ic15_textrecog_test_img', 'textrecog_imgs/test']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge4_Test_Task3_GT.txt',
save_name='ic15_textrecog_test_gt.txt',
md5='d7a71585f4cc69f89edbe534e7706d5d',
split=['test'],
content=['annotation'],
mapping=[['ic15_textrecog_test_gt.txt', 'annotations/test.txt']]),
# 3. The 1811 version discards non-alphanumeric character images and
# some extremely rotated, perspective-shifted, and curved images for
# evaluation
dict(
url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
'icdar_2015/textrecog_test_1811.json',
save_name='textrecog_test_1811.json',
md5='8d218ef1c37540ea959e22eeabc79ae4',
split=['test'],
content=['annotation'],
),
])
data_converter = dict(
type='TextRecogDataConverter',
splits=['train', 'test'],
data_root=data_root,
gatherer=dict(
type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/'
'ch4_training_word_images_gt.zip',
save_name='ic15_textrecog_train_img_gt.zip',
md5='600caf8c6a64a3dcf638839820edcca9',
content=['image', 'annotation'],
mapping=[[
'ic15_textrecog_train_img_gt/gt.txt',
'annotations/train.txt'
], ['ic15_textrecog_train_img_gt', 'textrecog_imgs/train']]),
]),
gatherer=dict(type='MonoGatherer', ann_name='train.txt'),
parser=dict(type='ICDARTxtTextRecogAnnParser', encoding='utf-8-sig'),
packer=dict(type='TextRecogPacker'),
dumper=dict(type='JsonDumper'))
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://rrc.cvc.uab.es/downloads/'
'ch4_test_word_images_gt.zip',
save_name='ic15_textrecog_test_img.zip',
md5='d7a71585f4cc69f89edbe534e7706d5d',
content=['image'],
mapping=[['ic15_textrecog_test_img', 'textrecog_imgs/test']]),
dict(
url='https://rrc.cvc.uab.es/downloads/'
'Challenge4_Test_Task3_GT.txt',
save_name='ic15_textrecog_test_gt.txt',
md5='d7a71585f4cc69f89edbe534e7706d5d',
content=['annotation'],
mapping=[[
'ic15_textrecog_test_gt.txt', 'annotations/test.txt'
]]),
# 3. The 1811 version discards non-alphanumeric character images
# and some extremely rotated, perspective-shifted, and curved
# images for evaluation
dict(
url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
'icdar_2015/textrecog_test_1811.json',
save_name='textrecog_test_1811.json',
md5='8d218ef1c37540ea959e22eeabc79ae4',
content=['annotation'],
),
]),
gatherer=dict(type='MonoGatherer', ann_name='test.txt'),
parser=dict(type='ICDARTxtTextRecogAnnParser', encoding='utf-8-sig'),
packer=dict(type='TextRecogPacker'),
dumper=dict(type='JsonDumper'))
delete = ['annotations']
config_generator = dict(
type='TextRecogConfigGenerator',
data_root=data_root,
test_anns=[
dict(ann_file='textrecog_test.json'),
dict(dataset_postfix='1811', ann_file='textrecog_test_1811.json')


@@ -1,5 +1,7 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextSpottingDataConverter')
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'
config_generator = dict(type='TextSpottingConfigGenerator')


@@ -1,50 +1,64 @@
data_root = 'data/iiit5k'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='http://cvit.iiit.ac.in/projects/SceneTextUnderstanding/'
'IIIT5K-Word_V3.0.tar.gz',
save_name='IIIT5K.tar.gz',
md5='56781bc327d22066aa1c239ee788fd46',
split=['test', 'train'],
content=['image'],
mapping=[['IIIT5K/IIIT5K/test', 'textrecog_imgs/test'],
['IIIT5K/IIIT5K/train', 'textrecog_imgs/train']]),
dict(
url='https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/'
'test_label.txt',
save_name='iiit5k_test.txt',
md5='82ecfa34a28d59284d1914dc906f5380',
split=['test'],
content=['annotation'],
mapping=[['iiit5k_test.txt', 'annotations/test.txt']]),
dict(
url='https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/'
'train_label.txt',
save_name='iiit5k_train.txt',
md5='f4731ce1eadc259532c2834266e5126d',
split=['train'],
content=['annotation'],
mapping=[['iiit5k_train.txt', 'annotations/train.txt']]),
])
data_converter = dict(
type='TextRecogDataConverter',
splits=['train', 'test'],
data_root=data_root,
gatherer=dict(
type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='http://cvit.iiit.ac.in/projects/SceneTextUnderstanding/'
'IIIT5K-Word_V3.0.tar.gz',
save_name='IIIT5K.tar.gz',
md5='56781bc327d22066aa1c239ee788fd46',
content=['image'],
mapping=[['IIIT5K/IIIT5K/train', 'textrecog_imgs/train']]),
dict(
url='https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/'
'train_label.txt',
save_name='iiit5k_train.txt',
md5='f4731ce1eadc259532c2834266e5126d',
content=['annotation'],
mapping=[['iiit5k_train.txt', 'annotations/train.txt']])
]),
gatherer=dict(type='MonoGatherer', ann_name='train.txt'),
parser=dict(
type='ICDARTxtTextRecogAnnParser',
encoding='utf-8',
separator=' ',
format='img text'),
packer=dict(type='TextRecogPacker'),
dumper=dict(type='JsonDumper'),
delete=['annotations', 'IIIT5K'])
)
config_generator = dict(type='TextRecogConfigGenerator', data_root=data_root)
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='http://cvit.iiit.ac.in/projects/SceneTextUnderstanding/'
'IIIT5K-Word_V3.0.tar.gz',
save_name='IIIT5K.tar.gz',
md5='56781bc327d22066aa1c239ee788fd46',
content=['image'],
mapping=[['IIIT5K/IIIT5K/test', 'textrecog_imgs/test']]),
dict(
url='https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/'
'test_label.txt',
save_name='iiit5k_test.txt',
md5='82ecfa34a28d59284d1914dc906f5380',
content=['annotation'],
mapping=[['iiit5k_test.txt', 'annotations/test.txt']])
]),
gatherer=dict(type='MonoGatherer', ann_name='test.txt'),
parser=dict(
type='ICDARTxtTextRecogAnnParser',
encoding='utf-8',
separator=' ',
format='img text'),
packer=dict(type='TextRecogPacker'),
dumper=dict(type='JsonDumper'),
)
delete = ['annotations', 'IIIT5K']
config_generator = dict(type='TextRecogConfigGenerator')


@@ -1,17 +1,15 @@
data_root = 'data/naf'
cache_path = 'data/cache'
data_obtainer = dict(
obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://github.com/herobd/NAF_dataset/releases/'
'download/v1.0/labeled_images.tar.gz',
save_name='naf_image.tar.gz',
md5='6521cdc25c313a1f2928a16a77ad8f29',
split=['train', 'test', 'val'],
content=['image'],
mapping=[['naf_image/labeled_images', 'temp_images/']]),
dict(
@@ -19,7 +17,6 @@ data_obtainer = dict(
'refs/heads/master.zip',
save_name='naf_anno.zip',
md5='abf5af6266cc527d772231751bc884b3',
split=['train', 'test', 'val'],
content=['annotation'],
mapping=[
[
@@ -33,17 +30,21 @@ data_obtainer = dict(
]),
])
data_converter = dict(
type='TextDetDataConverter',
splits=['train', 'test', 'val'],
data_root=data_root,
gatherer=dict(type='naf_gather'),
parser=dict(type='NAFAnnParser', data_root=data_root, det=True),
delete=['temp_images', 'data_split.json', 'annotations', 'naf_anno'],
train_preparer = dict(
obtainer=obtainer,
gatherer=dict(type='NAFGatherer'),
parser=dict(type='NAFAnnParser', det=True),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
nproc=1)
)
test_preparer = train_preparer
val_preparer = train_preparer
delete = [
'temp_images', 'data_split.json', 'annotations', 'naf_anno', 'naf_image'
]
config_generator = dict(
type='TextDetConfigGenerator',
data_root=data_root,
val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')])
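
Here one preparer object serves all three splits (test_preparer = train_preparer), with the NAFGatherer presumably routing images into splits via the data_split.json listed in delete. A hypothetical sketch of such a split lookup; the file schema is an assumption, not taken from this diff:

import json

def load_split(split_file, split):
    # Assumed schema: {'train': [...], 'test': [...], 'val': [...]}
    with open(split_file) as f:
        return set(json.load(f)[split])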


@@ -4,16 +4,15 @@
# not to use them for recognition and text spotting.
_base_ = ['textdet.py']
data_root = 'data/naf'
data_converter = dict(
type='TextRecogCropConverter',
parser=dict(
type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
det=False),
delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])
_base_.train_preparer.parser.update(dict(ignore=['¿', '§'], det=False))
_base_.test_preparer.parser.update(dict(ignore=['¿', '§'], det=False))
_base_.val_preparer.parser.update(dict(ignore=['¿', '§'], det=False))
_base_.train_preparer.packer.type = 'TextRecogCropPacker'
_base_.test_preparer.packer.type = 'TextRecogCropPacker'
_base_.val_preparer.packer.type = 'TextRecogCropPacker'
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'
_base_.val_preparer.gatherer.img_dir = 'textdet_imgs/val'
config_generator = dict(
type='TextRecogConfigGenerator',
data_root=data_root,
val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')])


@@ -4,15 +4,16 @@
# not to use them for recognition and text spotting.
_base_ = ['textdet.py']
data_root = 'data/naf'
data_converter = dict(
type='TextSpottingDataConverter',
parser=dict(
type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
det=False),
delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])
_base_.train_preparer.parser.update(dict(ignore=['¿', '§'], det=False))
_base_.test_preparer.parser.update(dict(ignore=['¿', '§'], det=False))
_base_.val_preparer.parser.update(dict(ignore=['¿', '§'], det=False))
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'
_base_.val_preparer.packer.type = 'TextSpottingPacker'
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'
_base_.val_preparer.gatherer.img_dir = 'textdet_imgs/val'
config_generator = dict(
type='TextSpottingConfigGenerator',
data_root=data_root,
val_anns=[dict(ann_file='textspotting_val.json', dataset_postfix='')])


@@ -1,55 +1,64 @@
data_root = 'data/sroie'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://download.openmmlab.com/mmocr/data/'
'sroie/0325updated.task1train(626p).zip',
save_name='0325updated.task1train(626p).zip',
md5='16137490f6865caac75772b9111d348c',
split=['train'],
content=['image', 'annotation'],
mapping=[[
'0325updated/0325updated.task1train(626p)/*.jpg',
'textdet_imgs/train'
],
[
'0325updated/0325updated.task1train(626p)/*.txt',
'annotations/train'
]]),
dict(
url='https://download.openmmlab.com/mmocr/data/'
'sroie/task1&2_test(361p).zip',
save_name='task1&2_test(361p).zip',
md5='1bde54705db0995c57a6e34cce437fea',
split=['test'],
content=['image'],
mapping=[[
'task1&2_test(361p)/fulltext_test(361p)', 'textdet_imgs/test'
]]),
dict(
url='https://download.openmmlab.com/mmocr/data/sroie/text.zip',
save_name='text.zip',
md5='8c534653f252ff4d3943fa27a956a74b',
split=['test'],
content=['annotation'],
mapping=[['text', 'annotations/test']]),
])
data_converter = dict(
type='TextDetDataConverter',
splits=['train', 'test'],
data_root=data_root,
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://download.openmmlab.com/mmocr/data/'
'sroie/0325updated.task1train(626p).zip',
save_name='0325updated.task1train(626p).zip',
md5='16137490f6865caac75772b9111d348c',
content=['image', 'annotation'],
mapping=[[
'0325updated/0325updated.task1train(626p)/*.jpg',
'textdet_imgs/train'
],
[
'0325updated/0325updated.task1train(626p)/*.txt',
'annotations/train'
]])
]),
gatherer=dict(
type='pair_gather',
suffixes=['.jpg'],
type='PairGatherer',
img_suffixes=['.jpg'],
rule=[r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']),
parser=dict(type='SROIETextDetAnnParser', encoding='utf-8-sig'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
delete=['text', 'task1&2_test(361p)', '0325updated', 'annotations'])
)
config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://download.openmmlab.com/mmocr/data/'
'sroie/task1&2_test(361p).zip',
save_name='task1&2_test(361p).zip',
md5='1bde54705db0995c57a6e34cce437fea',
content=['image'],
mapping=[[
'task1&2_test(361p)/fulltext_test(361p)',
'textdet_imgs/test'
]]),
dict(
url='https://download.openmmlab.com/mmocr/data/sroie/text.zip',
save_name='text.zip',
md5='8c534653f252ff4d3943fa27a956a74b',
content=['annotation'],
mapping=[['text', 'annotations/test']]),
]),
gatherer=dict(
type='PairGatherer',
img_suffixes=['.jpg'],
rule=[r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']),
parser=dict(type='SROIETextDetAnnParser', encoding='utf-8-sig'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
)
delete = ['text', 'task1&2_test(361p)', '0325updated', 'annotations']
config_generator = dict(type='TextDetConfigGenerator')


@@ -1,5 +1,8 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextRecogCropConverter')
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'
_base_.train_preparer.packer.type = 'TextRecogCropPacker'
_base_.test_preparer.packer.type = 'TextRecogCropPacker'
config_generator = dict(type='TextRecogConfigGenerator')


@@ -1,5 +1,8 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextSpottingDataConverter')
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'
config_generator = dict(type='TextSpottingConfigGenerator')


@@ -1,30 +1,44 @@
data_root = 'data/svt'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='http://www.iapr-tc11.org/dataset/SVT/svt.zip',
save_name='svt.zip',
md5='42d19160010d990ae6223b14f45eff88',
split=['train', 'test'],
content=['image', 'annotations'],
mapping=[['svt/svt1/train.xml', 'annotations/train.xml'],
['svt/svt1/test.xml', 'annotations/test.xml'],
['svt/svt1/img', 'textdet_imgs/img']]),
])
data_converter = dict(
type='TextDetDataConverter',
splits=['train', 'test'],
data_root=data_root,
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='http://www.iapr-tc11.org/dataset/SVT/svt.zip',
save_name='svt.zip',
md5='42d19160010d990ae6223b14f45eff88',
content=['image', 'annotations'],
mapping=[['svt/svt1/train.xml', 'annotations/train.xml'],
['svt/svt1/img', 'textdet_imgs/img']]),
]),
gatherer=dict(
type='mono_gather', train_ann='train.xml', test_ann='test.xml'),
parser=dict(type='SVTTextDetAnnParser', data_root=data_root),
type='MonoGatherer', ann_name='train.xml', img_dir='textdet_imgs/img'),
parser=dict(type='SVTTextDetAnnParser'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
delete=['annotations', 'svt'])
)
config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='http://www.iapr-tc11.org/dataset/SVT/svt.zip',
save_name='svt.zip',
md5='42d19160010d990ae6223b14f45eff88',
content=['image', 'annotations'],
mapping=[['svt/svt1/test.xml', 'annotations/test.xml'],
['svt/svt1/img', 'textdet_imgs/img']]),
]),
gatherer=dict(
type='MonoGatherer', ann_name='test.xml', img_dir='textdet_imgs/img'),
parser=dict(type='SVTTextDetAnnParser'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
)
delete = ['annotations', 'svt']
config_generator = dict(type='TextDetConfigGenerator')


@@ -1,5 +1,6 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextRecogCropConverter')
_base_.train_preparer.packer.type = 'TextRecogCropPacker'
_base_.test_preparer.packer.type = 'TextRecogCropPacker'
config_generator = dict(type='TextRecogConfigGenerator')


@@ -1,5 +1,6 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextSpottingDataConverter')
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'
config_generator = dict(type='TextSpottingConfigGenerator')


@@ -1,29 +1,23 @@
data_root = 'data/svtp'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://download.openmmlab.com/mmocr/data/svtp.zip',
save_name='svtp.zip',
md5='4232b46c81ba99eea6d057dcb06b8f75',
split=['test'],
content=['image', 'annotation'],
mapping=[['svtp/par1', 'textrecog_imgs/test'],
['svtp/gt.txt', 'annotations/test.txt']]),
])
data_converter = dict(
type='TextRecogDataConverter',
splits=['test'],
data_root=data_root,
gatherer=dict(type='mono_gather', test_ann='test.txt'),
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://download.openmmlab.com/mmocr/data/svtp.zip',
save_name='svtp.zip',
md5='4232b46c81ba99eea6d057dcb06b8f75',
content=['image', 'annotation'],
mapping=[['svtp/par1', 'textrecog_imgs/test'],
['svtp/gt.txt', 'annotations/test.txt']]),
]),
gatherer=dict(type='MonoGatherer', ann_name='test.txt'),
parser=dict(
type='ICDARTxtTextRecogAnnParser', separator=' ', format='img text'),
dumper=dict(type='JsonDumper'),
delete=['svtp', 'annotations'])
config_generator = dict(type='TextRecogConfigGenerator', data_root=data_root)
packer=dict(type='TextRecogPacker'),
dumper=dict(type='JsonDumper'))
delete = ['svtp', 'annotations']
config_generator = dict(type='TextRecogConfigGenerator')


@@ -1,52 +1,67 @@
data_root = 'data/textocr'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://dl.fbaipublicfiles.com/textvqa/images/'
'train_val_images.zip',
save_name='textocr_textdet_train_val_img.zip',
md5='d12dd8098899044e4ae1af34db7ecfef',
split=['train', 'val'],
content=['image'],
mapping=[[
'textocr_textdet_train_val_img/train_images',
'textdet_imgs/train'
]]),
dict(
url='https://dl.fbaipublicfiles.com/textvqa/data/textocr/'
'TextOCR_0.1_train.json',
save_name='textocr_textdet_train.json',
md5='0f8ba1beefd2ca4d08a4f82bcbe6cfb4',
split=['train'],
content=['annotation'],
mapping=[['textocr_textdet_train.json',
'annotations/train.json']]),
dict(
url='https://dl.fbaipublicfiles.com/textvqa/data/textocr/'
'TextOCR_0.1_val.json',
save_name='textocr_textdet_val.json',
md5='fb151383ea7b3c530cde9ef0d5c08347',
split=['val'],
content=['annotation'],
mapping=[['textocr_textdet_val.json', 'annotations/val.json']]),
])
data_converter = dict(
type='TextDetDataConverter',
splits=['train', 'val'],
data_root=data_root,
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://dl.fbaipublicfiles.com/textvqa/images/'
'train_val_images.zip',
save_name='textocr_textdet_img.zip',
md5='d12dd8098899044e4ae1af34db7ecfef',
content=['image'],
mapping=[[
'textocr_textdet_img/train_images', 'textdet_imgs/images'
]]),
dict(
url='https://dl.fbaipublicfiles.com/textvqa/data/textocr/'
'TextOCR_0.1_train.json',
save_name='textocr_textdet_train.json',
md5='0f8ba1beefd2ca4d08a4f82bcbe6cfb4',
content=['annotation'],
mapping=[[
'textocr_textdet_train.json', 'annotations/train.json'
]]),
]),
gatherer=dict(
type='mono_gather', train_ann='train.json', val_ann='val.json'),
parser=dict(
type='COCOTextDetAnnParser',
variant='textocr',
data_root=data_root + '/textdet_imgs/'),
dumper=dict(type='JsonDumper'),
delete=['annotations', 'textocr_textdet_train_val_img'])
type='MonoGatherer',
ann_name='train.json',
img_dir='textdet_imgs/images'),
parser=dict(type='COCOTextDetAnnParser', variant='textocr'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'))
config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
val_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://dl.fbaipublicfiles.com/textvqa/images/'
'train_val_images.zip',
save_name='textocr_textdet_img.zip',
md5='d12dd8098899044e4ae1af34db7ecfef',
content=['image'],
mapping=[[
'textocr_textdet_img/train_images', 'textdet_imgs/images'
]]),
dict(
url='https://dl.fbaipublicfiles.com/textvqa/data/textocr/'
'TextOCR_0.1_val.json',
save_name='textocr_textdet_val.json',
md5='fb151383ea7b3c530cde9ef0d5c08347',
content=['annotation'],
mapping=[['textocr_textdet_val.json',
'annotations/val.json']]),
]),
gatherer=dict(
type='MonoGatherer',
ann_name='val.json',
img_dir='textdet_imgs/images'),
parser=dict(type='COCOTextDetAnnParser', variant='textocr'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'))
delete = ['annotations', 'textocr_textdet_img']
config_generator = dict(type='TextDetConfigGenerator')
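
Both preparers reference the same train_val_images.zip; since NaiveDataObtainer caches downloads under cache_path keyed by save_name and md5, the archive should only be fetched once. A sketch of that cache check, with the implementation details assumed:

import hashlib
import os.path as osp

def is_cached(cache_path, save_name, md5):
    path = osp.join(cache_path, save_name)
    if not osp.exists(path):
        return False
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest() == md5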


@@ -1,5 +1,6 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextRecogCropConverter')
_base_.train_preparer.packer.type = 'TextRecogCropPacker'
_base_.val_preparer.packer.type = 'TextRecogCropPacker'
config_generator = dict(type='TextRecogConfigGenerator')


@@ -1,5 +1,6 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextSpottingDataConverter')
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.val_preparer.packer.type = 'TextSpottingPacker'
config_generator = dict(type='TextSpottingConfigGenerator')


@@ -1,41 +1,62 @@
data_root = 'data/totaltext'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://universityofadelaide.box.com/shared/static/'
'8xro7hnvb0sqw5e5rxm73tryc59j6s43.zip',
save_name='totaltext.zip',
md5='5b56d71a4005a333cf200ff35ce87f75',
split=['train', 'test'],
content=['image'],
mapping=[['totaltext/Images/Train', 'textdet_imgs/train'],
['totaltext/Images/Test', 'textdet_imgs/test']]),
dict(
url='https://universityofadelaide.box.com/shared/static/'
'2vmpvjb48pcrszeegx2eznzc4izan4zf.zip',
save_name='txt_format.zip',
md5='53377a83420b4a0244304467512134e8',
split=['train', 'test'],
content=['annotation'],
mapping=[['txt_format/Train', 'annotations/train'],
['txt_format/Test', 'annotations/test']]),
])
data_converter = dict(
type='TextDetDataConverter',
splits=['train', 'test'],
data_root=data_root,
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://universityofadelaide.box.com/shared/static/'
'8xro7hnvb0sqw5e5rxm73tryc59j6s43.zip',
save_name='totaltext.zip',
md5='5b56d71a4005a333cf200ff35ce87f75',
content=['image'],
mapping=[['totaltext/Images/Train', 'textdet_imgs/train']]),
dict(
url='https://universityofadelaide.box.com/shared/static/'
'2vmpvjb48pcrszeegx2eznzc4izan4zf.zip',
save_name='txt_format.zip',
md5='53377a83420b4a0244304467512134e8',
content=['annotation'],
mapping=[['txt_format/Train', 'annotations/train']]),
]),
gatherer=dict(
type='pair_gather',
suffixes=['.jpg', '.JPG'],
type='PairGatherer',
img_suffixes=['.jpg', '.JPG'],
rule=[r'img(\d+)\.([jJ][pP][gG])', r'poly_gt_img\1.txt']),
parser=dict(type='TotaltextTextDetAnnParser', data_root=data_root),
parser=dict(type='TotaltextTextDetAnnParser'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
delete=['totaltext', 'txt_format', 'annotations'])
)
config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://universityofadelaide.box.com/shared/static/'
'8xro7hnvb0sqw5e5rxm73tryc59j6s43.zip',
save_name='totaltext.zip',
md5='5b56d71a4005a333cf200ff35ce87f75',
content=['image'],
mapping=[['totaltext/Images/Test', 'textdet_imgs/test']]),
dict(
url='https://universityofadelaide.box.com/shared/static/'
'2vmpvjb48pcrszeegx2eznzc4izan4zf.zip',
save_name='txt_format.zip',
md5='53377a83420b4a0244304467512134e8',
content=['annotation'],
mapping=[['txt_format/Test', 'annotations/test']]),
]),
gatherer=dict(
type='PairGatherer',
img_suffixes=['.jpg', '.JPG'],
rule=[r'img(\d+)\.([jJ][pP][gG])', r'poly_gt_img\1.txt']),
parser=dict(type='TotaltextTextDetAnnParser'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper'),
)
delete = ['totaltext', 'txt_format', 'annotations']
config_generator = dict(type='TextDetConfigGenerator')


@@ -1,5 +1,8 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextRecogCropConverter')
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'
_base_.train_preparer.packer.type = 'TextRecogCropPacker'
_base_.test_preparer.packer.type = 'TextRecogCropPacker'
config_generator = dict(type='TextRecogConfigGenerator')


@@ -1,5 +1,8 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextSpottingDataConverter')
_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test'
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'
config_generator = dict(type='TextSpottingConfigGenerator')


@@ -1,35 +1,71 @@
data_root = 'data/wildreceipt'
cache_path = 'data/cache'
data_obtainer = dict(
type='NaiveDataObtainer',
cache_path=cache_path,
data_root=data_root,
files=[
dict(
url='https://download.openmmlab.com/mmocr/data/wildreceipt.tar',
save_name='wildreceipt.tar',
md5='2a2c4a1b4777fb4fe185011e17ad46ae',
split=['train', 'test'],
content=['image', 'annotation'],
mapping=[
['wildreceipt/wildreceipt/class_list.txt', 'class_list.txt'],
['wildreceipt/wildreceipt/dict.txt', 'dict.txt'],
['wildreceipt/wildreceipt/test.txt', 'test.txt'],
['wildreceipt/wildreceipt/train.txt', 'train.txt'],
['wildreceipt/wildreceipt/image_files', 'image_files'],
]),
])
data_converter = dict(
type='WildReceiptConverter',
splits=['train', 'test'],
data_root=data_root,
train_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://download.openmmlab.com/mmocr/data/'
'wildreceipt.tar',
save_name='wildreceipt.tar',
md5='2a2c4a1b4777fb4fe185011e17ad46ae',
content=['image', 'annotation'],
mapping=[
[
'wildreceipt/wildreceipt/class_list.txt',
'class_list.txt'
],
['wildreceipt/wildreceipt/dict.txt', 'dict.txt'],
[
'wildreceipt/wildreceipt/train.txt',
'annotations/train.txt'
],
[
'wildreceipt/wildreceipt/image_files/*/*/*.*',
'image_files'
],
]),
]),
gatherer=dict(
type='mono_gather',
train_ann='train.txt',
test_ann='test.txt',
ann_path=data_root),
parser=dict(type='WildreceiptKIEAnnParser', data_root=data_root),
type='MonoGatherer', ann_name='train.txt', img_dir='image_files'),
parser=dict(type='WildreceiptKIEAnnParser'),
packer=dict(type='WildReceiptPacker'),
dumper=dict(type='WildreceiptOpensetDumper'),
delete=['wildreceipt'])
)
test_preparer = dict(
obtainer=dict(
type='NaiveDataObtainer',
cache_path=cache_path,
files=[
dict(
url='https://download.openmmlab.com/mmocr/data/'
'wildreceipt.tar',
save_name='wildreceipt.tar',
md5='2a2c4a1b4777fb4fe185011e17ad46ae',
content=['image', 'annotation'],
mapping=[
[
'wildreceipt/wildreceipt/class_list.txt',
'class_list.txt'
],
['wildreceipt/wildreceipt/dict.txt', 'dict.txt'],
[
'wildreceipt/wildreceipt/test.txt',
'annotations/test.txt'
],
[
'wildreceipt/wildreceipt/image_files/*/*/*.*',
'image_files'
],
]),
]),
gatherer=dict(
type='MonoGatherer', img_dir='image_files', ann_name='test.txt'),
parser=dict(type='WildreceiptKIEAnnParser'),
packer=dict(type='WildReceiptPacker'),
dumper=dict(type='WildreceiptOpensetDumper'),
)
delete = ['wildreceipt', 'annotations']


@@ -1,9 +1,14 @@
_base_ = ['kie.py']
data_converter = dict(
type='TextDetDataConverter',
parser=dict(type='WildreceiptTextDetAnnParser'),
dumper=dict(type='JsonDumper'))
_base_.train_preparer.update(
dict(
parser=dict(type='WildreceiptTextDetAnnParser'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper')))
_base_.test_preparer.update(
dict(
parser=dict(type='WildreceiptTextDetAnnParser'),
packer=dict(type='TextDetPacker'),
dumper=dict(type='JsonDumper')))
config_generator = dict(
type='TextRecogConfigGenerator', data_root=_base_.data_root)
config_generator = dict(type='TextDetConfigGenerator')


@@ -1,4 +1,15 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextRecogCropConverter')
_base_.train_preparer.update(
dict(
parser=dict(type='WildreceiptTextDetAnnParser'),
packer=dict(type='TextRecogCropPacker'),
dumper=dict(type='JsonDumper')))
_base_.test_preparer.update(
dict(
parser=dict(type='WildreceiptTextDetAnnParser'),
packer=dict(type='TextRecogCropPacker'),
dumper=dict(type='JsonDumper')))
config_generator = dict(type='TextRecogConfigGenerator')


@@ -1,5 +1,6 @@
_base_ = ['textdet.py']
data_converter = dict(type='TextSpottingDataConverter')
_base_.train_preparer.packer.type = 'TextSpottingPacker'
_base_.test_preparer.packer.type = 'TextSpottingPacker'
config_generator = dict(type='TextSpottingConfigGenerator')


@@ -1,17 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .config_generator import (TextDetConfigGenerator,
TextRecogConfigGenerator,
TextSpottingConfigGenerator)
from .data_converter import (TextDetDataConverter, TextRecogDataConverter,
TextSpottingDataConverter, WildReceiptConverter)
from .data_obtainer import NaiveDataObtainer
from .config_generators import * # noqa
from .data_preparer import DatasetPreparer
from .dumpers import * # noqa
from .gatherers import * # noqa
from .obtainers import * # noqa
from .packers import * # noqa
from .parsers import * # noqa
__all__ = [
'DatasetPreparer', 'NaiveDataObtainer', 'TextDetDataConverter',
'TextRecogDataConverter', 'TextSpottingDataConverter',
'WildReceiptConverter', 'TextDetConfigGenerator',
'TextRecogConfigGenerator', 'TextSpottingConfigGenerator'
]
__all__ = ['DatasetPreparer']
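
With the converter monolith removed, components are exposed via wildcard imports and discovered through mmengine registries; CFG_GENERATORS appears in the generator module below, while the names of the other registries are an assumption here. A self-contained sketch of the registry pattern:

# Hypothetical sketch of registry-based component construction; the
# GATHERERS registry name is assumed, and this toy PairGatherer stands in
# for the real one.
from mmengine.registry import Registry

GATHERERS = Registry('gatherer')

@GATHERERS.register_module()
class PairGatherer:
    def __init__(self, img_suffixes=None, rule=None):
        self.img_suffixes = img_suffixes
        self.rule = rule

cfg = dict(type='PairGatherer', img_suffixes=['.png'],
           rule=[r'(\w+)\.png', r'\1.json'])
gatherer = GATHERERS.build(cfg)  # build() pops 'type' and instantiates it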


@@ -1,374 +0,0 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from abc import abstractmethod
from typing import Dict, List, Optional
from mmengine import mkdir_or_exist
from .data_preparer import CFG_GENERATORS
class BaseDatasetConfigGenerator:
"""Base class for dataset config generator.
Args:
data_root (str): The root path of the dataset.
task (str): The task of the dataset.
dataset_name (str): The name of the dataset.
overwrite_cfg (bool): Whether to overwrite the dataset config file if
it already exists. If False, config generator will not generate new
config for datasets whose configs are already in base.
train_anns (List[Dict], optional): A list of train annotation files
to appear in the base configs. Defaults to None.
Each element is typically a dict with the following fields:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to
None.
val_anns (List[Dict], optional): A list of val annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to None.
test_anns (List[Dict], optional): A list of test annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to None.
config_path (str): Path to the configs. Defaults to 'configs/'.
"""
def __init__(
self,
data_root: str,
task: str,
dataset_name: str,
overwrite_cfg: bool = False,
train_anns: Optional[List[Dict]] = None,
val_anns: Optional[List[Dict]] = None,
test_anns: Optional[List[Dict]] = None,
config_path: str = 'configs/',
) -> None:
self.config_path = config_path
self.data_root = data_root
self.task = task
self.dataset_name = dataset_name
self.overwrite_cfg = overwrite_cfg
self._prepare_anns(train_anns, val_anns, test_anns)
def _prepare_anns(self, train_anns: Optional[List[Dict]],
val_anns: Optional[List[Dict]],
test_anns: Optional[List[Dict]]) -> None:
"""Preprocess input arguments and stores these information into
``self.anns``.
``self.anns`` is a dict that maps the name of a dataset config variable
to a dict, which contains the following fields:
- ann_file (str): The path to the annotation file relative to
data_root.
- split (str): The split the annotation belongs to. Usually
it can be 'train', 'val' and 'test'.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to
None.
"""
self.anns = {}
for split, ann_list in zip(('train', 'val', 'test'),
(train_anns, val_anns, test_anns)):
if ann_list is None:
continue
if not isinstance(ann_list, list):
raise ValueError(f'{split}_anns must be either a list or'
' None!')
for ann_dict in ann_list:
assert 'ann_file' in ann_dict
if ann_dict.get('dataset_postfix', ''):
key = f'{self.dataset_name}_{ann_dict["dataset_postfix"]}_{self.task}_{split}' # noqa
else:
key = f'{self.dataset_name}_{self.task}_{split}'
ann_dict['split'] = split
if key in self.anns:
raise ValueError(
f'Duplicate dataset variable {key} found! '
'Please use different dataset_postfix to avoid '
'conflict.')
self.anns[key] = ann_dict
def __call__(self) -> None:
"""Generates the base dataset config."""
dataset_config = self._gen_dataset_config()
cfg_path = osp.join(self.config_path, self.task, '_base_', 'datasets',
f'{self.dataset_name}.py')
if osp.exists(cfg_path) and not self.overwrite_cfg:
print(f'{cfg_path} found, skipping.')
return
mkdir_or_exist(osp.dirname(cfg_path))
with open(cfg_path, 'w') as f:
f.write(
f'{self.dataset_name}_{self.task}_data_root = \'{self.data_root}\'\n' # noqa: E501
)
f.write(dataset_config)
@abstractmethod
def _gen_dataset_config(self) -> str:
"""Generate a full dataset config based on the annotation file
dictionary.
Returns:
str: The generated dataset config.
"""
@CFG_GENERATORS.register_module()
class TextDetConfigGenerator(BaseDatasetConfigGenerator):
"""Text detection config generator.
Args:
data_root (str): The root path of the dataset.
dataset_name (str): The name of the dataset.
overwrite_cfg (bool): Whether to overwrite the dataset config file if
it already exists. If False, config generator will not generate new
config for datasets whose configs are already in base.
train_anns (List[Dict], optional): A list of train annotation files
to appear in the base configs. Defaults to
``[dict(ann_file='textdet_train.json', dataset_postfix='')]``.
Each element is typically a dict with the following fields:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to
None.
val_anns (List[Dict], optional): A list of val annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to [].
test_anns (List[Dict], optional): A list of test annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to ``[dict(ann_file='textdet_test.json')]``.
config_path (str): Path to the configs. Defaults to 'configs/'.
"""
def __init__(
self,
data_root: str,
dataset_name: str,
overwrite_cfg: bool = False,
train_anns: Optional[List[Dict]] = [
dict(ann_file='textdet_train.json', dataset_postfix='')
],
val_anns: Optional[List[Dict]] = [],
test_anns: Optional[List[Dict]] = [
dict(ann_file='textdet_test.json', dataset_postfix='')
],
config_path: str = 'configs/',
) -> None:
super().__init__(
data_root=data_root,
task='textdet',
overwrite_cfg=overwrite_cfg,
dataset_name=dataset_name,
train_anns=train_anns,
val_anns=val_anns,
test_anns=test_anns,
config_path=config_path,
)
def _gen_dataset_config(self) -> str:
"""Generate a full dataset config based on the annotation file
dictionary.
The config is generated from ``self.anns``, a nested dictionary that
maps a config variable name (such as icdar2015_textdet_train) to its
corresponding annotation information dict. Each dict contains the
following keys:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults
to None.
- split (str): The split the annotation belongs to, usually one of
'train', 'val' and 'test'.
Returns:
str: The generated dataset config.
"""
cfg = ''
for key_name, ann_dict in self.anns.items():
cfg += f'\n{key_name} = dict(\n'
cfg += ' type=\'OCRDataset\',\n'
cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n'
if ann_dict['split'] == 'train':
cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' # noqa: E501
elif ann_dict['split'] in ['test', 'val']:
cfg += ' test_mode=True,\n'
cfg += ' pipeline=None)\n'
return cfg
@CFG_GENERATORS.register_module()
class TextRecogConfigGenerator(BaseDatasetConfigGenerator):
"""Text recognition config generator.
Args:
data_root (str): The root path of the dataset.
dataset_name (str): The name of the dataset.
overwrite_cfg (bool): Whether to overwrite the dataset config file if
it already exists. If False, config generator will not generate new
config for datasets whose configs are already in base.
train_anns (List[Dict], optional): A list of train annotation files
to appear in the base configs. Defaults to
``[dict(ann_file='textrecog_train.json', dataset_postfix='')]``.
Each element is typically a dict with the following fields:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to
None.
val_anns (List[Dict], optional): A list of val annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to [].
test_anns (List[Dict], optional): A list of test annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to ``[dict(ann_file='textrecog_test.json')]``.
config_path (str): Path to the configs. Defaults to 'configs/'.
Example:
It generates a dataset config like:
>>> icdar2015_textrecog_data_root = 'data/icdar2015/'
>>> icdar2015_textrecog_train = dict(
>>> type='OCRDataset',
>>> data_root=icdar2015_textrecog_data_root,
>>> ann_file='textrecog_train.json',
>>> test_mode=False,
>>> pipeline=None)
>>> icdar2015_textrecog_test = dict(
>>> type='OCRDataset',
>>> data_root=icdar2015_textrecog_data_root,
>>> ann_file='textrecog_test.json',
>>> test_mode=True,
>>> pipeline=None)
"""
def __init__(
self,
data_root: str,
dataset_name: str,
overwrite_cfg: bool = False,
train_anns: Optional[List[Dict]] = [
dict(ann_file='textrecog_train.json', dataset_postfix='')
],
val_anns: Optional[List[Dict]] = [],
test_anns: Optional[List[Dict]] = [
dict(ann_file='textrecog_test.json', dataset_postfix='')
],
config_path: str = 'configs/',
) -> None:
super().__init__(
data_root=data_root,
task='textrecog',
overwrite_cfg=overwrite_cfg,
dataset_name=dataset_name,
train_anns=train_anns,
val_anns=val_anns,
test_anns=test_anns,
config_path=config_path)
def _gen_dataset_config(self) -> str:
"""Generate a full dataset config based on the annotation file
dictionary.
The config is generated from ``self.anns``, a nested dictionary that
maps a config variable name (such as icdar2015_textrecog_train) to its
corresponding annotation information dict. Each dict contains the
following keys:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults
to None.
- split (str): The split the annotation belongs to, usually one of
'train', 'val' and 'test'.
Returns:
str: The generated dataset config.
"""
cfg = ''
for key_name, ann_dict in self.anns.items():
cfg += f'\n{key_name} = dict(\n'
cfg += ' type=\'OCRDataset\',\n'
cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n'
if ann_dict['split'] in ['test', 'val']:
cfg += ' test_mode=True,\n'
cfg += ' pipeline=None)\n'
return cfg
@CFG_GENERATORS.register_module()
class TextSpottingConfigGenerator(TextDetConfigGenerator):
"""Text spotting config generator.
Args:
data_root (str): The root path of the dataset.
dataset_name (str): The name of the dataset.
overwrite_cfg (bool): Whether to overwrite the dataset config file if
it already exists. If False, config generator will not generate new
config for datasets whose configs are already in base.
train_anns (List[Dict], optional): A list of train annotation files
to appear in the base configs. Defaults to
``[dict(ann_file='textspotting_train.json', dataset_postfix='')]``.
Each element is typically a dict with the following fields:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to
None.
val_anns (List[Dict], optional): A list of val annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to [].
test_anns (List[Dict], optional): A list of test annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to ``[dict(ann_file='textspotting_test.json')]``.
config_path (str): Path to the configs. Defaults to 'configs/'.
"""
def __init__(
self,
data_root: str,
dataset_name: str,
overwrite_cfg: bool = False,
train_anns: Optional[List[Dict]] = [
dict(ann_file='textspotting_train.json', dataset_postfix='')
],
val_anns: Optional[List[Dict]] = [],
test_anns: Optional[List[Dict]] = [
dict(ann_file='textspotting_test.json', dataset_postfix='')
],
config_path: str = 'configs/',
) -> None:
BaseDatasetConfigGenerator.__init__(
self,
data_root=data_root,
task='textspotting',
overwrite_cfg=overwrite_cfg,
dataset_name=dataset_name,
train_anns=train_anns,
val_anns=val_anns,
test_anns=test_anns,
config_path=config_path,
)
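To make the naming rule above concrete, here is a minimal sketch of how ``dataset_postfix`` shapes the generated variable names. The import path and dataset/file names are assumptions for illustration, not part of this diff.

# A minimal sketch, assuming the refactored config_generators package is
# importable from mmocr.datasets.preparers (hypothetical usage).
from mmocr.datasets.preparers.config_generators import TextDetConfigGenerator

gen = TextDetConfigGenerator(
    data_root='data/icdar2015',
    dataset_name='icdar2015',
    train_anns=[
        dict(ann_file='textdet_train.json', dataset_postfix=''),
        dict(ann_file='textdet_train_857.json', dataset_postfix='857'),
    ])
# _prepare_anns keys: an empty postfix yields {dataset_name}_{task}_{split};
# a non-empty one yields {dataset_name}_{postfix}_{task}_{split}. The default
# test_anns contributes a third key.
print(sorted(gen.anns))
# ['icdar2015_857_textdet_train', 'icdar2015_textdet_test',
#  'icdar2015_textdet_train']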

View File

@ -0,0 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .base import BaseDatasetConfigGenerator
from .textdet_config_generator import TextDetConfigGenerator
from .textrecog_config_generator import TextRecogConfigGenerator
from .textspotting_config_generator import TextSpottingConfigGenerator
__all__ = [
'BaseDatasetConfigGenerator', 'TextDetConfigGenerator',
'TextRecogConfigGenerator', 'TextSpottingConfigGenerator'
]

View File

@ -0,0 +1,120 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from abc import abstractmethod
from typing import Dict, List, Optional
from mmengine import mkdir_or_exist
class BaseDatasetConfigGenerator:
"""Base class for dataset config generator.
Args:
data_root (str): The root path of the dataset.
task (str): The task of the dataset.
dataset_name (str): The name of the dataset.
overwrite_cfg (bool): Whether to overwrite the dataset config file if
it already exists. If False, config generator will not generate new
config for datasets whose configs are already in base.
train_anns (List[Dict], optional): A list of train annotation files
to appear in the base configs. Defaults to None.
Each element is typically a dict with the following fields:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to
None.
val_anns (List[Dict], optional): A list of val annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to None.
test_anns (List[Dict], optional): A list of test annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to None.
config_path (str): Path to the configs. Defaults to 'configs/'.
"""
def __init__(
self,
data_root: str,
task: str,
dataset_name: str,
overwrite_cfg: bool = False,
train_anns: Optional[List[Dict]] = None,
val_anns: Optional[List[Dict]] = None,
test_anns: Optional[List[Dict]] = None,
config_path: str = 'configs/',
) -> None:
self.config_path = config_path
self.data_root = data_root
self.task = task
self.dataset_name = dataset_name
self.overwrite_cfg = overwrite_cfg
self._prepare_anns(train_anns, val_anns, test_anns)
def _prepare_anns(self, train_anns: Optional[List[Dict]],
val_anns: Optional[List[Dict]],
test_anns: Optional[List[Dict]]) -> None:
"""Preprocess input arguments and stores these information into
``self.anns``.
``self.anns`` is a dict that maps the name of a dataset config variable
to a dict, which contains the following fields:
- ann_file (str): The path to the annotation file relative to
data_root.
- split (str): The split the annotation belongs to, usually one of
'train', 'val' and 'test'.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to
None.
"""
self.anns = {}
for split, ann_list in zip(('train', 'val', 'test'),
(train_anns, val_anns, test_anns)):
if ann_list is None:
continue
if not isinstance(ann_list, list):
raise ValueError(f'{split}_anns must be either a list or'
' None!')
for ann_dict in ann_list:
assert 'ann_file' in ann_dict
if ann_dict.get('dataset_postfix', ''):
key = f'{self.dataset_name}_{ann_dict["dataset_postfix"]}_{self.task}_{split}' # noqa
else:
key = f'{self.dataset_name}_{self.task}_{split}'
ann_dict['split'] = split
if key in self.anns:
raise ValueError(
f'Duplicate dataset variable {key} found! '
'Please use different dataset_postfix to avoid '
'conflict.')
self.anns[key] = ann_dict
def __call__(self) -> None:
"""Generates the base dataset config."""
dataset_config = self._gen_dataset_config()
cfg_path = osp.join(self.config_path, self.task, '_base_', 'datasets',
f'{self.dataset_name}.py')
if osp.exists(cfg_path) and not self.overwrite_cfg:
print(f'{cfg_path} found, skipping.')
return
mkdir_or_exist(osp.dirname(cfg_path))
with open(cfg_path, 'w') as f:
f.write(
f'{self.dataset_name}_{self.task}_data_root = \'{self.data_root}\'\n' # noqa: E501
)
f.write(dataset_config)
@abstractmethod
def _gen_dataset_config(self) -> str:
"""Generate a full dataset config based on the annotation file
dictionary.
Returns:
str: The generated dataset config.
"""

View File

@ -0,0 +1,96 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional
from ..data_preparer import CFG_GENERATORS
from .base import BaseDatasetConfigGenerator
@CFG_GENERATORS.register_module()
class TextDetConfigGenerator(BaseDatasetConfigGenerator):
"""Text detection config generator.
Args:
data_root (str): The root path of the dataset.
dataset_name (str): The name of the dataset.
overwrite_cfg (bool): Whether to overwrite the dataset config file if
it already exists. If False, config generator will not generate new
config for datasets whose configs are already in base.
train_anns (List[Dict], optional): A list of train annotation files
to appear in the base configs. Defaults to
``[dict(ann_file='textdet_train.json', dataset_postfix='')]``.
Each element is typically a dict with the following fields:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to
None.
val_anns (List[Dict], optional): A list of val annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to [].
test_anns (List[Dict], optional): A list of test annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to ``[dict(ann_file='textdet_test.json')]``.
config_path (str): Path to the configs. Defaults to 'configs/'.
"""
def __init__(
self,
data_root: str,
dataset_name: str,
overwrite_cfg: bool = False,
train_anns: Optional[List[Dict]] = [
dict(ann_file='textdet_train.json', dataset_postfix='')
],
val_anns: Optional[List[Dict]] = [],
test_anns: Optional[List[Dict]] = [
dict(ann_file='textdet_test.json', dataset_postfix='')
],
config_path: str = 'configs/',
) -> None:
super().__init__(
data_root=data_root,
task='textdet',
overwrite_cfg=overwrite_cfg,
dataset_name=dataset_name,
train_anns=train_anns,
val_anns=val_anns,
test_anns=test_anns,
config_path=config_path,
)
def _gen_dataset_config(self) -> str:
"""Generate a full dataset config based on the annotation file
dictionary.
The config is generated from ``self.anns``, a nested dictionary that
maps a config variable name (such as icdar2015_textdet_train) to its
corresponding annotation information dict. Each dict contains the
following keys:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults
to None.
- split (str): The split the annotation belongs to, usually one of
'train', 'val' and 'test'.
Returns:
str: The generated dataset config.
"""
cfg = ''
for key_name, ann_dict in self.anns.items():
cfg += f'\n{key_name} = dict(\n'
cfg += ' type=\'OCRDataset\',\n'
cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n'
if ann_dict['split'] == 'train':
cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' # noqa: E501
elif ann_dict['split'] in ['test', 'val']:
cfg += ' test_mode=True,\n'
cfg += ' pipeline=None)\n'
return cfg
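Unlike the recognition generator below, this class carries no docstring example; under the defaults, a run for a hypothetical 'icdar2015' dataset would emit a config along these lines:

icdar2015_textdet_data_root = 'data/icdar2015'

icdar2015_textdet_train = dict(
    type='OCRDataset',
    data_root=icdar2015_textdet_data_root,
    ann_file='textdet_train.json',
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=None)

icdar2015_textdet_test = dict(
    type='OCRDataset',
    data_root=icdar2015_textdet_data_root,
    ann_file='textdet_test.json',
    test_mode=True,
    pipeline=None)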

View File

@ -0,0 +1,109 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional
from ..data_preparer import CFG_GENERATORS
from .base import BaseDatasetConfigGenerator
@CFG_GENERATORS.register_module()
class TextRecogConfigGenerator(BaseDatasetConfigGenerator):
"""Text recognition config generator.
Args:
data_root (str): The root path of the dataset.
dataset_name (str): The name of the dataset.
overwrite_cfg (bool): Whether to overwrite the dataset config file if
it already exists. If False, config generator will not generate new
config for datasets whose configs are already in base.
train_anns (List[Dict], optional): A list of train annotation files
to appear in the base configs. Defaults to
``[dict(ann_file='textrecog_train.json', dataset_postfix='')]``.
Each element is typically a dict with the following fields:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to
None.
val_anns (List[Dict], optional): A list of val annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to [].
test_anns (List[Dict], optional): A list of test annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to ``[dict(ann_file='textrecog_test.json')]``.
config_path (str): Path to the configs. Defaults to 'configs/'.
Example:
It generates a dataset config like:
>>> icdar2015_textrecog_data_root = 'data/icdar2015/'
>>> icdar2015_textrecog_train = dict(
>>> type='OCRDataset',
>>> data_root=icdar2015_textrecog_data_root,
>>> ann_file='textrecog_train.json',
>>> test_mode=False,
>>> pipeline=None)
>>> icdar2015_textrecog_test = dict(
>>> type='OCRDataset',
>>> data_root=icdar2015_textrecog_data_root,
>>> ann_file='textrecog_test.json',
>>> test_mode=True,
>>> pipeline=None)
"""
def __init__(
self,
data_root: str,
dataset_name: str,
overwrite_cfg: bool = False,
train_anns: Optional[List[Dict]] = [
dict(ann_file='textrecog_train.json', dataset_postfix='')
],
val_anns: Optional[List[Dict]] = [],
test_anns: Optional[List[Dict]] = [
dict(ann_file='textrecog_test.json', dataset_postfix='')
],
config_path: str = 'configs/',
) -> None:
super().__init__(
data_root=data_root,
task='textrecog',
overwrite_cfg=overwrite_cfg,
dataset_name=dataset_name,
train_anns=train_anns,
val_anns=val_anns,
test_anns=test_anns,
config_path=config_path)
def _gen_dataset_config(self) -> str:
"""Generate a full dataset config based on the annotation file
dictionary.
The config is generated from ``self.anns``, a nested dictionary that
maps a config variable name (such as icdar2015_textrecog_train) to its
corresponding annotation information dict. Each dict contains the
following keys:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults
to None.
- split (str): The split the annotation belongs to, usually one of
'train', 'val' and 'test'.
Returns:
str: The generated dataset config.
"""
cfg = ''
for key_name, ann_dict in self.anns.items():
cfg += f'\n{key_name} = dict(\n'
cfg += ' type=\'OCRDataset\',\n'
cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n'
if ann_dict['split'] in ['test', 'val']:
cfg += ' test_mode=True,\n'
cfg += ' pipeline=None)\n'
return cfg

View File

@ -0,0 +1,63 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional
from ..data_preparer import CFG_GENERATORS
from .base import BaseDatasetConfigGenerator
from .textdet_config_generator import TextDetConfigGenerator
@CFG_GENERATORS.register_module()
class TextSpottingConfigGenerator(TextDetConfigGenerator):
"""Text spotting config generator.
Args:
data_root (str): The root path of the dataset.
dataset_name (str): The name of the dataset.
overwrite_cfg (bool): Whether to overwrite the dataset config file if
it already exists. If False, config generator will not generate new
config for datasets whose configs are already in base.
train_anns (List[Dict], optional): A list of train annotation files
to appear in the base configs. Defaults to
``[dict(ann_file='textspotting_train.json', dataset_postfix='')]``.
Each element is typically a dict with the following fields:
- ann_file (str): The path to the annotation file relative to
data_root.
- dataset_postfix (str, optional): Affects the postfix of the
resulting variable in the generated config. If specified, the
dataset variable will be named in the form of
``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to
None.
val_anns (List[Dict], optional): A list of val annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to [].
test_anns (List[Dict], optional): A list of test annotation files
to appear in the base configs, similar to ``train_anns``. Defaults
to ``[dict(ann_file='textspotting_test.json')]``.
config_path (str): Path to the configs. Defaults to 'configs/'.
"""
def __init__(
self,
data_root: str,
dataset_name: str,
overwrite_cfg: bool = False,
train_anns: Optional[List[Dict]] = [
dict(ann_file='textspotting_train.json', dataset_postfix='')
],
val_anns: Optional[List[Dict]] = [],
test_anns: Optional[List[Dict]] = [
dict(ann_file='textspotting_test.json', dataset_postfix='')
],
config_path: str = 'configs/',
) -> None:
BaseDatasetConfigGenerator.__init__(
self,
data_root=data_root,
task='textspotting',
overwrite_cfg=overwrite_cfg,
dataset_name=dataset_name,
train_anns=train_anns,
val_anns=val_anns,
test_anns=test_anns,
config_path=config_path,
)
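As elsewhere in the preparer, this generator is usually built from its registry entry; a minimal direct invocation might look like the following sketch. The dataset name and paths are placeholders.

from mmocr.datasets.preparers.data_preparer import \
    CFG_GENERATORS  # registry defined in data_preparer.py

cfg = dict(
    type='TextSpottingConfigGenerator',
    data_root='data/totaltext',
    dataset_name='totaltext')
generator = CFG_GENERATORS.build(cfg)
# Writes configs/textspotting/_base_/datasets/totaltext.py unless it
# already exists and overwrite_cfg is False.
generator()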

View File

@ -1,752 +0,0 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import os.path as osp
import re
import shutil
from abc import abstractmethod
from functools import partial
from typing import Dict, List, Optional, Sequence, Tuple
import mmcv
from mmengine import mkdir_or_exist, track_parallel_progress
from mmocr.utils import bbox2poly, crop_img, list_files, poly2bbox, warp_img
from .data_preparer import DATA_CONVERTERS, DATA_DUMPERS, DATA_PARSERS
class BaseDataConverter:
"""Base class for data processor.
Args:
splits (List): A list of splits to be processed.
data_root (str): Path to the data root.
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset files.
dumper (Dict): Config dict for dumping the dataset files.
nproc (int): Number of processes to process the data.
task (str): Task of the dataset.
dataset_name (str): Dataset name.
delete (Optional[List]): A list of files to be deleted after
conversion.
"""
def __init__(self,
splits: List,
data_root: str,
gatherer: Dict,
parser: Dict,
dumper: Dict,
nproc: int,
task: str,
dataset_name: str,
delete: Optional[List] = None,
config_path: str = 'configs/'):
assert isinstance(nproc, int) and nproc > 0, \
'nproc must be a positive integer.'
self.splits = splits
self.data_root = data_root
self.nproc = nproc
self.task = task
self.dataset_name = dataset_name
self.delete = delete
self.config_path = config_path
self.img_dir = f'{task}_imgs'
parser.update(dict(nproc=nproc))
dumper.update(dict(task=task))
self.parser = DATA_PARSERS.build(parser)
self.dumper = DATA_DUMPERS.build(dumper)
gather_type = gatherer.pop('type')
self.gatherer_args = gatherer
if gather_type == 'pair_gather':
self.gatherer = self.pair_gather
elif gather_type == 'mono_gather':
self.gatherer = self.mono_gather
elif gather_type == 'naf_gather':
self.gatherer = self.naf_gather
else:
raise NotImplementedError
def __call__(self):
"""Process the data.
Returns:
Dict: A dict that maps each split to the path of the annotation
files.
"""
# Convert and dump annotations to MMOCR format
for self.current_split in self.splits:
print(f'Parsing {self.current_split} split...')
# Gather the info such as file names required by parser
img_path = osp.join(self.data_root, self.img_dir,
self.current_split)
ann_path = osp.join(self.data_root, 'annotations')
gatherer_args = dict(img_path=img_path, ann_path=ann_path)
gatherer_args.update(self.gatherer_args)
files = self.gatherer(**gatherer_args)
# Convert dataset annotations to MMOCR format
samples = self.parser.parse_files(files, self.current_split)
print(f'Packing {self.current_split} annotations...')
func = partial(self.pack_instance, split=self.current_split)
samples = track_parallel_progress(func, samples, nproc=self.nproc)
samples = self.add_meta(samples)
# Dump annotation files
self.dumper.dump(samples, self.data_root, self.current_split)
self.clean()
@abstractmethod
def pack_instance(self, sample: Tuple, split: str) -> Dict:
"""Pack the parsed annotation info to an MMOCR format instance.
Args:
sample (Tuple): A tuple of (img_path, instances).
- img_path (str): Path to image file.
- instances (Sequence[Dict]): A list of converted annos.
split (str): The split of the instance.
Returns:
Dict: An MMOCR format instance.
"""
@abstractmethod
def add_meta(self, sample: List) -> Dict:
"""Add meta information to the sample.
Args:
sample (List): A list of samples of the dataset.
Returns:
Dict: A dict contains the meta information and samples.
"""
def mono_gather(self,
ann_path: str,
train_ann: Optional[str] = None,
val_ann: Optional[str] = None,
test_ann: Optional[str] = None,
**kwargs) -> str:
"""Gather the dataset file. Specifically for the case that only one
annotation file is needed. For example,
img_001.jpg \
img_002.jpg ---> train.json
img_003.jpg /
Args:
ann_path (str): Path to the annotations.
train_ann (str, optional): The annotation file name of the train
split in the original dataset. Defaults to None.
val_ann (str, optional): The annotation file name of the val split
in the original dataset. Defaults to None.
test_ann (str, optional): The annotation file name of the test
split in the original dataset. Defaults to None.
Returns:
str: Path to the annotation file.
"""
ann_file = eval(f'{self.current_split}_ann')
if ann_file is None:
raise ValueError(
f'{self.current_split}_ann must be specified in gatherer!')
return osp.join(ann_path, ann_file)
def pair_gather(self, img_path: str, suffixes: List, rule: Sequence,
**kwargs) -> List[Tuple]:
"""Gather the dataset files. Specifically for the paired annotations.
That is to say, each image has a corresponding annotation file. For
example,
img_1.jpg <---> gt_img_1.txt
img_2.jpg <---> gt_img_2.txt
img_3.jpg <---> gt_img_3.txt
Args:
img_path (str): Path to the images.
suffixes (List[str]): File suffixes used for searching.
rule (Sequence): The rule for pairing the files. The
first element is the matching pattern for the file, and the
second element is the replacement pattern, which should
be a regular expression. For example, to map the image
name img_1.jpg to the annotation name gt_img_1.txt,
the rule is
[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt'] # noqa: W605 E501
Returns:
List[Tuple]: A list of tuples (img_path, ann_path).
"""
files = list()
for file in list_files(img_path, suffixes):
if not re.match(rule[0], osp.basename(file)):
continue
file2 = re.sub(rule[0], rule[1], osp.basename(file))
file2 = file.replace(osp.basename(file), file2)
file2 = file2.replace(self.img_dir, 'annotations')
files.append((file, file2))
return files
def naf_gather(self, img_path: str, ann_path: str,
**kwargs) -> List[Tuple]:
"""Gather the dataset file from NAF dataset. Specifically for the case
that there is a split file that contains the names of different splits.
For example,
img_001.jpg train: img_001.jpg
img_002.jpg ---> data_split.json ---> test: img_002.jpg
img_003.jpg val: img_003.jpg
Args:
img_path (str): Path to the images.
ann_path (str): Path to the annotations.
Returns:
List[Tuple]: A list of tuples (img_path, ann_path).
"""
split_file = osp.join(self.data_root, 'data_split.json')
with open(split_file, 'r') as f:
split_data = json.load(f)
files = []
# Rename the key
split_data['val'] = split_data.pop('valid')
if not osp.exists(img_path):
os.makedirs(img_path)
for groups in split_data[self.current_split]:
for img_name in split_data[self.current_split][groups]:
src_img = osp.join(self.data_root, 'temp_images', img_name)
dst_img = osp.join(img_path, img_name)
if not osp.exists(src_img):
Warning(f'{src_img} does not exist!')
continue
# move the image to the new path
shutil.move(src_img, dst_img)
ann = osp.join(ann_path, img_name.replace('.jpg', '.json'))
files.append((dst_img, ann))
return files
def clean(self) -> None:
for d in self.delete:
delete_file = osp.join(self.data_root, d)
if osp.exists(delete_file):
if osp.isdir(delete_file):
shutil.rmtree(delete_file)
else:
os.remove(delete_file)
@DATA_CONVERTERS.register_module()
class TextDetDataConverter(BaseDataConverter):
"""Text detection data converter.
Args:
splits (List): A list of splits to be processed.
data_root (str): Path to the data root.
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset files.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
"""
def __init__(self,
splits: List,
data_root: str,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: List = ['annotations']) -> None:
super().__init__(
splits=splits,
data_root=data_root,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
delete=delete,
task='textdet')
def pack_instance(self,
sample: Tuple,
split: str,
bbox_label: int = 0) -> Dict:
"""Pack the parsed annotation info to an MMOCR format instance.
Args:
sample (Tuple): A tuple of (img_path, instances).
- img_path (str): Path to the image file.
- instances (Sequence[Dict]): A list of converted annos. Each
element should be a dict with the following keys:
- 'poly' or 'box'
- 'ignore'
- 'bbox_label' (optional)
split (str): The split of the instance.
Returns:
Dict: An MMOCR format instance.
"""
img_path, instances = sample
img = mmcv.imread(img_path)
h, w = img.shape[:2]
packed_instances = list()
for instance in instances:
poly = instance.get('poly', None)
box = instance.get('box', None)
assert box or poly
packed_sample = dict(
polygon=poly if poly else list(
bbox2poly(box).astype('float64')),
bbox=box if box else list(poly2bbox(poly).astype('float64')),
bbox_label=bbox_label,
ignore=instance['ignore'])
packed_instances.append(packed_sample)
packed_instances = dict(
instances=packed_instances,
img_path=img_path.replace(self.data_root + '/', ''),
height=h,
width=w)
return packed_instances
def add_meta(self, sample: List) -> Dict:
meta = {
'metainfo': {
'dataset_type': 'TextDetDataset',
'task_name': 'textdet',
'category': [{
'id': 0,
'name': 'text'
}]
},
'data_list': sample
}
return meta
@DATA_CONVERTERS.register_module()
class TextSpottingDataConverter(BaseDataConverter):
"""Text spotting data converter.
Args:
splits (List): A list of splits to be processed.
data_root (str): Path to the data root.
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset files.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
"""
def __init__(self,
splits: List,
data_root: str,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: List = ['annotations']) -> None:
super().__init__(
splits=splits,
data_root=data_root,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
delete=delete,
task='textspotting')
# Textspotting task shares the same images with textdet task
self.img_dir = 'textdet_imgs'
def pack_instance(self,
sample: Tuple,
split: str,
bbox_label: int = 0) -> Dict:
"""Pack the parsed annotation info to an MMOCR format instance.
Args:
sample (Tuple): A tuple of (img_path, instances).
- img_path (str): Path to image file.
- instances (Sequence[Dict]): A list of converted annos. Each
element should be a dict with the following keys:
- 'poly' or 'box'
- 'text'
- 'ignore'
- 'bbox_label' (optional)
split (str): The split of the instance.
Returns:
Dict: An MMOCR format instance.
"""
img_path, instances = sample
img = mmcv.imread(img_path)
h, w = img.shape[:2]
packed_instances = list()
for instance in instances:
assert 'text' in instance, 'Text is not found in the instance.'
poly = instance.get('poly', None)
box = instance.get('box', None)
assert box or poly
packed_sample = dict(
polygon=poly if poly else list(
bbox2poly(box).astype('float64')),
bbox=box if box else list(poly2bbox(poly).astype('float64')),
bbox_label=bbox_label,
ignore=instance['ignore'],
text=instance['text'])
packed_instances.append(packed_sample)
packed_instances = dict(
instances=packed_instances,
img_path=img_path.replace(self.data_root + '/', ''),
height=h,
width=w)
return packed_instances
def add_meta(self, sample: List) -> Dict:
meta = {
'metainfo': {
'dataset_type': 'TextSpottingDataset',
'task_name': 'textspotting',
'category': [{
'id': 0,
'name': 'text'
}]
},
'data_list': sample
}
return meta
@DATA_CONVERTERS.register_module()
class TextRecogDataConverter(BaseDataConverter):
"""Text recognition data converter.
Args:
splits (List): A list of splits to be processed.
data_root (str): Path to the data root.
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset annotations.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
"""
def __init__(self,
splits: List,
data_root: str,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: List = ['annotations']):
super().__init__(
splits=splits,
data_root=data_root,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
task='textrecog',
delete=delete)
def pack_instance(self, sample: Tuple, split: str) -> Dict:
"""Pack the text info to a recognition instance.
Args:
sample (Tuple): A tuple of (img_name, text).
split (str): The split of the instance.
Returns:
Dict: The packed instance.
"""
img_name, text = sample
packed_instance = dict(
instances=[dict(text=text)],
img_path=osp.join(self.img_dir, split, osp.basename(img_name)))
return packed_instance
def add_meta(self, sample: List) -> Dict:
meta = {
'metainfo': {
'dataset_type': 'TextRecogDataset',
'task_name': 'textrecog'
},
'data_list': sample
}
return meta
@DATA_CONVERTERS.register_module()
class TextRecogCropConverter(TextRecogDataConverter):
"""Text recognition crop converter. This converter will crop the text from
the original image. The parser used for this Converter should be a TextDet
parser.
Args:
splits (List): A list of splits to be processed.
data_root (str): Path to the data root.
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset annotations.
dumper (Dict): Config dict for dumping the dataset files.
dataset_name (str): Name of the dataset.
nproc (int): Number of processes to process the data.
crop_with_warp (bool): Whether to crop the text from the original image
using opencv warpPerspective.
jitter (bool): (Applicable when crop_with_warp=True)
Whether to jitter the box.
jitter_ratio_x (float): (Applicable when crop_with_warp=True)
Horizontal jitter ratio relative to the height.
jitter_ratio_y (float): (Applicable when crop_with_warp=True)
Vertical jitter ratio relative to the height.
long_edge_pad_ratio (float): (Applicable when crop_with_warp=False)
The ratio of padding the long edge of the cropped image.
Defaults to 0.0.
short_edge_pad_ratio (float): (Applicable when crop_with_warp=False)
The ratio of padding the short edge of the cropped image.
Defaults to 0.0.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
"""
def __init__(self,
splits: List,
data_root: str,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
crop_with_warp: bool = False,
jitter: bool = False,
jitter_ratio_x: float = 0.0,
jitter_ratio_y: float = 0.0,
long_edge_pad_ratio: float = 0.0,
short_edge_pad_ratio: float = 0.0,
delete: List = ['annotations']):
super().__init__(
splits=splits,
data_root=data_root,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
delete=delete)
self.crop_with_warp = crop_with_warp
self.jitter = jitter
self.jrx = jitter_ratio_x
self.jry = jitter_ratio_y
self.lepr = long_edge_pad_ratio
self.sepr = short_edge_pad_ratio
# Crop converter crops the images of textdet to patches
self.img_dir = 'textdet_imgs'
self.cropped_img_dir = 'textrecog_imgs'
self.crop_save_path = osp.join(self.data_root, self.cropped_img_dir)
mkdir_or_exist(self.crop_save_path)
for split in splits:
mkdir_or_exist(osp.join(self.crop_save_path, split))
def pack_instance(self, sample: Tuple, split: str) -> List:
"""Crop patches from image.
Args:
sample (Tuple): A tuple of (img_path, instances).
split (str): The split of the instance.
Returns:
List: The list of cropped patches.
"""
def get_box(instance: Dict) -> List:
if 'box' in instance:
return bbox2poly(instance['box']).tolist()
if 'poly' in instance:
return bbox2poly(poly2bbox(instance['poly'])).tolist()
def get_poly(instance: Dict) -> List:
if 'poly' in instance:
return instance['poly']
if 'box' in instance:
return bbox2poly(instance['box']).tolist()
data_list = []
img_path, instances = sample
img = mmcv.imread(img_path)
for i, instance in enumerate(instances):
if instance['ignore']:
continue
if self.crop_with_warp:
poly = get_poly(instance)
patch = warp_img(img, poly, self.jitter, self.jrx, self.jry)
else:
box = get_box(instance)
patch = crop_img(img, box, self.lepr, self.sepr)
if patch.shape[0] == 0 or patch.shape[1] == 0:
continue
text = instance['text']
patch_name = osp.splitext(
osp.basename(img_path))[0] + f'_{i}' + osp.splitext(
osp.basename(img_path))[1]
dst_path = osp.join(self.crop_save_path, split, patch_name)
mmcv.imwrite(patch, dst_path)
rec_instance = dict(
instances=[dict(text=text)],
img_path=osp.join(self.cropped_img_dir, split, patch_name))
data_list.append(rec_instance)
return data_list
def add_meta(self, sample: List) -> Dict:
# Since the TextRecogCropConverter packs all of the patches in a single
# image into a list, we need to flatten the list.
sample = [item for sublist in sample for item in sublist]
return super().add_meta(sample)
@DATA_CONVERTERS.register_module()
class WildReceiptConverter(BaseDataConverter):
"""MMOCR only supports wildreceipt dataset for KIE task now. This converter
converts the wildreceipt dataset from close set to open set.
Args:
splits (List): A list of splits to be processed.
data_root (str): Path to the data root.
gatherer (Dict): Config dict for gathering the dataset files.
parser (Dict): Config dict for parsing the dataset annotations.
dumper (Dict): Config dict for dumping the dataset files.
nproc (int): Number of processes to process the data.
delete (Optional[List]): A list of files to be deleted after
conversion. Defaults to ['annotations'].
merge_bg_others (bool): If True, give the same label to "background"
class and "others" class. Defaults to True.
ignore_idx (int): Index for ``ignore`` class. Defaults to 0.
others_idx (int): Index for ``others`` class. Defaults to 25.
"""
def __init__(self,
splits: List,
data_root: str,
gatherer: Dict,
parser: Dict,
dumper: Dict,
dataset_name: str,
nproc: int,
delete: Optional[List] = None,
merge_bg_others: bool = False,
ignore_idx: int = 0,
others_idx: int = 25):
self.ignore_idx = ignore_idx
self.others_idx = others_idx
self.merge_bg_others = merge_bg_others
parser.update(dict(ignore=ignore_idx))
super().__init__(
splits=splits,
data_root=data_root,
gatherer=gatherer,
parser=parser,
dumper=dumper,
dataset_name=dataset_name,
nproc=nproc,
task='kie',
delete=delete)
def add_meta(self, samples: List) -> List:
"""No meta info is required for the wildreceipt dataset."""
return samples
def pack_instance(self, sample: str, split: str):
"""Pack line-json str of close set to line-json str of open set.
Args:
sample (str): The string to be deserialized to
the close set dictionary object.
split (str): The split of the instance.
"""
# Two labels at the same index of the following two lists
# make up a key-value pair. For example, in wildreceipt,
# closeset_key_inds[0] maps to "Store_name_key"
# and closeset_value_inds[0] maps to "Store_addr_value".
closeset_key_inds = list(range(2, self.others_idx, 2))
closeset_value_inds = list(range(1, self.others_idx, 2))
openset_node_label_mapping = {
'bg': 0,
'key': 1,
'value': 2,
'others': 3
}
if self.merge_bg_others:
openset_node_label_mapping['others'] = openset_node_label_mapping[
'bg']
closeset_obj = json.loads(sample)
openset_obj = {
'file_name': closeset_obj['file_name'],
'height': closeset_obj['height'],
'width': closeset_obj['width'],
'annotations': []
}
edge_idx = 1
label_to_edge = {}
for anno in closeset_obj['annotations']:
label = anno['label']
if label == self.ignore_idx:
anno['label'] = openset_node_label_mapping['bg']
anno['edge'] = edge_idx
edge_idx += 1
elif label == self.others_idx:
anno['label'] = openset_node_label_mapping['others']
anno['edge'] = edge_idx
edge_idx += 1
else:
edge = label_to_edge.get(label, None)
if edge is not None:
anno['edge'] = edge
if label in closeset_key_inds:
anno['label'] = openset_node_label_mapping['key']
elif label in closeset_value_inds:
anno['label'] = openset_node_label_mapping['value']
else:
tmp_key = 'key'
if label in closeset_key_inds:
label_with_same_edge = closeset_value_inds[
closeset_key_inds.index(label)]
elif label in closeset_value_inds:
label_with_same_edge = closeset_key_inds[
closeset_value_inds.index(label)]
tmp_key = 'value'
edge_counterpart = label_to_edge.get(
label_with_same_edge, None)
if edge_counterpart is not None:
anno['edge'] = edge_counterpart
else:
anno['edge'] = edge_idx
edge_idx += 1
anno['label'] = openset_node_label_mapping[tmp_key]
label_to_edge[label] = anno['edge']
openset_obj['annotations'] = closeset_obj['annotations']
return json.dumps(openset_obj, ensure_ascii=False)

View File

@ -1,32 +1,39 @@
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os
import os.path as osp
import time
import shutil
from typing import List, Optional, Union
from mmengine import Registry
from mmengine.config import Config
from mmocr.utils.typing_utils import ConfigType, OptConfigType
DATA_PREPARERS = Registry('data preparer')
DATA_OBTAINERS = Registry('data_obtainer')
DATA_CONVERTERS = Registry('data_converter')
DATA_GATHERERS = Registry('data_gatherer')
DATA_PARSERS = Registry('data_parser')
DATA_DUMPERS = Registry('data_dumper')
DATA_PACKERS = Registry('data_packer')
CFG_GENERATORS = Registry('cfg_generator')
@DATA_PREPARERS.register_module()
class DatasetPreparer:
"""Base class of dataset preparer.
Dataset preparer is used to prepare datasets for MMOCR. It mainly consists
of three steps:
1. Obtain the dataset
1. For each split:
- Obtain the dataset
- Download
- Extract
- Move/Rename
2. Process the dataset
- Parse original annotations
- Convert to mmocr format
- Dump the annotation file
- Clean useless files
- Gather the dataset
- Parse the dataset
- Pack the dataset to MMOCR format
- Dump the dataset
2. Delete useless files
3. Generate the base config for this dataset
After all these steps, the original datasets have been prepared for
@ -34,106 +41,169 @@ class DatasetPreparer:
https://mmocr.readthedocs.io/en/dev-1.x/user_guides/dataset_prepare.html
Args:
cfg_path (str): Path to dataset config file.
data_root (str): Root directory of data.
dataset_name (str): Dataset name.
task (str): Task type. Options are 'textdet', 'textrecog',
'textspotting', and 'kie'. Defaults to 'textdet'.
nproc (int): Number of parallel processes. Defaults to 4.
overwrite_cfg (bool): Whether to overwrite the dataset config file if
it already exists. If False, Dataset Preparer will not generate new
config for datasets whose configs are already in base.
train_preparer (OptConfigType): cfg for train data preparation. It contains
the following keys:
- obtainer: cfg for data obtainer.
- gatherer: cfg for data gatherer.
- parser: cfg for data parser.
- packer: cfg for data packer.
- dumper: cfg for data dumper.
Defaults to None.
test_preparer (OptConfigType): cfg for test data preparation.
Defaults to None.
val_preparer (OptConfigType): cfg for val data preparation.
Defaults to None.
config_generator (OptConfigType): cfg for config generator. Defaults to
None.
delete (list[str], optional): List of files to be deleted.
Defaults to None.
"""
def __init__(self,
cfg_path: str,
dataset_name: str,
data_root: str,
dataset_name: str = '',
task: str = 'textdet',
nproc: int = 4,
overwrite_cfg: bool = False) -> None:
cfg_path = osp.join(cfg_path, dataset_name)
train_preparer: OptConfigType = None,
test_preparer: OptConfigType = None,
val_preparer: OptConfigType = None,
config_generator: OptConfigType = None,
delete: Optional[List[str]] = None) -> None:
self.data_root = data_root
self.nproc = nproc
self.task = task
self.dataset_name = dataset_name
self.overwrite_cfg = overwrite_cfg
self.parse_meta(cfg_path)
self.parse_cfg(cfg_path)
self.train_preparer = train_preparer
self.test_preparer = test_preparer
self.val_preparer = val_preparer
self.config_generator = config_generator
self.delete = delete
def __call__(self):
def run(self, splits: Union[str, List] = ['train', 'test', 'val']) -> None:
"""Prepare the dataset."""
if self.with_obtainer:
print('Obtaining Dataset...')
self.data_obtainer()
if self.with_converter:
print('Converting Dataset...')
self.data_converter()
if self.with_config_generator:
print('Generating base configs...')
self.config_generator()
if isinstance(splits, str):
splits = [splits]
assert set(splits).issubset(set(['train', 'test',
'val'])), 'Invalid split name'
for split in splits:
self.loop(split, getattr(self, f'{split}_preparer'))
self.clean()
self.generate_config()
def parse_meta(self, cfg_path: str) -> None:
"""Parse meta file.
@classmethod
def from_file(cls, cfg: ConfigType) -> 'DatasetPreparer':
"""Create a DataPreparer from config file.
Args:
cfg_path (str): Path to meta file.
cfg (ConfigType): A config used for building the data preparer.
Keys of ``cfg`` are described in :meth:`__init__`.
Returns:
DatasetPreparer: A DatasetPreparer built from ``cfg``.
"""
try:
meta = Config.fromfile(osp.join(cfg_path, 'metafile.yml'))
except FileNotFoundError:
cfg = copy.deepcopy(cfg)
data_preparer = cls(
data_root=cfg['data_root'],
dataset_name=cfg.get('dataset_name', ''),
task=cfg.get('task', 'textdet'),
nproc=cfg.get('nproc', 4),
train_preparer=cfg.get('train_preparer', None),
test_preparer=cfg.get('test_preparer', None),
val_preparer=cfg.get('val_preparer', None),
delete=cfg.get('delete', None),
config_generator=cfg.get('config_generator', None))
return data_preparer
def loop(self, split: str, cfg: ConfigType) -> None:
"""Loop over the dataset.
Args:
split (str): The split of the dataset.
cfg (ConfigType): A config used for building obtainer, gatherer,
parser, packer and dumper.
"""
if cfg is None:
return
assert self.task in meta['Data']['Tasks'], \
f'Task {self.task} not supported!'
# License related
if meta['Data']['License']['Type']:
print(f"\033[1;33;40mDataset Name: {meta['Name']}")
print(f"License Type: {meta['Data']['License']['Type']}")
print(f"License Link: {meta['Data']['License']['Link']}")
print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m")
print(
'\033[1;31;43mMMOCR does not own the dataset. Using this '
'dataset you must accept the license provided by the owners, '
'and cite the corresponding papers appropriately.')
print('If you do not agree with the above license, please cancel '
'the progress immediately by pressing ctrl+c. Otherwise, '
'you are deemed to accept the terms and conditions.\033[0m')
for i in range(5):
print(f'{5-i}...')
time.sleep(1)
def parse_cfg(self, cfg_path: str) -> None:
"""Parse dataset config file.
# build obtainer and run
obtainer = cfg.get('obtainer', None)
if obtainer:
print(f'Obtaining {split} Dataset...')
obtainer.setdefault('task', default=self.task)
obtainer.setdefault('data_root', default=self.data_root)
obtainer = DATA_OBTAINERS.build(obtainer)
obtainer()
Args:
cfg_path (str): Path to dataset config file.
"""
cfg_path = osp.join(cfg_path, self.task + '.py')
assert osp.exists(cfg_path), f'Config file {cfg_path} not found!'
cfg = Config.fromfile(cfg_path)
# build gatherer
gatherer = cfg.get('gatherer', None)
parser = cfg.get('parser', None)
packer = cfg.get('packer', None)
dumper = cfg.get('dumper', None)
related = [gatherer, parser, packer, dumper]
if all(item is None for item in related): # no data process
return
if not all(item is not None for item in related):
raise ValueError('gatherer, parser, packer and dumper should be '
'either all None or not None')
if 'data_obtainer' in cfg:
cfg.data_obtainer.update(task=self.task)
self.data_obtainer = DATA_OBTAINERS.build(cfg.data_obtainer)
if 'data_converter' in cfg:
cfg.data_converter.update(
dict(nproc=self.nproc, dataset_name=self.dataset_name))
self.data_converter = DATA_CONVERTERS.build(cfg.data_converter)
if 'config_generator' in cfg:
cfg.config_generator.update(
dict(
dataset_name=self.dataset_name,
overwrite_cfg=self.overwrite_cfg))
self.config_generator = CFG_GENERATORS.build(cfg.config_generator)
print(f'Gathering {split} Dataset...')
gatherer.setdefault('split', default=split)
gatherer.setdefault('data_root', default=self.data_root)
gatherer.setdefault('ann_dir', default='annotations')
gatherer.setdefault(
'img_dir', default=osp.join(f'{self.task}_imgs', split))
@property
def with_obtainer(self) -> bool:
"""bool: whether the data preparer has an obtainer"""
return getattr(self, 'data_obtainer', None) is not None
gatherer = DATA_GATHERERS.build(gatherer)
img_paths, ann_paths = gatherer()
@property
def with_converter(self) -> bool:
"""bool: whether the data preparer has an converter"""
return getattr(self, 'data_converter', None) is not None
# build parser
print(f'Parsing {split} Images and Annotations...')
parser.setdefault('split', default=split)
parser.setdefault('nproc', default=self.nproc)
parser = DATA_PARSERS.build(parser)
# Convert dataset annotations to MMOCR format
samples = parser(img_paths, ann_paths)
@property
def with_config_generator(self) -> bool:
"""bool: whether the data preparer has a config generator"""
return getattr(self, 'config_generator', None) is not None
# build packer
print(f'Packing {split} Annotations...')
packer.setdefault('split', default=split)
packer.setdefault('nproc', default=self.nproc)
packer.setdefault('data_root', default=self.data_root)
packer = DATA_PACKERS.build(packer)
samples = packer(samples)
# build dumper
print(f'Dumping {split} Annotations...')
# Dump annotation files
dumper.setdefault('task', default=self.task)
dumper.setdefault('split', default=split)
dumper.setdefault('data_root', default=self.data_root)
dumper = DATA_DUMPERS.build(dumper)
dumper(samples)
def generate_config(self):
if self.config_generator is None:
return
self.config_generator.setdefault(
'dataset_name', default=self.dataset_name)
self.config_generator.setdefault('data_root', default=self.data_root)
config_generator = CFG_GENERATORS.build(self.config_generator)
print('Generating base configs...')
config_generator()
def clean(self) -> None:
if self.delete is None:
return
for d in self.delete:
delete_file = osp.join(self.data_root, d)
if osp.exists(delete_file):
if osp.isdir(delete_file):
shutil.rmtree(delete_file)
else:
os.remove(delete_file)
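End to end, the refactored preparer can be driven from a dataset_zoo config roughly as in the sketch below; the config path and dataset name are placeholders, not values from this diff.

from mmengine.config import Config

from mmocr.datasets.preparers.data_preparer import DatasetPreparer

cfg = Config.fromfile('dataset_zoo/cocotextv2/textdet.py')  # placeholder path
cfg.dataset_name = 'cocotextv2'
cfg.task = 'textdet'
preparer = DatasetPreparer.from_file(cfg)
# For each requested split: obtain -> gather -> parse -> pack -> dump,
# then delete temporary files and generate the base config.
preparer.run(splits=['train', 'val'])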

View File

@ -1,4 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .dumpers import JsonDumper, WildreceiptOpensetDumper
from .base import BaseDumper
from .json_dumper import JsonDumper
from .wild_receipt_openset_dumper import WildreceiptOpensetDumper
__all__ = ['JsonDumper', 'WildreceiptOpensetDumper']
__all__ = ['BaseDumper', 'JsonDumper', 'WildreceiptOpensetDumper']

View File

@ -0,0 +1,35 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any
class BaseDumper:
"""Base class for data dumpers.
Args:
task (str): Task type. Options are 'textdet', 'textrecog',
'textspotting', and 'kie'. It is usually set automatically and users
do not need to set it manually in the config file in most cases.
split (str): The partition of the dataset. Options are 'train',
'val' or 'test'. It is usually set automatically and users do not
need to set it manually in the config file in most cases. Defaults
to None.
data_root (str): The root directory of the image and
annotation. It is usually set automatically and users do not need
to set it manually in config file in most cases. Defaults to None.
"""
def __init__(self, task: str, split: str, data_root: str) -> None:
self.task = task
self.split = split
self.data_root = data_root
def __call__(self, data: Any) -> None:
"""Call function.
Args:
data (Any): Data to be dumped.
"""
self.dump(data)
def dump(self, data: Any) -> None:
raise NotImplementedError

View File

@ -1,49 +0,0 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Dict, List
import mmengine
from mmocr.utils import list_to_file
from ..data_preparer import DATA_DUMPERS
@DATA_DUMPERS.register_module()
class JsonDumper:
def __init__(self, task: str) -> None:
self.task = task
def dump(self, data: Dict, data_root: str, split: str) -> None:
"""Dump data to json file.
Args:
data (Dict): Data to be dumped.
data_root (str): Root directory of data.
split (str): Split of data.
cfg_path (str): Path to configs. Defaults to 'configs/'.
"""
filename = f'{self.task}_{split}.json'
dst_file = osp.join(data_root, filename)
mmengine.dump(data, dst_file)
@DATA_DUMPERS.register_module()
class WildreceiptOpensetDumper:
def __init__(self, task: str) -> None:
self.task = task
def dump(self, data: List, data_root: str, split: str):
"""Dump data to txt file.
Args:
data (List): Data to be dumped.
data_root (str): Root directory of data.
split (str): Split of data.
"""
filename = f'openset_{split}.txt'
dst_file = osp.join(data_root, filename)
list_to_file(dst_file, data)

View File

@ -0,0 +1,24 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Dict
import mmengine
from ..data_preparer import DATA_DUMPERS
from .base import BaseDumper
@DATA_DUMPERS.register_module()
class JsonDumper(BaseDumper):
"""Dumper for json file."""
def dump(self, data: Dict) -> None:
"""Dump data to json file.
Args:
data (Dict): Data to be dumped.
"""
filename = f'{self.task}_{self.split}.json'
dst_file = osp.join(self.data_root, filename)
mmengine.dump(data, dst_file)
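A quick sketch of the dumper in isolation follows; the data is a toy placeholder, the import path is assumed, and the constructor arguments are normally filled in by the preparer via ``setdefault``.

from mmocr.datasets.preparers.dumpers import JsonDumper  # assumed path

dumper = JsonDumper(task='textdet', split='train', data_root='data/toy')
# BaseDumper.__call__ delegates to dump(), which writes
# data/toy/textdet_train.json via mmengine.dump (assumes data/toy exists).
dumper(dict(metainfo={}, data_list=[]))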

View File

@ -0,0 +1,22 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import List
from mmocr.utils import list_to_file
from ..data_preparer import DATA_DUMPERS
from .base import BaseDumper
@DATA_DUMPERS.register_module()
class WildreceiptOpensetDumper(BaseDumper):
def dump(self, data: List):
"""Dump data to txt file.
Args:
data (List): Data to be dumped.
"""
filename = f'openset_{self.split}.txt'
dst_file = osp.join(self.data_root, filename)
list_to_file(dst_file, data)

View File

@ -0,0 +1,8 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .base import BaseGatherer
from .mono_gatherer import MonoGatherer
from .naf_gatherer import NAFGatherer
from .pair_gatherer import PairGatherer
__all__ = ['BaseGatherer', 'MonoGatherer', 'PairGatherer', 'NAFGatherer']

View File

@ -0,0 +1,49 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import List, Optional, Tuple, Union
class BaseGatherer:
"""Base class for gatherer.
Note: The gatherer assumes that all the annotation files are in the same
directory and all the image files are in the same directory.
Args:
img_dir (str): The directory of the images. It is usually set
automatically to f'{task}_imgs/{split}' and users do not need to
set it manually in the config file in most cases. When the image
files are not in the f'{task}_imgs/{split}' directory, users should
set it. Defaults to ''.
ann_dir (str): The directory of the annotation files. It is usually
set automatically to 'annotations' and users do not need to set it
manually in the config file in most cases. When the annotation files
are not in the 'annotations' directory, users should set it.
Defaults to 'annotations'.
split (str, optional): The split to gather, i.e. the partition of
the dataset. Options are 'train', 'val' or 'test'. It is usually
set automatically and users do not need to set it manually in the
config file in most cases. Defaults to None.
data_root (str, optional): The root directory of the image and
annotation. It is usually set automatically and users do not need
to set it manually in config file in most cases. Defaults to None.
"""
def __init__(self,
img_dir: str = '',
ann_dir: str = 'annotations',
split: Optional[str] = None,
data_root: Optional[str] = None) -> None:
self.split = split
self.data_root = data_root
self.ann_dir = osp.join(data_root, ann_dir)
self.img_dir = osp.join(data_root, img_dir)
def __call__(self) -> Union[Tuple[List[str], List[str]], Tuple[str, str]]:
"""The return value of the gatherer is a tuple of two lists or strings.
The first element is the list of image paths or the directory of the
images. The second element is the list of annotation paths or the path
of the annotation file which contains all the annotations.
"""
raise NotImplementedError

View File

@ -0,0 +1,34 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Tuple
from ..data_preparer import DATA_GATHERERS
from .base import BaseGatherer
@DATA_GATHERERS.register_module()
class MonoGatherer(BaseGatherer):
"""Gather the dataset file. Specifically for the case that only one
annotation file is needed. For example,
img_001.jpg \
img_002.jpg ---> train.json
img_003.jpg /
Args:
ann_name (str): The name of the annotation file.
"""
def __init__(self, ann_name: str, **kwargs) -> None:
super().__init__(**kwargs)
self.ann_name = ann_name
def __call__(self) -> Tuple[str, str]:
"""
Returns:
tuple(str, str): The directory of the image and the path of
annotation file.
"""
return (self.img_dir, osp.join(self.ann_dir, self.ann_name))
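A quick sketch of what this gatherer returns (directory and file names invented):

from mmocr.datasets.preparers.gatherers import MonoGatherer

gatherer = MonoGatherer(
    data_root='data/demo',
    img_dir='textdet_imgs/imgs',
    ann_dir='annotations',
    ann_name='train.json',
    split='train')
img_dir, ann_path = gatherer()
# img_dir  == 'data/demo/textdet_imgs/imgs'
# ann_path == 'data/demo/annotations/train.json'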

View File

@ -0,0 +1,66 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import os.path as osp
import shutil
import warnings
from typing import List, Tuple
from ..data_preparer import DATA_GATHERERS
from .base import BaseGatherer
@DATA_GATHERERS.register_module()
class NAFGatherer(BaseGatherer):
"""Gather the dataset file from NAF dataset. Specifically for the case that
there is a split file that contains the names of different splits. For
example,
img_001.jpg train: img_001.jpg
img_002.jpg ---> split_file ---> test: img_002.jpg
img_003.jpg val: img_003.jpg
Args:
split_file (str, optional): The name of the split file. Defaults to
"data_split.json".
temp_dir (str, optional): The directory of the temporary images.
Defaults to "temp_images".
"""
def __init__(self,
split_file='data_split.json',
temp_dir: str = 'temp_images',
**kwargs) -> None:
super().__init__(**kwargs)
self.temp_dir = temp_dir
self.split_file = split_file
def __call__(self) -> Tuple[List[str], List[str]]:
"""
Returns:
tuple(list[str], list[str]): The list of image paths and the list
of annotation paths.
"""
split_file = osp.join(self.data_root, self.split_file)
with open(split_file, 'r') as f:
split_data = json.load(f)
img_list = list()
ann_list = list()
# Rename the key
split_data['val'] = split_data.pop('valid')
if not osp.exists(self.img_dir):
os.makedirs(self.img_dir)
current_split_data = split_data[self.split]
for groups in current_split_data:
for img_name in current_split_data[groups]:
src_img = osp.join(self.data_root, self.temp_dir, img_name)
dst_img = osp.join(self.img_dir, img_name)
if not osp.exists(src_img):
warnings.warn(f'{src_img} does not exist!')
continue
# move the image to the new path
shutil.move(src_img, dst_img)
ann = osp.join(self.ann_dir, img_name.replace('.jpg', '.json'))
img_list.append(dst_img)
ann_list.append(ann)
return img_list, ann_list
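For reference, a sketch of the split file shape this gatherer expects (file and group names invented); note the 'valid' key, which the gatherer renames to 'val' before the lookup:

# Assumed shape of data_split.json:
split_data = {
    'train': {'group_a': ['img_001.jpg']},
    'valid': {'group_a': ['img_003.jpg']},  # renamed to 'val' at runtime
    'test': {'group_a': ['img_002.jpg']},
}
split_data['val'] = split_data.pop('valid')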

View File

@ -0,0 +1,59 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import re
from typing import List, Optional, Tuple
from mmocr.utils import list_files
from ..data_preparer import DATA_GATHERERS
from .base import BaseGatherer
@DATA_GATHERERS.register_module()
class PairGatherer(BaseGatherer):
"""Gather the dataset files. Specifically for the paired annotations. That
is to say, each image has a corresponding annotation file. For example,
img_1.jpg <---> gt_img_1.txt
img_2.jpg <---> gt_img_2.txt
img_3.jpg <---> gt_img_3.txt
Args:
img_suffixes (List[str]): File suffixes used for searching.
rule (Sequence): The rule for pairing the files. The first element is
the matching pattern for the file, and the second element is the
replacement pattern, which should be a regular expression. For
example, to map the image name img_1.jpg to the annotation name
gt_img_1.txt, the rule is
[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt'] # noqa: W605 E501
Note: PairGatherer assumes that the annotation files of each split are in
the corresponding split directory. For example, all the train annotation
files are in {ann_dir}/train.
"""
def __init__(self,
img_suffixes: Optional[List[str]] = None,
rule: Optional[List[str]] = None,
**kwargs) -> None:
super().__init__(**kwargs)
self.rule = rule
self.img_suffixes = img_suffixes
# ann_dir = {ann_root}/{ann_dir}/{split}
self.ann_dir = osp.join(self.ann_dir, self.split)
def __call__(self) -> Tuple[List[str], List[str]]:
"""tuple(list, list): The list of image paths and the list of
annotation paths."""
img_list = list()
ann_list = list()
for img_path in list_files(self.img_dir, self.img_suffixes):
if not re.match(self.rule[0], osp.basename(img_path)):
continue
ann_name = re.sub(self.rule[0], self.rule[1],
osp.basename(img_path))
ann_path = osp.join(self.ann_dir, ann_name)
img_list.append(img_path)
ann_list.append(ann_path)
return img_list, ann_list
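The rule is a (pattern, replacement) regex pair; a quick check of the docstring's example rule:

import re

rule = [r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']
assert re.sub(rule[0], rule[1], 'img_3.JPG') == 'gt_img_3.txt'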

View File

@ -0,0 +1,4 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .naive_data_obtainer import NaiveDataObtainer
__all__ = ['NaiveDataObtainer']

View File

@ -10,7 +10,7 @@ from typing import Dict, List, Optional, Tuple
from mmengine import mkdir_or_exist
from mmocr.utils import check_integrity, is_archive
from .data_preparer import DATA_OBTAINERS
from ..data_preparer import DATA_OBTAINERS
ssl._create_default_https_context = ssl._create_unverified_context
@ -24,8 +24,12 @@ class NaiveDataObtainer:
Args:
files (list[dict]): A list of file information.
cache_path (str): The path to cache the downloaded files.
data_root (str): The root path of the dataset.
task (str): The task of the dataset.
data_root (str): The root path of the dataset. It is usually set
automatically and users do not need to set it manually in config file
in most cases.
task (str): The task of the dataset. It is usually set automatically
and users do not need to set it manually in config file
in most cases.
"""
def __init__(self, files: List[Dict], cache_path: str, data_root: str,
@ -114,6 +118,23 @@ class NaiveDataObtainer:
dst_path = osp.join(osp.dirname(src_path), zip_name)
else:
dst_path = osp.join(dst_path, zip_name)
extracted = False
if osp.exists(dst_path):
name = set(os.listdir(dst_path))
if '.finish' in name:
extracted = True
elif '.finish' not in name and len(name) > 0:
while True:
c = input(f'{dst_path} already exists when extracting '
f'{zip_name}, unzip it again? (y/n) ')
if c.lower() in ['y', 'n']:
extracted = c == 'n'
break
if extracted:
open(osp.join(dst_path, '.finish'), 'w').close()
print(f'{zip_name} has been extracted. Skip')
return
mkdir_or_exist(dst_path)
print(f'Extracting: {osp.basename(src_path)}')
if src_path.endswith('.zip'):
@ -136,6 +157,8 @@ class NaiveDataObtainer:
'Please install tarfile by running "pip install tarfile".')
with tarfile.open(src_path, mode) as tar_ref:
tar_ref.extractall(dst_path)
open(osp.join(dst_path, '.finish'), 'w').close()
if delete:
os.remove(src_path)
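The resume logic added above boils down to a sentinel check, roughly:

import os.path as osp

def already_extracted(dst_path: str) -> bool:
    """An archive counts as fully extracted once '.finish' exists."""
    return osp.exists(osp.join(dst_path, '.finish'))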

View File

@ -0,0 +1,11 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .base import BasePacker
from .textdet_packer import TextDetPacker
from .textrecog_packer import TextRecogCropPacker, TextRecogPacker
from .textspotting_packer import TextSpottingPacker
from .wildreceipt_packer import WildReceiptPacker
__all__ = [
'BasePacker', 'TextDetPacker', 'TextRecogPacker', 'TextRecogCropPacker',
'TextSpottingPacker', 'WildReceiptPacker'
]

View File

@ -0,0 +1,57 @@
# Copyright (c) OpenMMLab. All rights reserved.
from abc import abstractmethod
from typing import Dict, List, Tuple
from mmengine import track_parallel_progress
class BasePacker:
"""Base class for packing the parsed annotation info to MMOCR format.
Args:
data_root (str): The root path of the dataset. It is usually set
automatically and users do not need to set it manually in config file
in most cases.
split (str): The split of the dataset. It is usually set automatically
and users do not need to set it manually in config file in most
cases.
nproc (int): Number of processes to process the data. Defaults to 1.
It is usually set automatically and users do not need to set it
manually in config file in most cases.
"""
def __init__(self, data_root: str, split: str, nproc: int = 1) -> None:
self.data_root = data_root
self.split = split
self.nproc = nproc
@abstractmethod
def pack_instance(self, sample: Tuple, split: str) -> Dict:
"""Pack the parsed annotation info to an MMOCR format instance.
Args:
sample (Tuple): A tuple of (img_path, instances).
- img_path (str): Path to image file.
- instances (Sequence[Dict]): A list of converted annos.
split (str): The split of the instance.
Returns:
Dict: An MMOCR format instance.
"""
@abstractmethod
def add_meta(self, sample: List) -> Dict:
"""Add meta information to the sample.
Args:
sample (List): A list of samples of the dataset.
Returns:
Dict: A dict contains the meta information and samples.
"""
def __call__(self, samples) -> Dict:
samples = track_parallel_progress(
self.pack_instance, samples, nproc=self.nproc)
samples = self.add_meta(samples)
return samples

View File

@ -0,0 +1,110 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Dict, List, Tuple
import mmcv
from mmocr.utils import bbox2poly, poly2bbox
from ..data_preparer import DATA_PACKERS
from .base import BasePacker
@DATA_PACKERS.register_module()
class TextDetPacker(BasePacker):
"""Text detection packer. It is used to pack the parsed annotation info to.
.. code-block:: python
{
"metainfo":
{
"dataset_type": "TextDetDataset",
"task_name": "textdet",
"category": [{"id": 0, "name": "text"}]
},
"data_list":
[
{
"img_path": "test_img.jpg",
"height": 640,
"width": 640,
"instances":
[
{
"polygon": [0, 0, 0, 10, 10, 20, 20, 0],
"bbox": [0, 0, 10, 20],
"bbox_label": 0,
"ignore": False
},
// ...
]
}
]
}
"""
def pack_instance(self, sample: Tuple, bbox_label: int = 0) -> Dict:
"""Pack the parsed annotation info to an MMOCR format instance.
Args:
sample (Tuple): A tuple of (img_path, instances).
- img_path (str): Path to the image file.
- instances (Sequence[Dict]): A list of converted annos. Each
element should be a dict with the following keys:
- 'poly' or 'box'
- 'ignore'
- 'bbox_label' (optional)
bbox_label (int): The label index of the text. Defaults to 0.
Returns:
Dict: An MMOCR format instance.
"""
img_path, instances = sample
img = mmcv.imread(img_path)
h, w = img.shape[:2]
packed_instances = list()
for instance in instances:
poly = instance.get('poly', None)
box = instance.get('box', None)
assert box or poly
packed_sample = dict(
polygon=poly if poly else list(
bbox2poly(box).astype('float64')),
bbox=box if box else list(poly2bbox(poly).astype('float64')),
bbox_label=bbox_label,
ignore=instance['ignore'])
packed_instances.append(packed_sample)
packed_instances = dict(
instances=packed_instances,
img_path=osp.relpath(img_path, self.data_root),
height=h,
width=w)
return packed_instances
def add_meta(self, sample: List) -> Dict:
"""Add meta information to the sample.
Args:
sample (List): A list of samples of the dataset.
Returns:
Dict: A dict contains the meta information and samples.
"""
meta = {
'metainfo': {
'dataset_type': 'TextDetDataset',
'task_name': 'textdet',
'category': [{
'id': 0,
'name': 'text'
}]
},
'data_list': sample
}
return meta
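A hypothetical round trip (paths invented; the image must exist, since pack_instance reads it to fill in height and width):

from mmocr.datasets.preparers import TextDetPacker

packer = TextDetPacker(data_root='data/demo', split='train')
sample = ('data/demo/img_1.jpg',
          [dict(box=[0, 0, 10, 20], ignore=False)])
packed = packer.add_meta([packer.pack_instance(sample)])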

View File

@ -0,0 +1,178 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Dict, List, Tuple
import mmcv
from mmengine import mkdir_or_exist
from mmocr.utils import bbox2poly, crop_img, poly2bbox, warp_img
from ..data_preparer import DATA_PACKERS
from .base import BasePacker
@DATA_PACKERS.register_module()
class TextRecogPacker(BasePacker):
"""Text recogntion packer. It is used to pack the parsed annotation info
to:
.. code-block:: python
{
"metainfo":
{
"dataset_type": "TextRecogDataset",
"task_name": "textrecog",
},
"data_list":
[
{
"img_path": "textrecog_imgs/train/test_img.jpg",
"instances":
[
{
"text": "GRAND"
}
]
}
]
}
"""
def pack_instance(self, sample: Tuple) -> Dict:
"""Pack the text info to a recognition instance.
Args:
sample (Tuple): A tuple of (img_name, text).
Returns:
Dict: The packed instance.
"""
img_name, text = sample
# TODO: remove hard code
packed_instance = dict(
instances=[dict(text=text)],
img_path=osp.join('textrecog_imgs', self.split,
osp.basename(img_name)))
return packed_instance
def add_meta(self, sample: List) -> Dict:
"""Add meta information to the sample.
Args:
sample (List): A list of samples of the dataset.
Returns:
Dict: A dict contains the meta information and samples.
"""
meta = {
'metainfo': {
'dataset_type': 'TextRecogDataset',
'task_name': 'textrecog'
},
'data_list': sample
}
return meta
@DATA_PACKERS.register_module()
class TextRecogCropPacker(TextRecogPacker):
"""Text recognition packer with image cropper. It is used to pack the
parsed annotation info and crop out the word images from the full-size
ones.
Args:
crop_with_warp (bool): Whether to crop the text from the original
image using opencv warpPerspective.
jitter (bool): (Applicable when crop_with_warp=True)
Whether to jitter the box.
jitter_ratio_x (float): (Applicable when crop_with_warp=True)
Horizontal jitter ratio relative to the height.
jitter_ratio_y (float): (Applicable when crop_with_warp=True)
Vertical jitter ratio relative to the height.
long_edge_pad_ratio (float): (Applicable when crop_with_warp=False)
The ratio of padding the long edge of the cropped image.
Defaults to 0.0.
short_edge_pad_ratio (float): (Applicable when crop_with_warp=False)
The ratio of padding the short edge of the cropped image.
Defaults to 0.0.
"""
def __init__(self,
crop_with_warp: bool = False,
jitter: bool = False,
jitter_ratio_x: float = 0.0,
jitter_ratio_y: float = 0.0,
long_edge_pad_ratio: float = 0.0,
short_edge_pad_ratio: float = 0.0,
**kwargs):
super().__init__(**kwargs)
self.crop_with_warp = crop_with_warp
self.jitter = jitter
self.jrx = jitter_ratio_x
self.jry = jitter_ratio_y
self.lepr = long_edge_pad_ratio
self.sepr = short_edge_pad_ratio
# The crop packer crops the full textdet images into text patches
self.cropped_img_dir = 'textrecog_imgs'
self.crop_save_path = osp.join(self.data_root, self.cropped_img_dir)
mkdir_or_exist(self.crop_save_path)
mkdir_or_exist(osp.join(self.crop_save_path, self.split))
def pack_instance(self, sample: Tuple) -> List:
"""Crop patches from image.
Args:
sample (Tuple): A tuple of (img_path, instances).
Returns:
List: The list of cropped patches.
"""
def get_box(instance: Dict) -> List:
if 'box' in instance:
return bbox2poly(instance['box']).tolist()
if 'poly' in instance:
return bbox2poly(poly2bbox(instance['poly'])).tolist()
def get_poly(instance: Dict) -> List:
if 'poly' in instance:
return instance['poly']
if 'box' in instance:
return bbox2poly(instance['box']).tolist()
data_list = []
img_path, instances = sample
img = mmcv.imread(img_path)
for i, instance in enumerate(instances):
if instance['ignore']:
continue
if self.crop_with_warp:
poly = get_poly(instance)
patch = warp_img(img, poly, self.jitter, self.jrx, self.jry)
else:
box = get_box(instance)
patch = crop_img(img, box, self.lepr, self.sepr)
if patch.shape[0] == 0 or patch.shape[1] == 0:
continue
text = instance['text']
patch_name = osp.splitext(
osp.basename(img_path))[0] + f'_{i}' + osp.splitext(
osp.basename(img_path))[1]
dst_path = osp.join(self.crop_save_path, self.split, patch_name)
mmcv.imwrite(patch, dst_path)
rec_instance = dict(
instances=[dict(text=text)],
img_path=osp.join(self.cropped_img_dir, self.split,
patch_name))
data_list.append(rec_instance)
return data_list
def add_meta(self, sample: List) -> Dict:
# Since TextRecogCropPacker packs all of the patches cropped from a
# single image into a list, we need to flatten the list.
sample = [item for sublist in sample for item in sublist]
return super().add_meta(sample)
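A construction sketch (values invented): warp-based cropping with a small box jitter. Patches are written under data/demo/textrecog_imgs/train, which the constructor creates eagerly:

from mmocr.datasets.preparers import TextRecogCropPacker

packer = TextRecogCropPacker(
    data_root='data/demo', split='train',
    crop_with_warp=True, jitter=True,
    jitter_ratio_x=0.02, jitter_ratio_y=0.02)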

View File

@ -0,0 +1,113 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Dict, List, Tuple
import mmcv
from mmocr.utils import bbox2poly, poly2bbox
from ..data_preparer import DATA_PACKERS
from .base import BasePacker
@DATA_PACKERS.register_module()
class TextSpottingPacker(BasePacker):
"""Text spotting packer. It is used to pack the parsed annotation info to:
.. code-block:: python
{
"metainfo":
{
"dataset_type": "TextDetDataset",
"task_name": "textdet",
"category": [{"id": 0, "name": "text"}]
},
"data_list":
[
{
"img_path": "test_img.jpg",
"height": 640,
"width": 640,
"instances":
[
{
"polygon": [0, 0, 0, 10, 10, 20, 20, 0],
"bbox": [0, 0, 10, 20],
"bbox_label": 0,
"ignore": False,
"text": "mmocr"
},
// ...
]
}
]
}
"""
def pack_instance(self, sample: Tuple, bbox_label: int = 0) -> Dict:
"""Pack the parsed annotation info to an MMOCR format instance.
Args:
sample (Tuple): A tuple of (img_path, instances).
- img_path (str): Path to image file.
- instances (Sequence[Dict]): A list of converted annos. Each
element should be a dict with the following keys:
- 'poly' or 'box'
- 'text'
- 'ignore'
- 'bbox_label' (optional)
bbox_label (int): The label index of the text. Defaults to 0.
Returns:
Dict: An MMOCR format instance.
"""
img_path, instances = sample
img = mmcv.imread(img_path)
h, w = img.shape[:2]
packed_instances = list()
for instance in instances:
assert 'text' in instance, 'Text is not found in the instance.'
poly = instance.get('poly', None)
box = instance.get('box', None)
assert box or poly
packed_sample = dict(
polygon=poly if poly else list(
bbox2poly(box).astype('float64')),
bbox=box if box else list(poly2bbox(poly).astype('float64')),
bbox_label=bbox_label,
ignore=instance['ignore'],
text=instance['text'])
packed_instances.append(packed_sample)
packed_instances = dict(
instances=packed_instances,
img_path=osp.relpath(img_path, self.data_root),
height=h,
width=w)
return packed_instances
def add_meta(self, sample: List) -> Dict:
"""Add meta information to the sample.
Args:
sample (List): A list of samples of the dataset.
Returns:
Dict: A dict contains the meta information and samples.
"""
meta = {
'metainfo': {
'dataset_type': 'TextSpottingDataset',
'task_name': 'textspotting',
'category': [{
'id': 0,
'name': 'text'
}]
},
'data_list': sample
}
return meta
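Usage mirrors TextDetPacker, except every instance must also carry a 'text' field (paths invented; the image must exist for the size lookup):

from mmocr.datasets.preparers import TextSpottingPacker

packer = TextSpottingPacker(data_root='data/demo', split='train')
sample = ('data/demo/img_1.jpg',
          [dict(box=[0, 0, 10, 20], ignore=False, text='mmocr')])
packed = packer.pack_instance(sample)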

View File

@ -0,0 +1,112 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
from typing import List
from ..data_preparer import DATA_PACKERS
from .base import BasePacker
@DATA_PACKERS.register_module()
class WildReceiptPacker(BasePacker):
"""Pack the wildreceipt annotation to MMOCR format.
Args:
merge_bg_others (bool): If True, give the same label to "background"
class and "others" class. Defaults to True.
ignore_idx (int): Index for ``ignore`` class. Defaults to 0.
others_idx (int): Index for ``others`` class. Defaults to 25.
"""
def __init__(self,
merge_bg_others: bool = False,
ignore_idx: int = 0,
others_idx: int = 25,
**kwargs) -> None:
super().__init__(**kwargs)
self.ignore_idx = ignore_idx
self.others_idx = others_idx
self.merge_bg_others = merge_bg_others
def add_meta(self, samples: List) -> List:
"""No meta info is required for the wildreceipt dataset."""
return samples
def pack_instance(self, sample: str):
"""Pack line-json str of close set to line-json str of open set.
Args:
sample (str): The string to be deserialized to
the close set dictionary object.
"""
# Two labels at the same index of the following two lists
# make up a key-value pair. For example, in wildreceipt,
# closeset_key_inds[0] maps to "Store_name_key"
# and closeset_value_inds[0] maps to "Store_name_value".
closeset_key_inds = list(range(2, self.others_idx, 2))
closeset_value_inds = list(range(1, self.others_idx, 2))
openset_node_label_mapping = {
'bg': 0,
'key': 1,
'value': 2,
'others': 3
}
if self.merge_bg_others:
openset_node_label_mapping['others'] = openset_node_label_mapping[
'bg']
closeset_obj = json.loads(sample)
openset_obj = {
'file_name':
closeset_obj['file_name'].replace(self.data_root + '/', ''),
'height':
closeset_obj['height'],
'width':
closeset_obj['width'],
'annotations': []
}
edge_idx = 1
label_to_edge = {}
for anno in closeset_obj['annotations']:
label = anno['label']
if label == self.ignore_idx:
anno['label'] = openset_node_label_mapping['bg']
anno['edge'] = edge_idx
edge_idx += 1
elif label == self.others_idx:
anno['label'] = openset_node_label_mapping['others']
anno['edge'] = edge_idx
edge_idx += 1
else:
edge = label_to_edge.get(label, None)
if edge is not None:
anno['edge'] = edge
if label in closeset_key_inds:
anno['label'] = openset_node_label_mapping['key']
elif label in closeset_value_inds:
anno['label'] = openset_node_label_mapping['value']
else:
tmp_key = 'key'
if label in closeset_key_inds:
label_with_same_edge = closeset_value_inds[
closeset_key_inds.index(label)]
elif label in closeset_value_inds:
label_with_same_edge = closeset_key_inds[
closeset_value_inds.index(label)]
tmp_key = 'value'
edge_counterpart = label_to_edge.get(
label_with_same_edge, None)
if edge_counterpart is not None:
anno['edge'] = edge_counterpart
else:
anno['edge'] = edge_idx
edge_idx += 1
anno['label'] = openset_node_label_mapping[tmp_key]
label_to_edge[label] = anno['edge']
openset_obj['annotations'] = closeset_obj['annotations']
return json.dumps(openset_obj, ensure_ascii=False)
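A toy check of the index pairing described in the comments above (with the default others_idx=25): key labels are the even indices 2..24, value labels the odd indices 1..23, and entries at the same list position form a key-value pair.

closeset_key_inds = list(range(2, 25, 2))    # [2, 4, ..., 24]
closeset_value_inds = list(range(1, 25, 2))  # [1, 3, ..., 23]
# Key label 2 pairs with value label 1, key 4 with value 3, and so on.
assert closeset_value_inds[closeset_key_inds.index(2)] == 1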

View File

@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .base import BaseParser
from .coco_parser import COCOTextDetAnnParser
from .funsd_parser import FUNSDTextDetAnnParser
from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
@ -10,7 +11,7 @@ from .totaltext_parser import TotaltextTextDetAnnParser
from .wildreceipt_parser import WildreceiptKIEAnnParser
__all__ = [
'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
'BaseParser', 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
'SROIETextDetAnnParser', 'NAFAnnParser'

View File

@ -1,67 +1,87 @@
# Copyright (c) OpenMMLab. All rights reserved.
from abc import abstractmethod
from functools import partial
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, List, Tuple, Union
from mmengine import track_parallel_progress
from mmocr.utils import track_parallel_progress_multi_args
class BaseParser:
"""Base class for parsing annotations.
Args:
data_root (str, optional): Path to the data root. Defaults to None.
nproc (int, optional): Number of processes. Defaults to 1.
split (str): The split of the dataset. It is usually set automatically
and users do not need to set it manually in config file in most
cases.
nproc (int): Number of processes to process the data. Defaults to 1.
It is usually set automatically and users do not need to set it
manually in config file in most cases.
"""
def __init__(self,
data_root: Optional[str] = None,
nproc: int = 1) -> None:
self.data_root = data_root
def __init__(self, split: str, nproc: int = 1) -> None:
self.nproc = nproc
self.split = split
def __call__(self, files: List[Tuple], split: str) -> List:
def __call__(self, img_paths: Union[List[str], str],
ann_paths: Union[List[str], str]) -> List[Tuple]:
"""Parse annotations.
Args:
files (List[Tuple]): A list of a tuple of
(image_path, annotation_path).
split (str): The split of the dataset.
img_paths (str or list[str]): the list of image paths or the
directory of the images.
ann_paths (str or list[str]): the list of annotation paths or the
path of the annotation file which contains all the annotations.
Returns:
List[Tuple]: A list of tuples of (img_path, instances).
"""
samples = self.parse_files(files, split)
samples = self.parse_files(img_paths, ann_paths)
return samples
def parse_files(self, files: List[Tuple], split: str) -> List[Tuple]:
def parse_files(self, img_paths: Union[List[str], str],
ann_paths: Union[List[str], str]) -> List[Tuple]:
"""Convert annotations to MMOCR format.
Args:
files (Tuple): A list of tuple of path to image and annotation.
img_paths (str or list[str]): the list of image paths or the
directory of the images.
ann_paths (str or list[str]): the list of annotation paths or the
path of the annotation file which contains all the annotations.
Returns:
List[Tuple]: A list of a tuple of (image_path, instances)
List[Tuple]: A list of tuples of (img_path, instances).
- img_path (str): The path of image file, which can be read
directly by opencv.
- instance: instance is a list of dict containing parsed
annotations, which should contain the following keys:
- 'poly' or 'box' (textdet or textspotting)
- 'text' (textspotting or textrecog)
- 'ignore' (all tasks)
"""
func = partial(self.parse_file, split=split)
samples = track_parallel_progress(func, files, nproc=self.nproc)
samples = track_parallel_progress_multi_args(
self.parse_file, (img_paths, ann_paths), nproc=self.nproc)
return samples
@abstractmethod
def parse_file(self, file: Tuple, split: str) -> Tuple:
def parse_file(self, img_path: str, ann_path: str) -> Tuple:
"""Convert annotation for a single image.
Args:
file (Tuple): A tuple of path to image and annotation
split (str): Current split.
img_path (str): The path of image.
ann_path (str): The path of annotation.
Returns:
Tuple: A tuple of (img_path, instance). Instance is a list of dict
containing parsed annotations, which should contain the
following keys:
- 'poly' or 'box' (textdet or textspotting)
- 'text' (textspotting or textrecog)
- 'ignore' (all task)
Tuple: A tuple of (img_path, instance).
- img_path (str): The path of image file, which can be read
directly by opencv.
- instance: instance is a list of dict containing parsed
annotations, which should contain the following keys:
- 'poly' or 'box' (textdet or textspotting)
- 'text' (textspotting or textrecog)
- 'ignore' (all tasks)
Examples:
An example of returned values:

View File

@ -1,6 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import Dict, Tuple
from typing import List
from mmdet.datasets.api_wrappers import COCO
@ -21,25 +21,25 @@ class COCOTextDetAnnParser(BaseParser):
"""
def __init__(self,
data_root: str = None,
split: str,
nproc: int = 1,
variant: str = 'standard') -> None:
super().__init__(nproc=nproc, data_root=data_root)
super().__init__(nproc=nproc, split=split)
assert variant in ['standard', 'cocotext', 'textocr'], \
f'variant {variant} is not supported'
self.variant = variant
def parse_files(self, files: Tuple, split: str = None) -> Dict:
def parse_files(self, img_dir: str, ann_path: str) -> List:
"""Parse single annotation."""
samples = list()
coco = COCO(files)
coco = COCO(ann_path)
if self.variant == 'cocotext' or self.variant == 'textocr':
# cocotext stores both 'train' and 'val' split in one annotation
# file, and uses the 'set' field to distinguish them.
if self.variant == 'cocotext':
for img in coco.dataset['imgs']:
if split == coco.dataset['imgs'][img]['set']:
if self.split == coco.dataset['imgs'][img]['set']:
coco.imgs[img] = coco.dataset['imgs'][img]
# textocr stores the 'train' and 'val' splits separately
elif self.variant == 'textocr':
@ -60,8 +60,6 @@ class COCOTextDetAnnParser(BaseParser):
img_info = coco.load_imgs([img_id])[0]
img_info['img_id'] = img_id
img_path = img_info['file_name']
if self.data_root is not None:
img_path = osp.join(self.data_root, img_path)
ann_ids = coco.get_ann_ids(img_ids=[img_id])
if len(ann_ids) == 0:
continue
@ -96,5 +94,6 @@ class COCOTextDetAnnParser(BaseParser):
instances.append(
dict(
poly=ann['points'], text=text, ignore=text == '.'))
samples.append((img_path, instances))
samples.append((osp.join(img_dir,
osp.basename(img_path)), instances))
return samples
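A hedged usage sketch (paths invented; assuming COCOTextDetAnnParser is exported at the package level like the other parsers in the tests below). For the 'cocotext' variant, the parser filters images by matching self.split against each image's 'set' field, so one annotation file can serve both splits:

from mmocr.datasets.preparers import COCOTextDetAnnParser

parser = COCOTextDetAnnParser(split='train', variant='cocotext')
samples = parser.parse_files('data/demo/textdet_imgs/imgs',
                             'data/demo/annotations/train.json')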

View File

@ -17,17 +17,13 @@ class FUNSDTextDetAnnParser(BaseParser):
to 1.
"""
def __init__(self, nproc: int = 1) -> None:
super().__init__(nproc=nproc)
def parse_file(self, file: Tuple, split: str) -> Tuple:
def parse_file(self, img_path: str, ann_path: str) -> Tuple:
"""Parse single annotation."""
img_file, json_file = file
instances = list()
for poly, text, ignore in self.loader(json_file):
for poly, text, ignore in self.loader(ann_path):
instances.append(dict(poly=poly, text=text, ignore=ignore))
return img_file, instances
return img_path, instances
def loader(self, file_path: str):
with open(file_path, 'r') as f:

View File

@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from typing import List, Optional, Tuple
from mmocr.utils import bbox2poly
@ -35,22 +36,21 @@ class ICDARTxtTextDetAnnParser(BaseParser):
ignore: str = '###',
format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
encoding: str = 'utf-8',
nproc: int = 1,
remove_strs: Optional[List[str]] = None,
mode: str = None) -> None:
mode: str = None,
**kwargs) -> None:
self.sep = separator
self.format = format
self.encoding = encoding
self.ignore = ignore
self.mode = mode
self.remove_strs = remove_strs
super().__init__(nproc=nproc)
super().__init__(**kwargs)
def parse_file(self, file: Tuple, split: str) -> Tuple:
def parse_file(self, img_path: str, ann_path: str) -> Tuple:
"""Parse single annotation."""
img_file, txt_file = file
instances = list()
for anno in self.loader(txt_file, self.sep, self.format,
for anno in self.loader(ann_path, self.sep, self.format,
self.encoding):
anno = list(anno.values())
if self.remove_strs is not None:
@ -66,7 +66,7 @@ class ICDARTxtTextDetAnnParser(BaseParser):
instances.append(
dict(poly=poly, text=text, ignore=text == self.ignore))
return img_file, instances
return img_path, instances
@DATA_PARSERS.register_module()
@ -97,21 +97,21 @@ class ICDARTxtTextRecogAnnParser(BaseParser):
ignore: str = '#',
format: str = 'img,text',
encoding: str = 'utf-8',
nproc: int = 1,
remove_strs: Optional[List[str]] = ['"']) -> None:
remove_strs: Optional[List[str]] = ['"'],
**kwargs) -> None:
self.sep = separator
self.format = format
self.encoding = encoding
self.ignore = ignore
self.remove_strs = remove_strs
super().__init__(nproc=nproc)
super().__init__(**kwargs)
def parse_files(self, files: str, split: str) -> List:
def parse_files(self, img_dir: str, ann_path: str) -> List:
"""Parse annotations."""
assert isinstance(files, str)
assert isinstance(ann_path, str)
samples = list()
for anno in self.loader(
file_path=files,
file_path=ann_path,
format=self.format,
encoding=self.encoding,
separator=self.sep):
@ -122,6 +122,6 @@ class ICDARTxtTextRecogAnnParser(BaseParser):
if text == self.ignore:
continue
img_name = anno['img']
samples.append((img_name, text))
samples.append((osp.join(img_dir, osp.basename(img_name)), text))
return samples

View File

@ -1,6 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
from typing import Dict, List, Tuple
from typing import List, Tuple
import numpy as np
@ -28,32 +28,28 @@ class NAFAnnParser(BaseParser):
"" (empty string) is if the field was blank
Args:
data_root (str): Path to the dataset root.
ignore (list(str)): The text of the ignored instances. Default: ['#'].
det (bool): Whether to parse the detection annotation. Default: True.
If False, the parser will consider special case in NAF dataset
where the transcription is not available.
nproc (int): Number of processes to load the data. Default: 1.
"""
def __init__(self,
data_root: str,
ignore: List[str] = ['#'],
det: bool = True,
nproc: int = 1) -> None:
**kwargs) -> None:
self.ignore = ignore
self.det = det
super().__init__(data_root=data_root, nproc=nproc)
super().__init__(**kwargs)
def parse_file(self, file: Tuple, split: str) -> Dict:
def parse_file(self, img_path: str, ann_path: str) -> Tuple:
"""Convert single annotation."""
img_file, json_file = file
instances = list()
for poly, text in self.loader(json_file):
for poly, text in self.loader(ann_path):
instances.append(
dict(poly=poly, text=text, ignore=text in self.ignore))
return img_file, instances
return img_path, instances
def loader(self, file_path: str) -> str:
"""Load the annotation of the NAF dataset.

View File

@ -31,6 +31,7 @@ class SROIETextDetAnnParser(BaseParser):
"""
def __init__(self,
split: str,
separator: str = ',',
ignore: str = '###',
format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
@ -44,16 +45,15 @@ class SROIETextDetAnnParser(BaseParser):
self.ignore = ignore
self.mode = mode
self.remove_strs = remove_strs
super().__init__(nproc=nproc)
super().__init__(nproc=nproc, split=split)
def parse_file(self, file: Tuple, split: str) -> Tuple:
def parse_file(self, img_path: str, ann_path: str) -> Tuple:
"""Parse single annotation."""
img_file, txt_file = file
instances = list()
try:
# there might be some illegal symbols in the annotation
# which cannot be parsed by loader
for anno in self.loader(txt_file, self.sep, self.format,
for anno in self.loader(ann_path, self.sep, self.format,
self.encoding):
anno = list(anno.values())
if self.remove_strs is not None:
@ -71,4 +71,4 @@ class SROIETextDetAnnParser(BaseParser):
except Exception:
pass
return img_file, instances
return img_path, instances

View File

@ -17,15 +17,13 @@ class SVTTextDetAnnParser(BaseParser):
to 1.
"""
def __init__(self, data_root: str = None, nproc: int = 1) -> None:
super().__init__(data_root=data_root, nproc=nproc)
def parse_files(self, files: str, split: str) -> List:
def parse_files(self, img_dir: str, ann_path: str) -> List:
"""Parse annotations."""
assert isinstance(files, str)
assert isinstance(ann_path, str)
samples = list()
for img_name, instance in self.loader(files):
samples.append((img_name, instance))
for img_name, instance in self.loader(ann_path):
samples.append((osp.join(img_dir,
osp.basename(img_name)), instance))
return samples
@ -45,8 +43,7 @@ class SVTTextDetAnnParser(BaseParser):
tree = ET.parse(file_path)
root = tree.getroot()
for image in root.findall('image'):
image_name = osp.join(self.data_root, 'textdet_imgs',
image.find('imageName').text)
image_name = image.find('imageName').text
instances = list()
for rectangle in image.find('taggedRectangles'):
x = int(rectangle.get('x'))

View File

@ -23,22 +23,18 @@ class TotaltextTextDetAnnParser(BaseParser):
nproc (int): Number of processes to load the data. Default: 1.
"""
def __init__(self,
data_root: str,
ignore: str = '#',
nproc: int = 1) -> None:
def __init__(self, ignore: str = '#', **kwargs) -> None:
self.ignore = ignore
super().__init__(data_root=data_root, nproc=nproc)
super().__init__(**kwargs)
def parse_file(self, file: Tuple, split: str) -> Dict:
def parse_file(self, img_path: str, ann_path: str) -> Tuple:
"""Convert single annotation."""
img_file, txt_file = file
instances = list()
for poly, text in self.loader(txt_file):
for poly, text in self.loader(ann_path):
instances.append(
dict(poly=poly, text=text, ignore=text == self.ignore))
return img_file, instances
return img_path, instances
def loader(self, file_path: str) -> str:
"""The annotation of the totaltext dataset may be stored in multiple

View File

@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from typing import Dict, Tuple
from typing import Dict, List
from mmocr.utils import list_from_file
from ..data_preparer import DATA_PARSERS
@ -30,21 +30,18 @@ class WildreceiptTextDetAnnParser(BaseParser):
to 1.
"""
def __init__(self,
data_root: str,
ignore: int = 0,
nproc: int = 1) -> None:
def __init__(self, ignore: int = 0, **kwargs) -> None:
self.ignore = ignore
super().__init__(data_root=data_root, nproc=nproc)
super().__init__(**kwargs)
def parse_files(self, files: Tuple, split: str) -> Dict:
def parse_files(self, img_dir: str, ann_path: str) -> List:
"""Parse annotations."""
closeset_lines = list_from_file(files)
closeset_lines = list_from_file(ann_path)
samples = list()
for line in closeset_lines:
instances = list()
line = json.loads(line)
img_file = osp.join(self.data_root, line['file_name'])
img_file = osp.join(img_dir, osp.basename(line['file_name']))
for anno in line['annotations']:
poly = anno['box']
text = anno['text']
@ -72,21 +69,23 @@ class WildreceiptKIEAnnParser(BaseParser):
]}
Args:
data_root (str): The root path of the dataset.
ignore (int): The label to be ignored. Defaults to 0.
nproc (int): The number of processes to parse the annotation. Defaults
to 1.
"""
def __init__(self,
data_root: str,
ignore: int = 0,
nproc: int = 1) -> None:
def __init__(self, ignore: int = 0, **kwargs) -> None:
self.ignore = ignore
super().__init__(data_root=data_root, nproc=nproc)
super().__init__(**kwargs)
def parse_files(self, files: Tuple, split: str) -> Dict:
def parse_files(self, img_dir: str, ann_path: str) -> List:
"""Parse annotations."""
closeset_lines = list_from_file(files)
closeset_lines = list_from_file(ann_path)
samples = list()
for line in closeset_lines:
json_line = json.loads(line)
img_file = osp.join(img_dir, osp.basename(json_line['file_name']))
json_line['file_name'] = img_file
samples.append(json.dumps(json_line))
return closeset_lines
return samples

View File

@ -19,6 +19,7 @@ from .polygon_utils import (boundary_iou, crop_polygon, is_poly_inside_rect,
poly_union, polys2shapely, rescale_polygon,
rescale_polygons, shapely2poly, sort_points,
sort_vertex, sort_vertex8)
from .processing import track_parallel_progress_multi_args
from .setup_env import register_all_modules
from .string_utils import StringStripper
from .transform_utils import remove_pipeline_elements
@ -48,5 +49,6 @@ __all__ = [
'OptTensor', 'ColorType', 'OptKIESampleList', 'KIESampleList',
'is_archive', 'check_integrity', 'list_files', 'get_md5', 'InstanceList',
'LabelList', 'OptInstanceList', 'OptLabelList', 'RangeType',
'remove_pipeline_elements', 'bezier2poly', 'poly2bezier'
'remove_pipeline_elements', 'bezier2poly', 'poly2bezier',
'track_parallel_progress_multi_args'
]

View File

@ -0,0 +1,67 @@
# Copyright (c) OpenMMLab. All rights reserved.
import sys
from collections.abc import Iterable
from mmengine.utils.progressbar import ProgressBar, init_pool
def track_parallel_progress_multi_args(func,
args,
nproc,
initializer=None,
initargs=None,
bar_width=50,
chunksize=1,
skip_first=False,
file=sys.stdout):
"""Track the progress of parallel task execution with a progress bar.
The built-in :mod:`multiprocessing` module is used for process pools and
tasks are done with :func:`Pool.map` or :func:`Pool.imap_unordered`.
Args:
func (callable): The function to be applied to each task.
args (tuple[Iterable]): A tuple of iterables of arguments; func is
called with one element taken from each iterable per task.
nproc (int): Process (worker) number.
initializer (None or callable): Refer to :class:`multiprocessing.Pool`
for details.
initargs (None or tuple): Refer to :class:`multiprocessing.Pool` for
details.
chunksize (int): Refer to :class:`multiprocessing.Pool` for details.
bar_width (int): Width of progress bar.
skip_first (bool): Whether to skip the first sample for each worker
when estimating fps, since the initialization step may take
longer.
Returns:
list: The task results.
"""
assert isinstance(args, tuple)
for arg in args:
assert isinstance(arg, Iterable)
assert len(set([len(arg)
for arg in args])) == 1, 'args must have same length'
task_num = len(args[0])
tasks = zip(*args)
pool = init_pool(nproc, initializer, initargs)
start = not skip_first
task_num -= nproc * chunksize * int(skip_first)
prog_bar = ProgressBar(task_num, bar_width, start, file=file)
results = []
gen = pool.starmap(func, tasks, chunksize)
for result in gen:
results.append(result)
if skip_first:
if len(results) < nproc * chunksize:
continue
elif len(results) == nproc * chunksize:
prog_bar.start()
continue
prog_bar.update()
prog_bar.file.write('\n')
pool.close()
pool.join()
return results
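A usage sketch (the function is exported via mmocr.utils, as the __init__ diff above shows): func is called with one element from each iterable per task, so the iterables must have equal length (and support len()):

def area(width, height):
    return width * height

results = track_parallel_progress_multi_args(
    area, ([2, 3, 4], [5, 6, 7]), nproc=2)
# results == [10, 18, 28]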

View File

@ -1,24 +0,0 @@
Name: Dummy Dataset
Paper:
Title: Dummy Dataset
URL: https://github.com/open-mmlab/mmocr
Venue: MMOCR
Year: 2022
BibTeX: ''
Data:
Website: https://github.com/open-mmlab/mmocr
Language:
- English
- Chinese
Scene:
- Natural Scene
Granularity:
- Word
Tasks:
- textdet
- textrecog
- textspotting
License:
Type: CC BY 4.0
Link: https://creativecommons.org/licenses/by/4.0/
Format: .txt

View File

@ -1,3 +0,0 @@
# Copyright (c) OpenMMLab. All rights reserved.
data_root = 'tests/data/preparer/dummy'
cache_path = 'tests/data/preparer/dummy'

View File

@ -0,0 +1,59 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import tempfile
import unittest
from mmocr.datasets.preparers import TextDetConfigGenerator
class TestTextDetConfigGenerator(unittest.TestCase):
def setUp(self) -> None:
self.root = tempfile.TemporaryDirectory()
def test_textdet_config_generator(self):
config_generator = TextDetConfigGenerator(
data_root=self.root.name,
dataset_name='dummy',
train_anns=[
dict(ann_file='textdet_train.json', dataset_postfix='')
],
val_anns=[],
test_anns=[
dict(ann_file='textdet_test.json', dataset_postfix='fake')
],
config_path=self.root.name,
)
cfg_path = osp.join(self.root.name, 'textdet', '_base_', 'datasets',
'dummy.py')
config_generator()
self.assertTrue(osp.exists(cfg_path))
f = open(cfg_path, 'r')
lines = ''.join(f.readlines())
self.assertEquals(
lines, (f"dummy_textdet_data_root = '{self.root.name}'\n"
'\n'
'dummy_textdet_train = dict(\n'
" type='OCRDataset',\n"
' data_root=dummy_textdet_data_root,\n'
" ann_file='textdet_train.json',\n"
' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n'
' pipeline=None)\n'
'\n'
'dummy_fake_textdet_test = dict(\n'
" type='OCRDataset',\n"
' data_root=dummy_textdet_data_root,\n'
" ann_file='textdet_test.json',\n"
' test_mode=True,\n'
' pipeline=None)\n'))
with self.assertRaises(ValueError):
TextDetConfigGenerator(
data_root=self.root.name,
dataset_name='dummy',
train_anns=[
dict(ann_file='textdet_train.json', dataset_postfix='1'),
dict(ann_file='textdet_train_1.json', dataset_postfix='1')
],
config_path=self.root.name,
)

View File

@ -0,0 +1,59 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import tempfile
import unittest
from mmocr.datasets.preparers import TextRecogConfigGenerator
class TestTextRecogConfigGenerator(unittest.TestCase):
def setUp(self) -> None:
self.root = tempfile.TemporaryDirectory()
def test_textrecog_config_generator(self):
config_generator = TextRecogConfigGenerator(
data_root=self.root.name,
dataset_name='dummy',
train_anns=[
dict(ann_file='textrecog_train.json', dataset_postfix='')
],
val_anns=[],
test_anns=[
dict(ann_file='textrecog_test.json', dataset_postfix='fake')
],
config_path=self.root.name,
)
cfg_path = osp.join(self.root.name, 'textrecog', '_base_', 'datasets',
'dummy.py')
config_generator()
self.assertTrue(osp.exists(cfg_path))
f = open(cfg_path, 'r')
lines = ''.join(f.readlines())
self.assertEquals(lines,
(f"dummy_textrecog_data_root = '{self.root.name}'\n"
'\n'
'dummy_textrecog_train = dict(\n'
" type='OCRDataset',\n"
' data_root=dummy_textrecog_data_root,\n'
" ann_file='textrecog_train.json',\n"
' pipeline=None)\n'
'\n'
'dummy_fake_textrecog_test = dict(\n'
" type='OCRDataset',\n"
' data_root=dummy_textrecog_data_root,\n'
" ann_file='textrecog_test.json',\n"
' test_mode=True,\n'
' pipeline=None)\n'))
with self.assertRaises(ValueError):
TextRecogConfigGenerator(
data_root=self.root.name,
dataset_name='dummy',
train_anns=[
dict(ann_file='textrecog_train.json', dataset_postfix='1'),
dict(
ann_file='textrecog_train_1.json', dataset_postfix='1')
],
config_path=self.root.name,
)

View File

@ -0,0 +1,64 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import tempfile
import unittest
from mmocr.datasets.preparers import TextSpottingConfigGenerator
class TestTextSpottingConfigGenerator(unittest.TestCase):
def setUp(self) -> None:
self.root = tempfile.TemporaryDirectory()
def test_textspotting_config_generator(self):
config_generator = TextSpottingConfigGenerator(
data_root=self.root.name,
dataset_name='dummy',
train_anns=[
dict(ann_file='textspotting_train.json', dataset_postfix='')
],
val_anns=[],
test_anns=[
dict(
ann_file='textspotting_test.json', dataset_postfix='fake')
],
config_path=self.root.name,
)
cfg_path = osp.join(self.root.name, 'textspotting', '_base_',
'datasets', 'dummy.py')
config_generator()
self.assertTrue(osp.exists(cfg_path))
f = open(cfg_path, 'r')
lines = ''.join(f.readlines())
self.assertEquals(
lines, (f"dummy_textspotting_data_root = '{self.root.name}'\n"
'\n'
'dummy_textspotting_train = dict(\n'
" type='OCRDataset',\n"
' data_root=dummy_textspotting_data_root,\n'
" ann_file='textspotting_train.json',\n"
' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n'
' pipeline=None)\n'
'\n'
'dummy_fake_textspotting_test = dict(\n'
" type='OCRDataset',\n"
' data_root=dummy_textspotting_data_root,\n'
" ann_file='textspotting_test.json',\n"
' test_mode=True,\n'
' pipeline=None)\n'))
with self.assertRaises(ValueError):
TextSpottingConfigGenerator(
data_root=self.root.name,
dataset_name='dummy',
train_anns=[
dict(
ann_file='textspotting_train.json',
dataset_postfix='1'),
dict(
ann_file='textspotting_train_1.json',
dataset_postfix='1')
],
config_path=self.root.name,
)

View File

@ -1,15 +1,60 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import unittest
from mmocr.datasets.preparers.data_preparer import DatasetPreparer
from mmengine import Config
from mmocr.datasets.preparers import DatasetPreparer
from mmocr.datasets.preparers.data_preparer import (CFG_GENERATORS,
DATA_DUMPERS,
DATA_GATHERERS,
DATA_OBTAINERS,
DATA_PACKERS, DATA_PARSERS)
class Fake:
def __init__(self, *args, **kwargs):
pass
def __call__(self, *args, **kwargs):
return None, None
DATA_OBTAINERS.register_module(module=Fake)
DATA_GATHERERS.register_module(module=Fake)
DATA_PARSERS.register_module(module=Fake)
DATA_DUMPERS.register_module(module=Fake)
DATA_PACKERS.register_module(module=Fake)
CFG_GENERATORS.register_module(module=Fake)
class TestDataPreparer(unittest.TestCase):
def setUp(self) -> None:
self.cfg_path = 'tests/data/preparer'
self.dataset_name = 'dummy'
def _create_config(self):
cfg_path = 'config.py'
cfg = ''
cfg += "data_root = ''\n"
cfg += 'train_preparer=dict(\n'
cfg += ' obtainer=dict(type="Fake"),\n'
cfg += ' gatherer=dict(type="Fake"),\n'
cfg += ' parser=dict(type="Fake"),\n'
cfg += ' packer=dict(type="Fake"),\n'
cfg += ' dumper=dict(type="Fake"),\n'
cfg += ')\n'
cfg += 'test_preparer=dict(\n'
cfg += ' obtainer=dict(type="Fake"),\n'
cfg += ')\n'
cfg += 'cfg_generator=dict(type="Fake")\n'
cfg += f"delete = ['{cfg_path}']\n"
with open(cfg_path, 'w') as f:
f.write(cfg)
return cfg_path
def test_dataset_preparer(self):
preparer = DatasetPreparer(self.cfg_path, self.dataset_name, 'textdet')
preparer()
cfg_path = self._create_config()
cfg = Config.fromfile(cfg_path)
preparer = DatasetPreparer.from_file(cfg)
preparer.run()
self.assertFalse(osp.exists(cfg_path))

View File

@ -21,8 +21,8 @@ class TestDumpers(unittest.TestCase):
task_name='textdet',
category=[dict(id=0, name='text')]))
dumper = JsonDumper(task)
dumper.dump(fake_data, self.root.name, split)
dumper = JsonDumper(task, split, self.root.name)
dumper.dump(fake_data)
with open(osp.join(self.root.name, f'{task}_{split}.json'), 'r') as f:
data = json.load(f)
self.assertEqual(data, fake_data)
@ -31,8 +31,8 @@ class TestDumpers(unittest.TestCase):
task, split = 'kie', 'train'
fake_data = ['test1', 'test2']
dumper = WildreceiptOpensetDumper(task)
dumper.dump(fake_data, self.root.name, split)
dumper = WildreceiptOpensetDumper(task, split, self.root.name)
dumper.dump(fake_data)
with open(osp.join(self.root.name, f'openset_{split}.txt'), 'r') as f:
data = f.read().splitlines()
self.assertEqual(data, fake_data)

View File

@ -0,0 +1,24 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import unittest
from mmocr.datasets.preparers.gatherers import MonoGatherer
class TestMonoGatherer(unittest.TestCase):
def test_mono_text_gatherer(self):
data_root = 'dummy'
img_dir = 'dummy_img'
ann_dir = 'dummy_ann'
ann_name = 'dummy_ann.json'
split = 'train'
gatherer = MonoGatherer(
data_root=data_root,
img_dir=img_dir,
ann_dir=ann_dir,
ann_name=ann_name,
split=split)
gather_img_dir, ann_path = gatherer()
self.assertEqual(gather_img_dir, osp.join(data_root, img_dir))
self.assertEqual(ann_path, osp.join(data_root, ann_dir, ann_name))

View File

@ -0,0 +1,42 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import tempfile
import unittest
import cv2
import numpy as np
from mmocr.datasets.preparers.gatherers import PairGatherer
class TestPairGatherer(unittest.TestCase):
def test_pair_text_gatherer(self):
root = tempfile.TemporaryDirectory()
data_root = root.name
img_dir = 'dummy_img'
ann_dir = 'dummy_ann'
split = 'train'
img = np.random.randint(0, 100, size=(100, 100, 3))
os.makedirs(osp.join(data_root, img_dir))
os.makedirs(osp.join(data_root, ann_dir))
for i in range(10):
cv2.imwrite(osp.join(data_root, img_dir, f'img_{i}.jpg'), img)
f = open(osp.join(data_root, ann_dir, f'img_{i}.txt'), 'w')
f.close()
f = open(osp.join(data_root, ann_dir, 'img_10.mmocr'), 'w')
f.close()
gatherer = PairGatherer(
data_root=data_root,
img_dir=img_dir,
ann_dir=ann_dir,
split=split,
img_suffixes=['.jpg'],
rule=[r'img_(\d+)\.([jJ][pP][gG])', r'img_\1.txt'])
img_list, ann_list = gatherer()
self.assertEqual(len(img_list), 10)
self.assertEqual(len(ann_list), 10)
self.assertNotIn(
osp.join(data_root, ann_dir, 'img_10.mmocr'), ann_list)
root.cleanup()

View File

@ -0,0 +1,62 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import tempfile
import unittest
import cv2
import numpy as np
from mmocr.datasets.preparers import TextDetPacker
class TestTextDetPacker(unittest.TestCase):
def setUp(self) -> None:
self.root = tempfile.TemporaryDirectory()
img = np.random.randint(0, 255, (30, 20, 3), dtype=np.uint8)
cv2.imwrite(osp.join(self.root.name, 'test_img.jpg'), img)
self.instance = [{
'poly': [0, 0, 0, 10, 10, 20, 20, 0],
'ignore': False
}, {
'box': [0, 0, 10, 20],
'ignore': False
}]
self.img_path = osp.join(self.root.name, 'test_img.jpg')
self.sample = (self.img_path, self.instance)
def test_pack_instance(self):
packer = TextDetPacker(data_root=self.root.name, split='test')
instance = packer.pack_instance(self.sample)
self.assertEquals(instance['img_path'], 'test_img.jpg')
self.assertEquals(instance['height'], 30)
self.assertEquals(instance['width'], 20)
self.assertEquals(instance['instances'][0]['polygon'],
[0, 0, 0, 10, 10, 20, 20, 0])
self.assertEquals(instance['instances'][0]['bbox'],
[float(x) for x in [0, 0, 20, 20]])
self.assertEquals(instance['instances'][0]['bbox_label'], 0)
self.assertEquals(instance['instances'][0]['ignore'], False)
self.assertEquals(instance['instances'][1]['polygon'],
[0.0, 0.0, 10.0, 0.0, 10.0, 20.0, 0.0, 20.0])
self.assertEquals(instance['instances'][1]['bbox'],
[float(x) for x in [0, 0, 10, 20]])
self.assertEquals(instance['instances'][1]['bbox_label'], 0)
self.assertEquals(instance['instances'][1]['ignore'], False)
def test_add_meta(self):
packer = TextDetPacker(data_root=self.root.name, split='test')
instance = packer.pack_instance(self.sample)
meta = packer.add_meta(instance)
self.assertDictEqual(
meta['metainfo'], {
'dataset_type': 'TextDetDataset',
'task_name': 'textdet',
'category': [{
'id': 0,
'name': 'text'
}]
})
def tearDown(self) -> None:
self.root.cleanup()

View File

@ -0,0 +1,79 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import tempfile
import unittest
import cv2
import numpy as np
from mmocr.datasets.preparers import TextRecogCropPacker, TextRecogPacker
class TestTextRecogPacker(unittest.TestCase):
def test_pack_instance(self):
packer = TextRecogPacker(data_root='', split='test')
sample = ('test.jpg', 'text')
results = packer.pack_instance(sample)
self.assertDictEqual(
results,
dict(
img_path=osp.join('textrecog_imgs', 'test', 'test.jpg'),
instances=[dict(text='text')]))
def test_add_meta(self):
packer = TextRecogPacker(data_root='', split='test')
sample = [dict(img_path='test.jpg', instances=[dict(text='text')])]
results = packer.add_meta(sample)
self.assertDictEqual(
results,
dict(
metainfo=dict(
dataset_type='TextRecogDataset', task_name='textrecog'),
data_list=sample))
class TestTextRecogCropPacker(unittest.TestCase):
def setUp(self) -> None:
self.root = tempfile.TemporaryDirectory()
img = np.random.randint(0, 255, (30, 40, 3), dtype=np.uint8)
cv2.imwrite(osp.join(self.root.name, 'test_img.jpg'), img)
self.instance = [{
'poly': [0, 0, 0, 10, 10, 20, 20, 0],
'ignore': False,
'text': 'text1'
}, {
'box': [0, 0, 10, 20],
'ignore': False,
'text': 'text2'
}]
self.img_path = osp.join(self.root.name, 'test_img.jpg')
self.sample = (self.img_path, self.instance)
def test_pack_instance(self):
packer = TextRecogCropPacker(data_root=self.root.name, split='test')
instance = packer.pack_instance(self.sample)
self.assertListEqual(instance, [
dict(
img_path=osp.join('textrecog_imgs', 'test', 'test_img_0.jpg'),
instances=[dict(text='text1')]),
dict(
img_path=osp.join('textrecog_imgs', 'test', 'test_img_1.jpg'),
instances=[dict(text='text2')])
])
def test_add_meta(self):
packer = TextRecogCropPacker(data_root=self.root.name, split='test')
instance = packer.pack_instance(self.sample)
results = packer.add_meta([instance])
self.assertDictEqual(
results,
dict(
metainfo=dict(
dataset_type='TextRecogDataset', task_name='textrecog'),
data_list=instance))
def tearDown(self) -> None:
self.root.cleanup()

View File

@ -0,0 +1,69 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import tempfile
import unittest
import cv2
import numpy as np
from mmocr.datasets.preparers import TextSpottingPacker
class TestTextSpottingPacker(unittest.TestCase):
def setUp(self) -> None:
self.root = tempfile.TemporaryDirectory()
img = np.random.randint(0, 255, (30, 20, 3), dtype=np.uint8)
cv2.imwrite(osp.join(self.root.name, 'test_img.jpg'), img)
self.instance = [{
'poly': [0, 0, 0, 10, 10, 20, 20, 0],
'ignore': False,
'text': 'text1'
}, {
'box': [0, 0, 10, 20],
'ignore': False,
'text': 'text2'
}]
self.img_path = osp.join(self.root.name, 'test_img.jpg')
self.sample = (self.img_path, self.instance)
def test_pack_instance(self):
packer = TextSpottingPacker(data_root=self.root.name, split='test')
instance = packer.pack_instance(self.sample)
self.assertEquals(instance['img_path'], 'test_img.jpg')
self.assertEquals(instance['height'], 30)
self.assertEquals(instance['width'], 20)
self.assertEquals(instance['instances'][0]['polygon'],
[0, 0, 0, 10, 10, 20, 20, 0])
self.assertEquals(instance['instances'][0]['bbox'],
[float(x) for x in [0, 0, 20, 20]])
self.assertEquals(instance['instances'][0]['bbox_label'], 0)
self.assertEquals(instance['instances'][0]['ignore'], False)
self.assertEquals(instance['instances'][0]['text'], 'text1')
self.assertEquals(instance['instances'][1]['polygon'],
[0.0, 0.0, 10.0, 0.0, 10.0, 20.0, 0.0, 20.0])
self.assertEquals(instance['instances'][1]['bbox'],
[float(x) for x in [0, 0, 10, 20]])
self.assertEquals(instance['instances'][1]['bbox_label'], 0)
self.assertEquals(instance['instances'][1]['ignore'], False)
self.assertEquals(instance['instances'][1]['text'], 'text2')
def test_add_meta(self):
packer = TextSpottingPacker(data_root=self.root.name, split='test')
instance = packer.pack_instance(self.sample)
meta = packer.add_meta(instance)
self.assertDictEqual(
meta, {
'metainfo': {
'dataset_type': 'TextSpottingDataset',
'task_name': 'textspotting',
'category': [{
'id': 0,
'name': 'text'
}]
},
'data_list': instance
})
def tearDown(self) -> None:
self.root.cleanup()

View File

@ -0,0 +1,56 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
import tempfile
import unittest
from mmocr.datasets.preparers import FUNSDTextDetAnnParser
class TestFUNSDTextDetAnnParser(unittest.TestCase):
def setUp(self) -> None:
self.root = tempfile.TemporaryDirectory()
def _create_fake_sample(self):
fake_sample = {
'form': [{
'box': [91, 279, 123, 294],
'text': 'Date:',
'label': 'question',
'words': [{
'box': [91, 279, 123, 294],
'text': 'Date:'
}],
'linking': [[0, 16]],
'id': 0
}, {
'box': [92, 310, 130, 324],
'text': 'From:',
'label': 'question',
'words': [{
'box': [92, 310, 130, 324],
'text': ''
}],
'linking': [[1, 22]],
'id': 1
}]
}
ann_path = osp.join(self.root.name, 'funsd.json')
with open(ann_path, 'w') as f:
json.dump(fake_sample, f)
return ann_path
def test_textdet_parsers(self):
ann_path = self._create_fake_sample()
parser = FUNSDTextDetAnnParser(split='train')
_, instances = parser.parse_file('fake.jpg', ann_path)
self.assertEqual(len(instances), 2)
self.assertEqual(instances[0]['text'], 'Date:')
self.assertEqual(instances[0]['ignore'], False)
self.assertEqual(instances[1]['ignore'], True)
self.assertListEqual(instances[0]['poly'],
[91, 279, 123, 279, 123, 294, 91, 294])
def tearDown(self) -> None:
self.root.cleanup()
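The second form entry above is deliberately given an empty word transcription, and the test expects it to be ignored. The rule the parser is being held to is that a word without usable text cannot supervise detection, i.e. something along the lines of this sketch (not the exact implementation):

# Hypothetical distillation of the FUNSD ignore rule exercised above.
def should_ignore(word_text: str) -> bool:
    return len(word_text.strip()) == 0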

View File

@ -35,9 +35,9 @@ class TestIC15Parsers(unittest.TestCase):
def test_textdet_parsers(self):
file = self._create_dummy_ic15_det()
parser = ICDARTxtTextDetAnnParser()
parser = ICDARTxtTextDetAnnParser(split='train')
img, instances = parser.parse_file(file, 'train')
img, instances = parser.parse_file(*file)
self.assertEqual(img, file[0])
self.assertEqual(len(instances), 4)
self.assertIn('poly', instances[0])
@ -48,12 +48,15 @@ class TestIC15Parsers(unittest.TestCase):
self.assertEqual(instances[3]['text'], '100,000')
def test_textrecog_parsers(self):
parser = ICDARTxtTextRecogAnnParser()
parser = ICDARTxtTextRecogAnnParser(split='train')
file = self._create_dummy_ic15_recog()
samples = parser.parse_files(file, 'train')
samples = parser.parse_files(self.root.name, file)
self.assertEqual(len(samples), 4)
img, text = samples[0]
self.assertEqual(img, 'word_1.png')
self.assertEqual(img, osp.join(self.root.name, 'word_1.png'))
self.assertEqual(text, 'Genaxis Theatre')
img, text = samples[3]
self.assertEqual(text, '62-,03')
def tearDown(self) -> None:
self.root.cleanup()
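This hunk illustrates the parser interface change made throughout this PR: the split is now fixed at construction time, parse_file takes the image path and annotation path as separate arguments, and parse_files takes the image directory first and returns absolute image paths. In sketch form, with ann_file standing in for a real annotation file:

# New calling convention, as exercised by the updated tests above.
parser = ICDARTxtTextRecogAnnParser(split='train')
samples = parser.parse_files('data/icdar2015/textrecog_imgs/train', ann_file)
# each sample is (absolute_img_path, text)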

View File

@ -0,0 +1,81 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
import tempfile
import unittest
from mmocr.datasets.preparers import NAFAnnParser
class TestNAFAnnParser(unittest.TestCase):
def setUp(self) -> None:
self.root = tempfile.TemporaryDirectory()
def _create_fake_sample(self):
fake_sample = {
    'fieldBBs': [{
        'poly_points': [[1357, 322], [1636, 324], [1636, 402], [1357, 400]],
        'type': 'field',
        'id': 'f0',
        'isBlank': 1
    }, {
        'poly_points': [[1831, 352], [1908, 353], [1908, 427], [1830, 427]],
        'type': 'blank',
        'id': 'f1',
        'isBlank': 1
    }],
    'textBBs': [{
        'poly_points': [[1388, 80], [2003, 82], [2003, 133], [1388, 132]],
        'type': 'text',
        'id': 't0'
    }, {
        'poly_points': [[1065, 366], [1320, 366], [1320, 413], [1065, 412]],
        'type': 'text',
        'id': 't1'
    }],
    'imageFilename': '004173988_00005.jpg',
    'transcriptions': {
        'f0': '7/24',
        'f1': '9',
        't0': 'REGISTRY RETURN RECEIPT.',
        't1': 'Date of delivery',
    }
}
ann_path = osp.join(self.root.name, 'naf.json')
with open(ann_path, 'w') as f:
json.dump(fake_sample, f)
return ann_path
def test_parsers(self):
ann_path = self._create_fake_sample()
parser = NAFAnnParser(split='train')
_, instances = parser.parse_file('fake.jpg', ann_path)
self.assertEqual(len(instances), 3)
self.assertEqual(instances[0]['ignore'], False)
self.assertEqual(instances[1]['ignore'], False)
self.assertListEqual(instances[2]['poly'],
[1357, 322, 1636, 324, 1636, 402, 1357, 400])
parser = NAFAnnParser(split='train', det=False)
_, instances = parser.parse_file('fake.jpg', ann_path)
self.assertEqual(len(instances), 2)
self.assertEqual(instances[0]['text'], '7/24')
def tearDown(self) -> None:
self.root.cleanup()

View File

@ -0,0 +1,43 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import tempfile
import unittest
from mmocr.datasets.preparers import SROIETextDetAnnParser
from mmocr.utils import list_to_file
class TestSROIETextDetAnnParser(unittest.TestCase):
def setUp(self) -> None:
self.root = tempfile.TemporaryDirectory()
def _create_dummy_sroie_det(self):
fake_anno = [
'114,54,326,54,326,92,114,92,TAN CHAY YEE',
'60,119,300,119,300,136,60,136,###',
'100,139,267,139,267,162,100,162,ROC NO: 538358-H',
'83,163,277,163,277,183,83,183,NO 2 & 4, JALAN BAYU 4,',
]
ann_file = osp.join(self.root.name, 'sroie_det.txt')
list_to_file(ann_file, fake_anno)
return (osp.join(self.root.name, 'sroie_det.jpg'), ann_file)
def test_textdet_parsers(self):
file = self._create_dummy_sroie_det()
parser = SROIETextDetAnnParser(split='train')
img, instances = parser.parse_file(*file)
self.assertEqual(img, file[0])
self.assertEqual(len(instances), 4)
self.assertIn('poly', instances[0])
self.assertIn('text', instances[0])
self.assertIn('ignore', instances[0])
self.assertEqual(instances[0]['text'], 'TAN CHAY YEE')
self.assertEqual(instances[1]['ignore'], True)
self.assertEqual(instances[3]['text'], 'NO 2 & 4, JALAN BAYU 4,')
self.assertListEqual(instances[2]['poly'],
[100, 139, 267, 139, 267, 162, 100, 162])
def tearDown(self) -> None:
self.root.cleanup()
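As with the ICDAR-style parsers, SROIE marks illegible regions with the '###' token, and the assertion on instances[1] checks that the parser maps it to ignore=True. Roughly, assuming '###' is the parser's default ignore token:

# Hypothetical; the actual token may be configurable.
instance = dict(poly=poly, text=text, ignore=text == '###')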

View File

@ -38,11 +38,11 @@ class TestSVTParsers(unittest.TestCase):
return ann_file
def test_textdet_parsers(self):
parser = SVTTextDetAnnParser(self.root.name)
parser = SVTTextDetAnnParser(split='train')
file = self._create_dummy_svt_det()
samples = parser.parse_files(file, 'train')
samples = parser.parse_files(self.root.name, file)
self.assertEqual(len(samples), 1)
self.assertEqual(osp.basename(samples[0][0]), 'test.jpg')
self.assertEqual(samples[0][0], osp.join(self.root.name, 'test.jpg'))
self.assertEqual(len(samples[0][1]), 3)
self.assertEqual(samples[0][1][0]['text'], 'living')
self.assertEqual(samples[0][1][1]['text'], 'room')
@ -50,3 +50,6 @@ class TestSVTParsers(unittest.TestCase):
self.assertEqual(samples[0][1][0]['poly'],
[375, 253, 611, 253, 611, 328, 375, 328])
self.assertEqual(samples[0][1][0]['ignore'], False)
def tearDown(self) -> None:
self.root.cleanup()

View File

@ -24,9 +24,9 @@ class TestTTParsers(unittest.TestCase):
return (osp.join(self.root.name, 'tt_det.jpg'), ann_file)
def test_textdet_parsers(self):
parser = TotaltextTextDetAnnParser(self.root.name)
parser = TotaltextTextDetAnnParser(split='train')
file = self._create_dummy_tt_det()
img, instances = parser.parse_file(file, 'train')
img, instances = parser.parse_file(*file)
self.assertEqual(img, file[0])
self.assertEqual(len(instances), 3)
self.assertIn('poly', instances[0])
@ -34,3 +34,6 @@ class TestTTParsers(unittest.TestCase):
self.assertIn('ignore', instances[0])
self.assertEqual(instances[0]['text'], 'PERUNDING')
self.assertEqual(instances[2]['ignore'], True)
def tearDown(self) -> None:
self.root.cleanup()

View File

@ -39,8 +39,8 @@ class TestWildReceiptParsers(unittest.TestCase):
list_to_file(self.anno, fake_sample)
def test_textdet_parsers(self):
parser = WildreceiptTextDetAnnParser(self.root.name)
samples = parser.parse_files(self.anno, 'train')
parser = WildreceiptTextDetAnnParser(split='train')
samples = parser.parse_files(self.root.name, self.anno)
self.assertEqual(len(samples), 1)
self.assertEqual(osp.basename(samples[0][0]), 'test.jpg')
instances = samples[0][1]
@ -52,6 +52,9 @@ class TestWildReceiptParsers(unittest.TestCase):
self.assertEqual(instances[1]['ignore'], True)
def test_kie_parsers(self):
parser = WildreceiptKIEAnnParser(self.root.name)
samples = parser.parse_files(self.anno, 'train')
parser = WildreceiptKIEAnnParser(split='train')
samples = parser.parse_files(self.root.name, self.anno)
self.assertEqual(len(samples), 1)
def tearDown(self) -> None:
self.root.cleanup()

View File

@ -0,0 +1,30 @@
# Copyright (c) OpenMMLab. All rights reserved.
import unittest
from mmocr.utils import track_parallel_progress_multi_args
def func(a, b):
return a + b
class TestProcessing(unittest.TestCase):
def test_track_parallel_progress_multi_args(self):
args = ([1, 2, 3], [4, 5, 6])
results = track_parallel_progress_multi_args(func, args, nproc=1)
self.assertEqual(results, [5, 7, 9])
results = track_parallel_progress_multi_args(func, args, nproc=2)
self.assertEqual(results, [5, 7, 9])
with self.assertRaises(AssertionError):
track_parallel_progress_multi_args(func, 1, nproc=1)
with self.assertRaises(AssertionError):
track_parallel_progress_multi_args(func, ([1, 2], 1), nproc=1)
with self.assertRaises(AssertionError):
track_parallel_progress_multi_args(
func, ([1, 2], [1, 2, 3]), nproc=1)
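The test fixes the semantics of the new utility: args must be a tuple of equal-length lists, which are zipped into per-call argument groups and mapped through func, optionally across nproc workers. A pure-Python sketch of the asserted behaviour (parallelism and progress tracking omitted):

# Behavioural sketch only; the real helper delegates to mmengine's progress trackers.
def multi_args_sketch(func, args, nproc=1):
    assert isinstance(args, tuple) and all(isinstance(a, list) for a in args)
    assert len({len(a) for a in args}) == 1, 'argument lists must be equally long'
    return [func(*group) for group in zip(*args)]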

View File

@ -1,8 +1,11 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os.path as osp
import time
import warnings
from mmengine import Config
from mmocr.datasets.preparers import DatasetPreparer
@ -21,6 +24,11 @@ def parse_args():
choices=['textdet', 'textrecog', 'textspotting', 'kie'],
help='Task type. Options are "textdet", "textrecog", "textspotting"'
' and "kie".')
parser.add_argument(
'--splits',
default=['train', 'test', 'val'],
help='A list of the splits to prepare.',
nargs='+')
parser.add_argument(
'--overwrite-cfg',
action='store_true',
@ -36,6 +44,35 @@ def parse_args():
return args
def parse_meta(task: str, meta_path: str) -> None:
"""Parse meta file.
Args:
    task (str): Task name, e.g. "textdet".
    meta_path (str): Path to the dataset's metafile.
"""
try:
meta = Config.fromfile(meta_path)
except FileNotFoundError:
return
assert task in meta['Data']['Tasks'], \
f'Task {task} not supported!'
# License related
if meta['Data']['License']['Type']:
print(f"\033[1;33;40mDataset Name: {meta['Name']}")
print(f"License Type: {meta['Data']['License']['Type']}")
print(f"License Link: {meta['Data']['License']['Link']}")
print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m")
print('\033[1;31;43mMMOCR does not own the dataset. Using this '
'dataset you must accept the license provided by the owners, '
'and cite the corresponding papers appropriately.')
print('If you do not agree with the above license, please cancel '
'the progress immediately by pressing ctrl+c. Otherwise, '
'you are deemed to accept the terms and conditions.\033[0m')
for i in range(5):
print(f'{5-i}...')
time.sleep(1)
def main():
args = parse_args()
for dataset in args.datasets:
@ -43,13 +80,18 @@ def main():
warnings.warn(f'{dataset} is not supported yet. Please check '
'dataset zoo for supported datasets.')
continue
preparer = DatasetPreparer(
cfg_path=args.dataset_zoo_path,
dataset_name=dataset,
task=args.task,
nproc=args.nproc,
overwrite_cfg=args.overwrite_cfg)
preparer()
meta_path = osp.join(args.dataset_zoo_path, dataset, 'metafile.yml')
parse_meta(args.task, meta_path)
cfg_path = osp.join(args.dataset_zoo_path, dataset, args.task + '.py')
cfg = Config.fromfile(cfg_path)
if args.overwrite_cfg and cfg.get('config_generator',
None) is not None:
cfg.config_generator.overwrite = args.overwrite_cfg
cfg.nproc = args.nproc
cfg.task = args.task
cfg.dataset_name = dataset
preparer = DatasetPreparer.from_file(cfg)
preparer.run(args.splits)
if __name__ == '__main__':
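Under the refactor, the tool loads the dataset-zoo config for the chosen task, patches in the runtime options (nproc, task, dataset_name, overwrite flag) and delegates to DatasetPreparer.from_file, so a run is driven entirely by the config. Assuming the repo's usual tool path and the --nproc flag implied by args.nproc above, an invocation looks like:

python tools/dataset_converters/prepare_dataset.py cocotextv2 --task textdet --splits train val --nproc 8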