diff --git a/.dev_scripts/covignore.cfg b/.dev_scripts/covignore.cfg
index 213afd4b..00ec54b0 100644
--- a/.dev_scripts/covignore.cfg
+++ b/.dev_scripts/covignore.cfg
@@ -13,3 +13,6 @@ mmocr/models/textdet/detectors/mmdet_wrapper.py
# It will be removed after KieVisualizer and TextSpotterVisualizer
mmocr/visualization/visualize.py
+
+# Add tests for data preparers later
+mmocr/datasets/preparers
diff --git a/configs/kie/_base_/datasets/wildreceipt-openset.py b/configs/kie/_base_/datasets/wildreceipt-openset.py
index 33274a7f..f8251283 100644
--- a/configs/kie/_base_/datasets/wildreceipt-openset.py
+++ b/configs/kie/_base_/datasets/wildreceipt-openset.py
@@ -1,4 +1,4 @@
-wildreceipt_openset_data_root = 'data/kie/wildreceipt/'
+wildreceipt_openset_data_root = 'data/wildreceipt/'
wildreceipt_openset_train = dict(
type='WildReceiptDataset',
diff --git a/configs/kie/_base_/datasets/wildreceipt.py b/configs/kie/_base_/datasets/wildreceipt.py
index b266c2e0..9c1122ed 100644
--- a/configs/kie/_base_/datasets/wildreceipt.py
+++ b/configs/kie/_base_/datasets/wildreceipt.py
@@ -1,4 +1,4 @@
-wildreceipt_data_root = 'data/kie/wildreceipt/'
+wildreceipt_data_root = 'data/wildreceipt/'
wildreceipt_train = dict(
type='WildReceiptDataset',
diff --git a/configs/textdet/_base_/datasets/icdar2015.py b/configs/textdet/_base_/datasets/icdar2015.py
index 6553d76a..41893ce9 100644
--- a/configs/textdet/_base_/datasets/icdar2015.py
+++ b/configs/textdet/_base_/datasets/icdar2015.py
@@ -1,17 +1,15 @@
-ic15_det_data_root = 'data/det/icdar2015'
+ic15_det_data_root = 'data/icdar2015'
ic15_det_train = dict(
type='OCRDataset',
data_root=ic15_det_data_root,
- ann_file='instances_training.json',
- data_prefix=dict(img_path='imgs/'),
+ ann_file='textdet_train.json',
filter_cfg=dict(filter_empty_gt=True, min_size=32),
pipeline=None)
ic15_det_test = dict(
type='OCRDataset',
data_root=ic15_det_data_root,
- ann_file='instances_test.json',
- data_prefix=dict(img_path='imgs/'),
+ ann_file='textdet_test.json',
test_mode=True,
pipeline=None)
diff --git a/configs/textdet/_base_/datasets/totaltext.py b/configs/textdet/_base_/datasets/totaltext.py
new file mode 100644
index 00000000..4884d297
--- /dev/null
+++ b/configs/textdet/_base_/datasets/totaltext.py
@@ -0,0 +1,15 @@
+tt_det_data_root = 'data/totaltext'
+
+tt_det_train = dict(
+ type='OCRDataset',
+ data_root=tt_det_data_root,
+ ann_file='textdet_train.json',
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=None)
+
+tt_det_test = dict(
+ type='OCRDataset',
+ data_root=tt_det_data_root,
+ ann_file='textdet_test.json',
+ test_mode=True,
+ pipeline=None)
diff --git a/configs/textrecog/_base_/datasets/icdar2015.py b/configs/textrecog/_base_/datasets/icdar2015.py
index facdbd10..b0d54282 100644
--- a/configs/textrecog/_base_/datasets/icdar2015.py
+++ b/configs/textrecog/_base_/datasets/icdar2015.py
@@ -1,15 +1,15 @@
-ic15_rec_data_root = 'data/rec/icdar_2015/'
+ic15_rec_data_root = 'data/icdar2015/'
ic15_rec_train = dict(
type='OCRDataset',
data_root=ic15_rec_data_root,
- ann_file='train_labels.json',
+ ann_file='textrecog_train.json',
test_mode=False,
pipeline=None)
ic15_rec_test = dict(
type='OCRDataset',
data_root=ic15_rec_data_root,
- ann_file='test_labels.json',
+ ann_file='textrecog_test.json',
test_mode=True,
pipeline=None)
diff --git a/configs/textrecog/_base_/datasets/totaltext.py b/configs/textrecog/_base_/datasets/totaltext.py
new file mode 100644
index 00000000..b1e10e75
--- /dev/null
+++ b/configs/textrecog/_base_/datasets/totaltext.py
@@ -0,0 +1,15 @@
+tt_rec_data_root = 'data/totaltext/'
+
+tt_rec_train = dict(
+ type='OCRDataset',
+ data_root=tt_rec_data_root,
+ ann_file='textrecog_train.json',
+ test_mode=False,
+ pipeline=None)
+
+tt_rec_test = dict(
+ type='OCRDataset',
+ data_root=tt_rec_data_root,
+ ann_file='textrecog_test.json',
+ test_mode=True,
+ pipeline=None)
diff --git a/dataset_zoo/icdar2015/metafile.yml b/dataset_zoo/icdar2015/metafile.yml
new file mode 100644
index 00000000..fa4c24db
--- /dev/null
+++ b/dataset_zoo/icdar2015/metafile.yml
@@ -0,0 +1,29 @@
+Name: 'Incidental Scene Text IC15'
+Paper:
+ Title: ICDAR 2015 Competition on Robust Reading
+ URL: https://rrc.cvc.uab.es/files/short_rrc_2015.pdf
+ Venue: ICDAR
+ Year: '2015'
+ BibTeX: '@inproceedings{karatzas2015icdar,
+ title={ICDAR 2015 competition on robust reading},
+ author={Karatzas, Dimosthenis and Gomez-Bigorda, Lluis and Nicolaou, Anguelos and Ghosh, Suman and Bagdanov, Andrew and Iwamura, Masakazu and Matas, Jiri and Neumann, Lukas and Chandrasekhar, Vijay Ramaseshan and Lu, Shijian and others},
+ booktitle={2015 13th international conference on document analysis and recognition (ICDAR)},
+ pages={1156--1160},
+ year={2015},
+ organization={IEEE}}'
+Data:
+ Website: https://rrc.cvc.uab.es/?ch=4
+ Language:
+ - English
+ Scene:
+ - Natural Scene
+ Granularity:
+ - Word
+ Tasks:
+ - textdet
+ - textrecog
+ - textspotting
+ License:
+ Type: CC BY 4.0
+ Link: https://creativecommons.org/licenses/by/4.0/
+ Format: .txt
diff --git a/dataset_zoo/icdar2015/sample_anno.md b/dataset_zoo/icdar2015/sample_anno.md
new file mode 100644
index 00000000..73b37c3e
--- /dev/null
+++ b/dataset_zoo/icdar2015/sample_anno.md
@@ -0,0 +1,19 @@
+**Text Detection**
+
+```text
+# x1,y1,x2,y2,x3,y3,x4,y4,trans
+
+377,117,463,117,465,130,378,130,Genaxis Theatre
+493,115,519,115,519,131,493,131,[06]
+374,155,409,155,409,170,374,170,###
+```
+
+**Text Recognition**
+
+```text
+# img_name, "text"
+
+word_1.png, "Genaxis Theatre"
+word_2.png, "[06]"
+word_3.png, "62-03"
+```
diff --git a/dataset_zoo/icdar2015/textdet.py b/dataset_zoo/icdar2015/textdet.py
new file mode 100644
index 00000000..3dfa6f76
--- /dev/null
+++ b/dataset_zoo/icdar2015/textdet.py
@@ -0,0 +1,51 @@
+data_root = 'data/icdar2015'
+cache_path = 'data/cache'
+
+data_obtainer = dict(
+ type='NaiveDataObtainer',
+ cache_path=cache_path,
+ data_root=data_root,
+ files=[
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/ch4_training_images.zip',
+ save_name='ic15_textdet_train_img.zip',
+ md5='c51cbace155dcc4d98c8dd19d378f30d',
+ split=['train'],
+ content=['image'],
+ mapping=[['ic15_textdet_train_img', 'textdet_imgs/train']]),
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip',
+ save_name='ic15_textdet_test_img.zip',
+ md5='97e4c1ddcf074ffcc75feff2b63c35dd',
+ split=['test'],
+ content=['image'],
+ mapping=[['ic15_textdet_test_img', 'textdet_imgs/test']]),
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/'
+ 'ch4_training_localization_transcription_gt.zip',
+ save_name='ic15_textdet_train_gt.zip',
+ md5='3bfaf1988960909014f7987d2343060b',
+ split=['train'],
+ content=['annotation'],
+ mapping=[['ic15_textdet_train_gt', 'annotations/train']]),
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/'
+ 'Challenge4_Test_Task4_GT.zip',
+ save_name='ic15_textdet_test_gt.zip',
+ md5='8bce173b06d164b98c357b0eb96ef430',
+ split=['test'],
+ content=['annotation'],
+ mapping=[['ic15_textdet_test_gt', 'annotations/test']]),
+ ])
+
+data_converter = dict(
+ type='TextDetDataConverter',
+ splits=['train', 'test'],
+ data_root=data_root,
+ gatherer=dict(
+ type='pair_gather',
+ suffixes=['.jpg', '.JPG'],
+ rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
+ parser=dict(type='ICDARTxtTextDetAnnParser'),
+ dumper=dict(type='JsonDumper'),
+ delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img'])
diff --git a/dataset_zoo/icdar2015/textrecog.py b/dataset_zoo/icdar2015/textrecog.py
new file mode 100644
index 00000000..5500bebd
--- /dev/null
+++ b/dataset_zoo/icdar2015/textrecog.py
@@ -0,0 +1,42 @@
+data_root = 'data/icdar2015'
+cache_path = 'data/cache'
+
+data_obtainer = dict(
+ type='NaiveDataObtainer',
+ cache_path=cache_path,
+ data_root=data_root,
+ files=[
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/'
+ 'ch4_training_word_images_gt.zip',
+ save_name='ic15_textrecog_train_img_gt.zip',
+ md5='600caf8c6a64a3dcf638839820edcca9',
+ split=['train'],
+ content=['image', 'annotation'],
+ mapping=[[
+ 'ic15_textrecog_train_img_gt/gt.txt', 'annotations/train.txt'
+ ], ['ic15_textrecog_train_img_gt', 'textrecog_imgs/train']]),
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/ch4_test_word_images_gt.zip',
+ save_name='ic15_textrecog_test_img.zip',
+ md5='d7a71585f4cc69f89edbe534e7706d5d',
+ split=['test'],
+ content=['image'],
+ mapping=[['ic15_textrecog_test_img', 'textrecog_imgs/test']]),
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/'
+ 'Challenge4_Test_Task3_GT.txt',
+ save_name='ic15_textrecog_test_gt.txt',
+ md5='d7a71585f4cc69f89edbe534e7706d5d',
+ split=['test'],
+ content=['annotation'],
+ mapping=[['ic15_textrecog_test_gt.txt', 'annotations/test.txt']])
+ ])
+
+data_converter = dict(
+ type='TextRecogDataConverter',
+ splits=['train', 'test'],
+ data_root=data_root,
+ gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
+ parser=dict(type='ICDARTxtTextRecogAnnParser'),
+ dumper=dict(type='JsonDumper'))
diff --git a/dataset_zoo/icdar2015/textspotting.py b/dataset_zoo/icdar2015/textspotting.py
new file mode 100644
index 00000000..413de5e8
--- /dev/null
+++ b/dataset_zoo/icdar2015/textspotting.py
@@ -0,0 +1,3 @@
+_base_ = ['textdet.py']
+
+data_converter = dict(type='TextSpottingDataConverter')
diff --git a/dataset_zoo/totaltext/metafile.yml b/dataset_zoo/totaltext/metafile.yml
new file mode 100644
index 00000000..ea94f160
--- /dev/null
+++ b/dataset_zoo/totaltext/metafile.yml
@@ -0,0 +1,30 @@
+Name: 'Total Text'
+Paper:
+ Title: "Total-Text: Towards Orientation Robustness in Scene Text Detection"
+ URL: https://link.springer.com/article/10.1007/s10032-019-00334-z
+ Venue: IJDAR
+ Year: '2020'
+ BibTeX: '@article{CK2019,
+ author = {Chee Kheng Chng and Chee Seng Chan and Chenglin Liu},
+ title = {Total-Text: Towards Orientation Robustness in Scene Text Detection},
+ journal = {International Journal on Document Analysis and Recognition (IJDAR)},
+ volume = {23},
+ pages = {31-52},
+ year = {2020},
+ doi = {10.1007/s10032-019-00334-z}}'
+Data:
+ Website: https://github.com/cs-chan/Total-Text-Dataset
+ Language:
+ - English
+ Scene:
+ - Natural Scene
+ Granularity:
+ - Word
+ Tasks:
+ - textdet
+ - textrecog
+ - textspotting
+ License:
+ Type: BSD-3
+ Link: https://github.com/cs-chan/Total-Text-Dataset/blob/master/LICENSE
+ Format: .txt
diff --git a/dataset_zoo/totaltext/sample_anno.md b/dataset_zoo/totaltext/sample_anno.md
new file mode 100644
index 00000000..51ce61a2
--- /dev/null
+++ b/dataset_zoo/totaltext/sample_anno.md
@@ -0,0 +1,6 @@
+**Text Detection/Spotting**
+
+```text
+x: [[259 313 389 427 354 302]], y: [[542 462 417 459 507 582]], ornt: [u'c'], transcriptions: [u'PAUL']
+x: [[400 478 494 436]], y: [[398 380 448 465]], ornt: [u'#'], transcriptions: [u'#']
+```
diff --git a/dataset_zoo/totaltext/textdet.py b/dataset_zoo/totaltext/textdet.py
new file mode 100644
index 00000000..425909fa
--- /dev/null
+++ b/dataset_zoo/totaltext/textdet.py
@@ -0,0 +1,39 @@
+data_root = 'data/totaltext'
+cache_path = 'data/cache'
+
+data_obtainer = dict(
+ type='NaiveDataObtainer',
+ cache_path=cache_path,
+ data_root=data_root,
+ files=[
+ dict(
+ url='https://universityofadelaide.box.com/shared/static/'
+ '8xro7hnvb0sqw5e5rxm73tryc59j6s43.zip',
+ save_name='totaltext.zip',
+ md5='5b56d71a4005a333cf200ff35ce87f75',
+ split=['train', 'test'],
+ content=['image'],
+ mapping=[['totaltext/Images/Train', 'textdet_imgs/train'],
+ ['totaltext/Images/Test', 'textdet_imgs/test']]),
+ dict(
+ url='https://universityofadelaide.box.com/shared/static/'
+ '2vmpvjb48pcrszeegx2eznzc4izan4zf.zip',
+ save_name='txt_format.zip',
+ md5='53377a83420b4a0244304467512134e8',
+ split=['train', 'test'],
+ content=['annotation'],
+ mapping=[['txt_format/Train', 'annotations/train'],
+ ['txt_format/Test', 'annotations/test']]),
+ ])
+
+data_converter = dict(
+ type='TextDetDataConverter',
+ splits=['train', 'test'],
+ data_root=data_root,
+ gatherer=dict(
+ type='pair_gather',
+ suffixes=['.jpg', '.JPG'],
+ rule=[r'img(\d+)\.([jJ][pP][gG])', r'poly_gt_img\1.txt']),
+ parser=dict(type='TotaltextTextDetAnnParser', data_root=data_root),
+ dumper=dict(type='JsonDumper'),
+ delete=['totaltext', 'txt_format', 'annotations'])
diff --git a/dataset_zoo/totaltext/textrecog.py b/dataset_zoo/totaltext/textrecog.py
new file mode 100644
index 00000000..e18f2f1f
--- /dev/null
+++ b/dataset_zoo/totaltext/textrecog.py
@@ -0,0 +1,3 @@
+_base_ = ['textdet.py']
+
+data_converter = dict(type='TextRecogCropConverter')
diff --git a/dataset_zoo/totaltext/textspotting.py b/dataset_zoo/totaltext/textspotting.py
new file mode 100644
index 00000000..413de5e8
--- /dev/null
+++ b/dataset_zoo/totaltext/textspotting.py
@@ -0,0 +1,3 @@
+_base_ = ['textdet.py']
+
+data_converter = dict(type='TextSpottingDataConverter')
diff --git a/dataset_zoo/wildreceipt/kie.py b/dataset_zoo/wildreceipt/kie.py
new file mode 100644
index 00000000..66ab937d
--- /dev/null
+++ b/dataset_zoo/wildreceipt/kie.py
@@ -0,0 +1,32 @@
+data_root = 'data/wildreceipt'
+cache_path = 'data/cache'
+
+data_obtainer = dict(
+ type='NaiveDataObtainer',
+ cache_path=cache_path,
+ data_root=data_root,
+ files=[
+ dict(
+ url='https://download.openmmlab.com/mmocr/data/wildreceipt.tar',
+ save_name='wildreceipt.tar',
+ md5='2a2c4a1b4777fb4fe185011e17ad46ae',
+ split=['train', 'test'],
+ content=['image', 'annotation'],
+ mapping=[
+ ['wildreceipt/wildreceipt/class_list.txt', 'class_list.txt'],
+ ['wildreceipt/wildreceipt/dict.txt', 'dict.txt'],
+ ['wildreceipt/wildreceipt/test.txt', 'test.txt'],
+ ['wildreceipt/wildreceipt/train.txt', 'train.txt'],
+ ['wildreceipt/wildreceipt/image_files', 'image_files'],
+ ]),
+ ])
+
+data_converter = dict(
+ type='WildReceiptConverter',
+ splits=['train', 'test'],
+ data_root=data_root,
+ gatherer=dict(
+ type='mono_gather', mapping="f'{split}.txt'", ann_path=data_root),
+ parser=dict(type='WildreceiptKIEAnnParser', data_root=data_root),
+ dumper=dict(type='WildreceiptOpensetDumper'),
+ delete=['wildreceipt'])
diff --git a/dataset_zoo/wildreceipt/metafile.yml b/dataset_zoo/wildreceipt/metafile.yml
new file mode 100644
index 00000000..0196a6fe
--- /dev/null
+++ b/dataset_zoo/wildreceipt/metafile.yml
@@ -0,0 +1,30 @@
+Name: 'WildReceipt'
+Paper:
+ Title: "Spatial Dual-Modality Graph Reasoning for Key Information Extraction"
+  URL: https://arxiv.org/abs/2103.14470
+ Venue: arXiv
+ Year: '2021'
+ BibTeX: '@article{sun2021spatial,
+ title={Spatial Dual-Modality Graph Reasoning for Key Information Extraction},
+ author={Sun, Hongbin and Kuang, Zhanghui and Yue, Xiaoyu and Lin, Chenhao and Zhang, Wayne},
+ journal={arXiv preprint arXiv:2103.14470},
+ year={2021}
+}
+'
+Data:
+ Website: https://download.openmmlab.com/mmocr/data/wildreceipt.tar
+ Language:
+ - English
+ Scene:
+ - Receipt
+ Granularity:
+ - Word
+ Tasks:
+ - kie
+ - textdet
+ - textrecog
+ - textspotting
+ License:
+ Type: N/A
+ Link: N/A
+ Format: .txt
diff --git a/dataset_zoo/wildreceipt/sample_anno.md b/dataset_zoo/wildreceipt/sample_anno.md
new file mode 100644
index 00000000..e5a0dcea
--- /dev/null
+++ b/dataset_zoo/wildreceipt/sample_anno.md
@@ -0,0 +1,45 @@
+**KIE**
+
+```json
+// Close Set
+{
+ "file_name": "image_files/Image_16/11/d5de7f2a20751e50b84c747c17a24cd98bed3554.jpeg",
+ "height": 1200,
+ "width": 1600,
+ "annotations":
+ [
+ {
+ "box": [550.0, 190.0, 937.0, 190.0, 937.0, 104.0, 550.0, 104.0],
+ "text": "SAFEWAY",
+ "label": 1
+ },
+ {
+ "box": [1048.0, 211.0, 1074.0, 211.0, 1074.0, 196.0, 1048.0, 196.0],
+ "text": "TM",
+ "label": 25
+ }
+ ], //...
+}
+
+// Open Set
+{
+ "file_name": "image_files/Image_12/10/845be0dd6f5b04866a2042abd28d558032ef2576.jpeg",
+ "height": 348,
+ "width": 348,
+ "annotations":
+ [
+ {
+ "box": [114.0, 19.0, 230.0, 19.0, 230.0, 1.0, 114.0, 1.0],
+ "text": "CHOEUN",
+ "label": 2,
+ "edge": 1
+ },
+ {
+ "box": [97.0, 35.0, 236.0, 35.0, 236.0, 19.0, 97.0, 19.0],
+ "text": "KOREANRESTAURANT",
+ "label": 2,
+ "edge": 1
+ }
+ ]
+}
+```
diff --git a/dataset_zoo/wildreceipt/textdet.py b/dataset_zoo/wildreceipt/textdet.py
new file mode 100644
index 00000000..faa0decf
--- /dev/null
+++ b/dataset_zoo/wildreceipt/textdet.py
@@ -0,0 +1,6 @@
+_base_ = ['kie.py']
+
+data_converter = dict(
+ type='TextDetDataConverter',
+ parser=dict(type='WildreceiptTextDetAnnParser'),
+ dumper=dict(type='JsonDumper'))
diff --git a/dataset_zoo/wildreceipt/textrecog.py b/dataset_zoo/wildreceipt/textrecog.py
new file mode 100644
index 00000000..e18f2f1f
--- /dev/null
+++ b/dataset_zoo/wildreceipt/textrecog.py
@@ -0,0 +1,3 @@
+_base_ = ['textdet.py']
+
+data_converter = dict(type='TextRecogCropConverter')
diff --git a/dataset_zoo/wildreceipt/textspotting.py b/dataset_zoo/wildreceipt/textspotting.py
new file mode 100644
index 00000000..413de5e8
--- /dev/null
+++ b/dataset_zoo/wildreceipt/textspotting.py
@@ -0,0 +1,3 @@
+_base_ = ['textdet.py']
+
+data_converter = dict(type='TextSpottingDataConverter')
diff --git a/docs/en/conf.py b/docs/en/conf.py
index 255c071b..6c62152b 100644
--- a/docs/en/conf.py
+++ b/docs/en/conf.py
@@ -173,6 +173,7 @@ intersphinx_mapping = {
def builder_inited_handler(app):
subprocess.run(['./merge_docs.sh'])
subprocess.run(['./stats.py'])
+ subprocess.run(['./dataset_zoo.py'])
def setup(app):
diff --git a/docs/en/dataset_zoo.py b/docs/en/dataset_zoo.py
new file mode 100755
index 00000000..01e47c2d
--- /dev/null
+++ b/docs/en/dataset_zoo.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+import os
+import os.path as osp
+
+import yaml
+
+dataset_zoo_path = '../../dataset_zoo'
+datasets = os.listdir(dataset_zoo_path)
+datasets.sort()
+
+table = '# Overview\n'
+table += '## Supported Datasets\n'
+table += '| Dataset Name | Text Detection | Text Recognition | Text Spotting | KIE |\n' \
+ '|--------------|----------------|------------------|---------------|-----|\n' # noqa: E501
+details = '## Dataset Details\n'
+
+for dataset in datasets:
+    with open(osp.join(dataset_zoo_path, dataset, 'metafile.yml')) as f:
+        meta = yaml.safe_load(f)
+ dataset_name = meta['Name']
+ paper = meta['Paper']
+ data = meta['Data']
+
+ table += '| [{}](#{}) | {} | {} | {} | {} |\n'.format(
+ dataset,
+ dataset_name.lower().replace(' ', '-'),
+ '✓' if 'textdet' in data['Tasks'] else '',
+ '✓' if 'textrecog' in data['Tasks'] else '',
+ '✓' if 'textspotting' in data['Tasks'] else '',
+ '✓' if 'kie' in data['Tasks'] else '',
+ )
+
+ details += '### {}\n'.format(dataset_name)
+ details += "> \"{}\", *{}*, {}.\n\n".format(paper['Title'], paper['Venue'],
+ paper['Year'])
+
+ # Basic Info
+ details += 'A. Basic Info\n'
+ details += ' - Official Website: [{}]({})\n'.format(
+ dataset, data['Website'])
+ details += ' - Year: {}\n'.format(paper['Year'])
+ details += ' - Language: {}\n'.format(data['Language'])
+ details += ' - Scene: {}\n'.format(data['Scene'])
+ details += ' - Annotation Granularity: {}\n'.format(data['Granularity'])
+ details += ' - Supported Tasks: {}\n'.format(data['Tasks'])
+ details += ' - License: [{}]({})\n'.format(data['License']['Type'],
+ data['License']['Link'])
+
+ # Format
+    details += 'B. Annotation Format\n\n'
+ sample_path = osp.join(dataset_zoo_path, dataset, 'sample_anno.md')
+ if osp.exists(sample_path):
+ with open(sample_path, 'r') as f:
+ samples = f.readlines()
+ samples = ''.join(samples)
+ details += samples
+ details += ' \n\n'
+
+ # Reference
+ details += 'C. Reference\n'
+ details += '```bibtex\n{}\n```\n'.format(paper['BibTeX'])
+
+datasetzoo = table + details
+
+with open('user_guides/data_prepare/datasetzoo.md', 'w') as f:
+ f.write(datasetzoo)
diff --git a/docs/en/index.rst b/docs/en/index.rst
index 52de5b45..6d96c028 100644
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -51,6 +51,8 @@ You can switch between English and Chinese in the lower-left corner of the layou
:maxdepth: 2
:caption: Dataset Zoo
+ user_guides/data_prepare/datasetzoo.md
+ user_guides/data_prepare/dataset_preparer.md
user_guides/data_prepare/det.md
user_guides/data_prepare/recog.md
user_guides/data_prepare/kie.md
diff --git a/docs/en/user_guides/data_prepare/dataset_preparer.md b/docs/en/user_guides/data_prepare/dataset_preparer.md
new file mode 100644
index 00000000..940d6137
--- /dev/null
+++ b/docs/en/user_guides/data_prepare/dataset_preparer.md
@@ -0,0 +1,184 @@
+# Dataset Preparer
+
+## One-click data preparation script
+
+MMOCR provides a unified one-stop data preparation script `prepare_dataset.py`.
+
+Only one line of command is needed to complete the data download, decompression, and format conversion.
+
+```bash
+python tools/dataset_converters/prepare_dataset.py [$DATASET_NAME] --task [$TASK] --nproc [$NPROC]
+```
+
+| ARGS | Type | Description |
+| ------------ | ---- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| dataset_name | str | (required) dataset name. |
+| --task       | str  | Convert the dataset to the format of a specified task supported by MMOCR. Options are: 'textdet', 'textrecog', 'textspotting', and 'kie'.  |
+| --nproc | int | Number of processes to be used. Defaults to 4. |
+
+For example, the following command shows how to use the script to prepare the ICDAR2015 dataset for the text detection task.
+
+```bash
+python tools/dataset_converters/prepare_dataset.py icdar2015 --task textdet
+```
+
+Also, the script supports preparing multiple datasets at the same time. For example, the following command shows how to prepare the ICDAR2015 and TotalText datasets for the text recognition task.
+
+```bash
+python tools/dataset_converters/prepare_dataset.py icdar2015 totaltext --task textrecog
+```
+
+To check the supported datasets in MMOCR, please refer to [Dataset Zoo](./datasetzoo.md).
+
+## Advanced Usage
+
+### Configuration of Dataset Preparer
+
+The Dataset Preparer uses a modular design to enhance extensibility, which allows users to easily extend it to other public or private datasets. The configuration files of the dataset preparers are stored in `dataset_zoo/`, where the configs of all currently supported datasets can be found. The directory structure is as follows:
+
+```text
+dataset_zoo/
+├── icdar2015
+│ ├── metafile.yml
+│ ├── textdet.py
+│ ├── textrecog.py
+│ └── textspotting.py
+└── wildreceipt
+ ├── metafile.yml
+ ├── kie.py
+ ├── textdet.py
+ ├── textrecog.py
+ └── textspotting.py
+```
+
+`metafile.yml` is the metafile of the dataset, which contains the basic information of the dataset, including the year of publication, the authors of the paper, and other information such as the license. The other files, named after tasks, are the configuration files of the dataset preparer, which configure the download, decompression, format conversion, etc. of the dataset. These configs are in Python format, and their usage is completely consistent with the configuration files in the MMOCR repo. See [Configuration File Documentation](../config.md) for detailed usage.
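+
+If you want to quickly inspect such a config, it can be loaded like any other MMEngine config. The snippet below is a minimal sketch, assuming MMEngine is installed and the command is run from the repository root; the preparer script performs this loading internally.
+
+```python
+from mmengine import Config
+
+# Load a task config from dataset_zoo and inspect its main fields
+cfg = Config.fromfile('dataset_zoo/icdar2015/textdet.py')
+print(cfg.data_root)               # data/icdar2015
+print(cfg.data_obtainer['type'])   # NaiveDataObtainer
+print(cfg.data_converter['type'])  # TextDetDataConverter
+```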
+
+#### Metafile
+
+Take the ICDAR2015 dataset as an example, the `metafile.yml` stores the basic information of the dataset:
+
+```yaml
+Name: 'Incidental Scene Text IC15'
+Paper:
+ Title: ICDAR 2015 Competition on Robust Reading
+ URL: https://rrc.cvc.uab.es/files/short_rrc_2015.pdf
+ Venue: ICDAR
+ Year: '2015'
+ BibTeX: '@inproceedings{karatzas2015icdar,
+ title={ICDAR 2015 competition on robust reading},
+ author={Karatzas, Dimosthenis and Gomez-Bigorda, Lluis and Nicolaou, Anguelos and Ghosh, Suman and Bagdanov, Andrew and Iwamura, Masakazu and Matas, Jiri and Neumann, Lukas and Chandrasekhar, Vijay Ramaseshan and Lu, Shijian and others},
+ booktitle={2015 13th international conference on document analysis and recognition (ICDAR)},
+ pages={1156--1160},
+ year={2015},
+ organization={IEEE}}'
+Data:
+ Website: https://rrc.cvc.uab.es/?ch=4
+ Language:
+ - English
+ Scene:
+ - Natural Scene
+ Granularity:
+ - Word
+ Tasks:
+ - textdet
+ - textrecog
+ - textspotting
+ License:
+ Type: CC BY 4.0
+ Link: https://creativecommons.org/licenses/by/4.0/
+```
+
+Using the metafile is not mandatory in the dataset preparation process (so users can ignore this file when preparing private datasets). However, to better understand the information of each public dataset, we recommend that users read the metafile before preparing a dataset, which helps to judge whether the dataset meets their needs.
+
+#### Config of Dataset Preparer
+
+Next, we will introduce the conventional fields and usage of the dataset preparer configuration files.
+
+The configuration files contain two common fields, `data_root` and `cache_path`, which specify where to store the converted dataset and the temporary files (such as the archives downloaded during data preparation), respectively.
+
+```python
+data_root = './data/icdar2015'
+cache_path = './data/cache'
+```
+
+Data preparation usually consists of two steps: "raw data preparation" and "format conversion and saving". Therefore, we use `data_obtainer` and `data_converter` to configure the behavior of these two steps. In some cases, users can omit `data_converter` to only download and decompress the raw data, without performing format conversion and saving; or, for locally stored datasets, omit `data_obtainer` to only perform format conversion and saving.
+
+Take the text detection task of the ICDAR2015 dataset (`dataset_zoo/icdar2015/textdet.py`) as an example:
+
+```python
+data_obtainer = dict(
+ type='NaiveDataObtainer',
+ cache_path=cache_path,
+ data_root=data_root,
+ files=[
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/ch4_training_images.zip',
+ save_name='ic15_textdet_train_img.zip',
+ md5='c51cbace155dcc4d98c8dd19d378f30d',
+ split=['train'],
+ content=['image'],
+ mapping=[['ic15_textdet_train_img', 'imgs/train']]),
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip',
+ save_name='ic15_textdet_test_img.zip',
+ md5='97e4c1ddcf074ffcc75feff2b63c35dd',
+ split=['test'],
+ content=['image'],
+ mapping=[['ic15_textdet_test_img', 'imgs/test']]),
+ ])
+```
+
+The default type of `data_obtainer` is `NaiveDataObtainer`, which mainly downloads and decompresses the original files to the specified directory. Here, we configure the URL, save name, MD5 value, etc. of the original dataset files through the `files` parameter. The `mapping` parameter specifies the path to which each file is decompressed or moved. In addition, the two optional parameters `split` and `content` indicate the dataset split(s) an archive belongs to and the type of content it stores, respectively.
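+
+For reference, a hypothetical `files` entry is sketched below to illustrate these fields; the URL, file names, and checksum are placeholders rather than a real resource.
+
+```python
+example_file = dict(
+    url='https://example.com/my_dataset_train.zip',  # download source (placeholder)
+    save_name='my_dataset_train.zip',  # file name used under cache_path
+    md5='<md5-of-the-archive>',  # checksum used to verify the download
+    split=['train'],  # the split(s) this archive belongs to
+    content=['image', 'annotation'],  # what the archive contains
+    mapping=[  # [source, destination] moves after decompression, relative to data_root
+        ['my_dataset_train/images', 'textdet_imgs/train'],
+        ['my_dataset_train/gts', 'annotations/train'],
+    ])
+```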
+
+```python
+data_converter = dict(
+ type='TextDetDataConverter',
+ splits=['train', 'test'],
+ data_root=data_root,
+ gatherer=dict(
+ type='pair_gather',
+ suffixes=['.jpg', '.JPG'],
+ rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
+ parser=dict(type='ICDARTxtTextDetAnnParser'),
+ dumper=dict(type='JsonDumper'),
+ delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img'])
+```
+
+`data_converter` is responsible for loading the original annotations and converting them to the format supported by MMOCR. We provide a number of built-in data converters for different tasks, such as `TextDetDataConverter`, `TextRecogDataConverter`, `TextSpottingDataConverter`, and `WildReceiptConverter` (since WildReceipt is currently the only dataset supported for the KIE task, only this converter is provided for now).
+
+Take the text detection task as an example, `TextDetDataConverter` mainly completes the following work:
+
+- Collect and match the images and original annotation files, such as the image `img_1.jpg` and the annotation `gt_img_1.txt`
+- Load and parse the original annotations to obtain necessary information such as the bounding box and text
+- Convert the parsed data to the format supported by MMOCR
+- Dump the converted data to the specified path and format
+
+The above steps can be configured separately through `gatherer`, `parser`, `dumper`.
+
+Specifically, the `gatherer` is used to collect and match the images and annotations in the original dataset. Typically, there are two relations between images and annotation files: either each image has its own annotation file (many-to-many), or a single annotation file covers all images (one-to-many), as shown below.
+
+```text
+many-to-many
+├── img_1.jpg
+├── gt_img_1.txt
+├── img_2.jpg
+├── gt_img_2.txt
+├── img_3.JPG
+├── gt_img_3.txt
+
+one-to-many
+├── img_1.jpg
+├── img_2.jpg
+├── img_3.JPG
+├── gt.txt
+```
+
+Therefore, we provide two built-in gatherers, `pair_gather` and `mono_gather`, to handle the two cases. `pair_gather` is used for the many-to-many case, and `mono_gather` is used for the one-to-many case. `pair_gather` needs the `suffixes` parameter to indicate the image suffixes, such as `suffixes=['.jpg', '.JPG']` in the above example. In addition, the correspondence between an image and its annotation file is specified through a pair of regular expressions, such as `rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']` in the above example, where `\d+` matches the serial number of the image, `([jJ][pP][gG])` matches its suffix, and `\1` in the replacement pattern carries the matched serial number over to the annotation file name, as demonstrated below.
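+
+The snippet below is for illustration only; it applies the rule with `re.sub`, mirroring what `pair_gather` does internally.
+
+```python
+import re
+
+# rule[0] matches the image file name, rule[1] builds the annotation file name
+rule = [r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']
+print(re.sub(rule[0], rule[1], 'img_1.jpg'))   # -> gt_img_1.txt
+print(re.sub(rule[0], rule[1], 'img_23.JPG'))  # -> gt_img_23.txt
+```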
+
+Once the images and annotation files are matched, the original annotations will be parsed. Since the annotation format usually varies from dataset to dataset, parsers are usually dataset-specific. The parser then packs the required data into the MMOCR format. A minimal parser sketch is given below.
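+
+The following sketch of a custom parser is hypothetical (the class name and annotation format are made up, and no base class is assumed); it only follows the interface used by the converters in this PR: it is registered in `DATA_PARSERS`, its `__init__` accepts the `nproc` argument injected by the converter, and `parse_files` returns a list of `(img_path, instances)` tuples in which each instance provides `poly`/`box`, `text`, and `ignore`.
+
+```python
+from mmocr.datasets.preparers.data_preparer import DATA_PARSERS
+
+
+@DATA_PARSERS.register_module()
+class MyTxtAnnParser:
+    """Hypothetical parser for 'x1,y1,...,x4,y4,transcription' txt files."""
+
+    def __init__(self, ignore='###', nproc=1):
+        self.ignore = ignore  # transcriptions equal to this are marked as ignored
+        self.nproc = nproc  # injected by the data converter
+
+    def parse_files(self, files, split):
+        samples = []
+        for img_path, ann_path in files:
+            instances = []
+            with open(ann_path, encoding='utf-8') as f:
+                for line in f:
+                    if not line.strip():
+                        continue
+                    parts = line.strip().split(',')
+                    poly = [float(p) for p in parts[:8]]
+                    text = ','.join(parts[8:])
+                    instances.append(
+                        dict(poly=poly, text=text, ignore=text == self.ignore))
+            samples.append((img_path, instances))
+        return samples
+```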
+
+Finally, we can specify a dumper to decide the output data format. Currently, we only support `JsonDumper` and `WildreceiptOpensetDumper`, where the former saves the data in the standard MMOCR JSON format, and the latter saves the data in the WildReceipt format. In the future, we plan to support `LMDBDumper` to save the annotation files in LMDB format. A sample of the dumped annotation is shown below.
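+
+For reference, a dumped text detection annotation file roughly looks like the following; the image size and file name are illustrative values, while the structure follows `pack_instance` and `add_meta` of `TextDetDataConverter`.
+
+```json
+{
+  "metainfo": {
+    "dataset_type": "TextDetDataset",
+    "task_name": "textdet",
+    "category": [{"id": 0, "name": "text"}]
+  },
+  "data_list": [
+    {
+      "img_path": "textdet_imgs/train/img_1.jpg",
+      "height": 720,
+      "width": 1280,
+      "instances": [
+        {
+          "polygon": [377.0, 117.0, 463.0, 117.0, 465.0, 130.0, 378.0, 130.0],
+          "bbox": [377.0, 117.0, 465.0, 130.0],
+          "bbox_label": 0,
+          "ignore": false
+        }
+      ]
+    }
+  ]
+}
+```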
+
+### Use DataPreparer to prepare customized dataset
+
+\[Coming Soon\]
diff --git a/docs/en/user_guides/data_prepare/det.md b/docs/en/user_guides/data_prepare/det.md
index a869f3f7..f0ff5eac 100644
--- a/docs/en/user_guides/data_prepare/det.md
+++ b/docs/en/user_guides/data_prepare/det.md
@@ -1,4 +1,8 @@
-# Text Detection
+# Text Detection\[Deprecated\]
+
+```{warning}
+This page is deprecated and all these scripts will eventually be migrated into the dataset preparer, a brand new module designed to ease these lengthy dataset preparation steps. [Check it out](./dataset_preparer.md)!
+```
## Overview
diff --git a/docs/en/user_guides/data_prepare/kie.md b/docs/en/user_guides/data_prepare/kie.md
index cbbc8290..c4025484 100644
--- a/docs/en/user_guides/data_prepare/kie.md
+++ b/docs/en/user_guides/data_prepare/kie.md
@@ -1,4 +1,8 @@
-# Key Information Extraction
+# Key Information Extraction\[Deprecated\]
+
+```{warning}
+This page is deprecated and all these scripts will eventually be migrated into the dataset preparer, a brand new module designed to ease these lengthy dataset preparation steps. [Check it out](./dataset_preparer.md)!
+```
## Overview
diff --git a/docs/en/user_guides/data_prepare/recog.md b/docs/en/user_guides/data_prepare/recog.md
index 3a1d5793..1d25cbea 100644
--- a/docs/en/user_guides/data_prepare/recog.md
+++ b/docs/en/user_guides/data_prepare/recog.md
@@ -1,4 +1,8 @@
-# Text Recognition
+# Text Recognition\[Deprecated\]
+
+```{warning}
+This page is deprecated and all these scripts will eventually be migrated into the dataset preparer, a brand new module designed to ease these lengthy dataset preparation steps. [Check it out](./dataset_preparer.md)!
+```
## Overview
diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py
index c6363abb..c9559ceb 100644
--- a/docs/zh_cn/conf.py
+++ b/docs/zh_cn/conf.py
@@ -171,6 +171,7 @@ def builder_inited_handler(app):
subprocess.run(['./cp_origin_docs.sh'])
subprocess.run(['./merge_docs.sh'])
subprocess.run(['./stats.py'])
+ subprocess.run(['./dataset_zoo.py'])
def setup(app):
diff --git a/docs/zh_cn/dataset_zoo.py b/docs/zh_cn/dataset_zoo.py
new file mode 100755
index 00000000..972ea931
--- /dev/null
+++ b/docs/zh_cn/dataset_zoo.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+import os
+import os.path as osp
+
+import yaml
+
+dataset_zoo_path = '../../dataset_zoo'
+datasets = os.listdir(dataset_zoo_path)
+datasets.sort()
+
+table = '# 支持数据集一览\n'
+table += '## 支持的数据集\n'
+table += '| 数据集名称 | 文本检测 | 文本识别 | 端到端文本检测识别 | 关键信息抽取 |\n' \
+ '|----------|---------|--------|------------------|-----------|\n'
+details = '## 数据集详情\n'
+
+for dataset in datasets:
+    with open(osp.join(dataset_zoo_path, dataset, 'metafile.yml')) as f:
+        meta = yaml.safe_load(f)
+ dataset_name = meta['Name']
+ paper = meta['Paper']
+ data = meta['Data']
+
+ table += '| [{}](#{}) | {} | {} | {} | {} |\n'.format(
+ dataset,
+ dataset_name.lower().replace(' ', '-'),
+ '✓' if 'textdet' in data['Tasks'] else '',
+ '✓' if 'textrecog' in data['Tasks'] else '',
+ '✓' if 'textspotting' in data['Tasks'] else '',
+ '✓' if 'kie' in data['Tasks'] else '',
+ )
+
+ details += '### {}\n'.format(dataset_name)
+ details += "> \"{}\", *{}*, {}.\n\n".format(paper['Title'], paper['Venue'],
+ paper['Year'])
+ # Basic Info
+ details += 'A. 数据集基础信息\n'
+ details += ' - 官方网址: [{}]({})\n'.format(dataset, data['Website'])
+ details += ' - 发布年份: {}\n'.format(paper['Year'])
+ details += ' - 语言: {}\n'.format(data['Language'])
+ details += ' - 场景: {}\n'.format(data['Scene'])
+ details += ' - 标注粒度: {}\n'.format(data['Granularity'])
+ details += ' - 支持任务: {}\n'.format(data['Tasks'])
+ details += ' - 数据集许可证: [{}]({})\n\n'.format(data['License']['Type'],
+ data['License']['Link'])
+
+ # Format
+    details += 'B. 标注格式\n\n'
+ sample_path = osp.join(dataset_zoo_path, dataset, 'sample_anno.md')
+ if osp.exists(sample_path):
+ with open(sample_path, 'r') as f:
+ samples = f.readlines()
+ samples = ''.join(samples)
+ details += samples
+ details += ' \n\n'
+
+ # Reference
+ details += 'C. 参考文献\n'
+ details += '```bibtex\n{}\n```\n'.format(paper['BibTeX'])
+
+datasetzoo = table + details
+
+with open('user_guides/data_prepare/datasetzoo.md', 'w') as f:
+ f.write(datasetzoo)
diff --git a/docs/zh_cn/get_started/install.md b/docs/zh_cn/get_started/install.md
index ae9c4c42..1ee9ac81 100644
--- a/docs/zh_cn/get_started/install.md
+++ b/docs/zh_cn/get_started/install.md
@@ -118,7 +118,7 @@ python mmocr/ocr.py --det DB_r18 --recog CRNN demo/demo_text_ocr.jpg --show
也可以在 Python 解释器中运行以下代码:
```python
-from mmocr.utils.ocr import MMOCR
+from mmocr.ocr import MMOCR
ocr = MMOCR(recog='CRNN', det='DB_r18')
ocr.readtext('demo_text_ocr.jpg', show=True)
```
diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst
index c8732d3e..01e6865f 100644
--- a/docs/zh_cn/index.rst
+++ b/docs/zh_cn/index.rst
@@ -51,6 +51,8 @@
:maxdepth: 2
:caption: 数据集支持
+ user_guides/data_prepare/datasetzoo.md
+ user_guides/data_prepare/dataset_preparer.md
user_guides/data_prepare/det.md
user_guides/data_prepare/recog.md
user_guides/data_prepare/kie.md
diff --git a/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md
new file mode 100644
index 00000000..8015db62
--- /dev/null
+++ b/docs/zh_cn/user_guides/data_prepare/dataset_preparer.md
@@ -0,0 +1,184 @@
+# 数据准备
+
+## 一键式数据准备脚本
+
+MMOCR 提供了统一的一站式数据集准备脚本 `prepare_dataset.py`。
+
+仅需一行命令即可完成数据的下载、解压,以及格式转换。
+
+```bash
+python tools/dataset_converters/prepare_dataset.py [$DATASET_NAME] --task [$TASK] --nproc [$NPROC]
+```
+
+| 参数 | 类型 | 说明 |
+| ------------ | ---- | ----------------------------------------------------------------------------------------------------- |
+| dataset_name | str | (必须)需要准备的数据集名称。 |
+| --task | str | 将数据集格式转换为指定任务的 MMOCR 格式。可选项为: 'textdet', 'textrecog', 'textspotting' 和 'kie'。 |
+| --nproc      | int  | 使用的进程数,默认为 4。 |
+
+例如,以下命令展示了如何使用该脚本为 ICDAR2015 数据集准备文本检测任务所需的数据。
+
+```bash
+python tools/dataset_converters/prepare_dataset.py icdar2015 --task textdet
+```
+
+该脚本也支持同时准备多个数据集,例如,以下命令展示了如何使用该脚本同时为 ICDAR2015 和 TotalText 数据集准备文本识别任务所需的数据。
+
+```bash
+python tools/dataset_converters/prepare_dataset.py icdar2015 totaltext --task textrecog
+```
+
+进一步了解 MMOCR 支持的数据集,您可以浏览[支持的数据集文档](./datasetzoo.md)
+
+## 进阶用法
+
+### 数据集配置
+
+数据集自动化准备脚本使用了模块化的设计,极大地增强了扩展性,用户能够很方便地配置其他公开数据集或私有数据集。数据集自动化准备脚本的配置文件被统一存储在 `dataset_zoo/` 目录下,用户可以在该目录下找到所有已由 MMOCR 官方支持的数据集准备脚本配置文件。该文件夹的目录结构如下:
+
+```text
+dataset_zoo/
+├── icdar2015
+│ ├── metafile.yml
+│ ├── textdet.py
+│ ├── textrecog.py
+│ └── textspotting.py
+└── wildreceipt
+ ├── metafile.yml
+ ├── kie.py
+ ├── textdet.py
+ ├── textrecog.py
+ └── textspotting.py
+```
+
+其中,`metafile.yml` 是数据集的元信息文件,其中存放了对应数据集的基本信息,包括发布年份,论文作者,以及版权等其他信息。其它以任务名命名的则是数据集准备脚本的配置文件,用于配置数据集的下载、解压、格式转换等操作。这些配置文件采用了 Python 格式,其使用方法与 MMOCR 算法库的其他配置文件完全一致,详见[配置文件文档](../config.md)。
+
+#### 数据集元文件
+
+以数据集 ICDAR2015 为例,`metafile.yml` 中存储了基础的数据集信息:
+
+```yaml
+Name: 'Incidental Scene Text IC15'
+Paper:
+ Title: ICDAR 2015 Competition on Robust Reading
+ URL: https://rrc.cvc.uab.es/files/short_rrc_2015.pdf
+ Venue: ICDAR
+ Year: '2015'
+ BibTeX: '@inproceedings{karatzas2015icdar,
+ title={ICDAR 2015 competition on robust reading},
+ author={Karatzas, Dimosthenis and Gomez-Bigorda, Lluis and Nicolaou, Anguelos and Ghosh, Suman and Bagdanov, Andrew and Iwamura, Masakazu and Matas, Jiri and Neumann, Lukas and Chandrasekhar, Vijay Ramaseshan and Lu, Shijian and others},
+ booktitle={2015 13th international conference on document analysis and recognition (ICDAR)},
+ pages={1156--1160},
+ year={2015},
+ organization={IEEE}}'
+Data:
+ Website: https://rrc.cvc.uab.es/?ch=4
+ Language:
+ - English
+ Scene:
+ - Natural Scene
+ Granularity:
+ - Word
+ Tasks:
+ - textdet
+ - textrecog
+ - textspotting
+ License:
+ Type: CC BY 4.0
+ Link: https://creativecommons.org/licenses/by/4.0/
+```
+
+该文件在数据集准备过程中并不是强制要求的(因此用户在使用添加自己的私有数据集时可以忽略该文件),但为了用户更好地了解各个公开数据集的信息,我们建议用户在使用数据集准备脚本前阅读对应的元文件信息,以了解该数据集的特征是否符合用户需求。
+
+#### 数据集准备脚本配置文件
+
+下面,我们将介绍数据集准备脚本配置文件 `textXXX.py` 的默认字段与使用方法。
+
+我们在配置文件中提供了 `data_root` 与 `cache_path` 两个默认字段,分别用于存放转换后的 MMOCR 格式的数据集文件,以及在数据准备过程中下载的压缩包等临时文件。
+
+```python
+data_root = './data/icdar2015'
+cache_path = './data/cache'
+```
+
+其次,数据集的准备通常包含了“原始数据准备”以及“格式转换和保存”这两个主要步骤。因此,我们约定通过 `data_obtainer` 和 `data_converter` 参数来配置这两个步骤的行为。在某些情况下,用户也可以通过缺省 `data_converter` 参数来仅进行原始数据的下载和解压,而不进行格式转换和保存。或者,对于本地存储的数据集,通过缺省 `data_obtainer` 参数来仅进行格式转换和保存。
+
+以 ICDAR2015 数据集的文本检测任务准备配置文件(`dataset_zoo/icdar2015/textdet.py`)为例:
+
+```python
+data_obtainer = dict(
+ type='NaiveDataObtainer',
+ cache_path=cache_path,
+ data_root=data_root,
+ files=[
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/ch4_training_images.zip',
+ save_name='ic15_textdet_train_img.zip',
+ md5='c51cbace155dcc4d98c8dd19d378f30d',
+ split=['train'],
+ content=['image'],
+ mapping=[['ic15_textdet_train_img', 'imgs/train']]),
+ dict(
+ url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip',
+ save_name='ic15_textdet_test_img.zip',
+ md5='97e4c1ddcf074ffcc75feff2b63c35dd',
+ split=['test'],
+ content=['image'],
+ mapping=[['ic15_textdet_test_img', 'imgs/test']]),
+ ])
+```
+
+数据准备器 `data_obtainer` 的类型默认为 `NaiveDataObtainer`,其主要功能是依次下载压缩包并解压到指定目录。在这里,我们通过 `files` 参数来配置下载的压缩包的 URL、保存名称、MD5 值等信息。其中,`mapping` 参数用于指定该压缩包中的数据解压后的存放路径。另外 `split` 和 `content` 这两个可选参数则分别标明了该压缩包对应的数据集合与其存储的内容类型。
+
+```python
+data_converter = dict(
+ type='TextDetDataConverter',
+ splits=['train', 'test'],
+ data_root=data_root,
+ gatherer=dict(
+ type='pair_gather',
+ suffixes=['.jpg', '.JPG'],
+ rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
+ parser=dict(type='ICDARTxtTextDetAnnParser'),
+ dumper=dict(type='JsonDumper'),
+ delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img'])
+```
+
+数据转换器 `data_converter` 负责完成原始数据的读取与格式转换,并保存为 MMOCR 支持的格式。其中,我们针对不同的任务提供了几种内置的数据转换器,如文本检测任务数据转换器 `TextDetDataConverter`,文本识别任务数据转换器 `TextRecogDataConverter`,端到端文本检测识别任务转换器 `TextSpottingDataConverter`,以及关键信息抽取任务数据转换器 `WildReceiptConverter`(由于关键信息抽取任务目前仅支持 WildReceipt 数据集,我们暂时只提供了基于该数据集的数据转换器)。
+
+以文本检测任务为例,`TextDetDataConverter` 主要完成以下工作:
+
+- 收集并匹配原始数据集中的图片与标注文件,如图像 `img_1.jpg` 与 标注 `gt_img_1.txt`
+- 读取原始标注文件,解析出文本框坐标与文本内容等必要信息
+- 将解析后的数据统一转换至 MMOCR 支持的格式
+- 将转换后的数据保存为指定路径和格式
+
+以上各个步骤我们分别可以通过 `gatherer`,`parser`,`dumper` 来进行配置。
+
+具体而言,`gatherer` 用于收集并匹配原始数据集中的图片与标注文件。常用的 OCR 数据集通常有两种标注保存形式,一种为多个标注文件对应多张图片,一种则为单个标注文件对应多张图片,如:
+
+```text
+多对多
+├── img_1.jpg
+├── gt_img_1.txt
+├── img_2.jpg
+├── gt_img_2.txt
+├── img_3.JPG
+├── gt_img_3.txt
+
+单对多
+├── img_1.jpg
+├── img_2.jpg
+├── img_3.JPG
+├── gt.txt
+```
+
+因此,我们内置了 `pair_gather` 与 `mono_gather` 来处理以上这两种情况。其中 `pair_gather` 用于多对多的情况,`mono_gather` 用于单对多的情况。`pair_gather` 需要指定 `suffixes` 参数,用于指定图片的后缀名,如上述例子中的 `suffixes=['.jpg', '.JPG']`。此外,还需要通过正则表达式来指定图片与标注文件的对应关系,如上述例子中的 `rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']`。其中 `\d+` 用于匹配图片的序号,`([jJ][pP][gG])` 用于匹配图片的后缀名,替换模板中的 `\1` 则将匹配到的图片序号对应到标注文件名中。
+
+当获取了图像与标注文件的对应关系后,data preparer 将解析原始标注文件。由于不同数据集的标注格式通常有很大的区别,当我们需要支持新的数据集时,通常需要实现一个新的 `parser` 来解析原始标注文件。parser 将任务相关的数据解析后打包成 MMOCR 的统一格式。
+
+最后,我们可以通过指定不同的 dumper 来决定要将数据保存为何种格式。目前,我们仅支持 `JsonDumper` 与 `WildreceiptOpensetDumper`,其中,前者用于将数据保存为标准的 MMOCR Json 格式,而后者用于将数据保存为 Wildreceipt 格式。未来,我们计划支持 `LMDBDumper` 用于保存 LMDB 格式的标注文件。
+
+### 使用 Data Preparer 准备自定义数据集
+
+\[待更新\]
diff --git a/docs/zh_cn/user_guides/data_prepare/det.md b/docs/zh_cn/user_guides/data_prepare/det.md
index a4a7d81a..ba5c583b 100644
--- a/docs/zh_cn/user_guides/data_prepare/det.md
+++ b/docs/zh_cn/user_guides/data_prepare/det.md
@@ -1,4 +1,8 @@
-# 文字检测
+# 文字检测\[过时\]
+
+```{warning}
+该页面内容已经过时,所有有关数据格式转换相关的脚本都将最终迁移至数据准备器 **dataset preparer**,这个全新设计的模块能够极大地方便用户完成冗长的数据准备步骤,详见[相关文档](./dataset_preparer.md)。
+```
## 概览
diff --git a/docs/zh_cn/user_guides/data_prepare/kie.md b/docs/zh_cn/user_guides/data_prepare/kie.md
index 51b5e962..eb5ec674 100644
--- a/docs/zh_cn/user_guides/data_prepare/kie.md
+++ b/docs/zh_cn/user_guides/data_prepare/kie.md
@@ -1,4 +1,8 @@
-# 关键信息提取
+# 关键信息提取\[过时\]
+
+```{warning}
+该页面内容已经过时,所有有关数据格式转换相关的脚本都将最终迁移至数据准备器 **dataset preparer**,这个全新设计的模块能够极大地方便用户完成冗长的数据准备步骤,详见[相关文档](./dataset_preparer.md)。
+```
## 概览
diff --git a/docs/zh_cn/user_guides/data_prepare/recog.md b/docs/zh_cn/user_guides/data_prepare/recog.md
index ce6679b9..cede41ab 100644
--- a/docs/zh_cn/user_guides/data_prepare/recog.md
+++ b/docs/zh_cn/user_guides/data_prepare/recog.md
@@ -1,7 +1,7 @@
-# 文字识别
+# 文字识别\[过时\]
```{warning}
-该章节翻译落后于[英文版文档](../../en/user_guides/../../user_guides/data_prepare/recog.md)。
+该页面内容已经过时,所有有关数据格式转换相关的脚本都将最终迁移至数据准备器 **dataset preparer**,这个全新设计的模块能够极大地方便用户完成冗长的数据准备步骤,详见[相关文档](./dataset_preparer.md)。
```
## 概览
diff --git a/mmocr/datasets/preparers/__init__.py b/mmocr/datasets/preparers/__init__.py
new file mode 100644
index 00000000..a104478a
--- /dev/null
+++ b/mmocr/datasets/preparers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .data_converter import (TextDetDataConverter, TextRecogDataConverter,
+ TextSpottingDataConverter, WildReceiptConverter)
+from .data_obtainer import NaiveDataObtainer
+from .data_preparer import DatasetPreparer
+from .dumpers import * # noqa
+from .parsers import * # noqa
+
+__all__ = [
+ 'DatasetPreparer', 'NaiveDataObtainer', 'TextDetDataConverter',
+ 'TextRecogDataConverter', 'TextSpottingDataConverter',
+ 'WildReceiptConverter'
+]
diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py
new file mode 100644
index 00000000..73d1e0eb
--- /dev/null
+++ b/mmocr/datasets/preparers/data_converter.py
@@ -0,0 +1,701 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+import re
+import shutil
+from abc import abstractmethod
+from functools import partial
+from typing import Dict, List, Optional, Sequence, Tuple
+
+import mmcv
+from mmengine import mkdir_or_exist, track_parallel_progress
+
+from mmocr.utils import bbox2poly, crop_img, list_files, poly2bbox
+from .data_preparer import DATA_CONVERTERS, DATA_DUMPERS, DATA_PARSERS
+
+
+class BaseDataConverter:
+ """Base class for data processor.
+
+ Args:
+ splits (List): A list of splits to be processed.
+ data_root (str): Path to the data root.
+ gatherer (Dict): Config dict for gathering the dataset files.
+ parser (Dict): Config dict for parsing the dataset files.
+ dumper (Dict): Config dict for dumping the dataset files.
+ nproc (int): Number of processes to process the data.
+ task (str): Task of the dataset.
+ dataset_name (str): Dataset name.
+ delete (Optional[List]): A list of files to be deleted after
+ conversion.
+ config_path (str): Path to the configs. Defaults to 'configs/'.
+ """
+
+ def __init__(self,
+ splits: List,
+ data_root: str,
+ gatherer: Dict,
+ parser: Dict,
+ dumper: Dict,
+ nproc: int,
+ task: str,
+ dataset_name: str,
+ delete: Optional[List] = None,
+ config_path: str = 'configs/'):
+ assert isinstance(nproc, int) and nproc > 0, \
+ 'nproc must be a positive integer.'
+ self.splits = splits
+ self.data_root = data_root
+ self.nproc = nproc
+ self.task = task
+ self.dataset_name = dataset_name
+ self.delete = delete
+ self.config_path = config_path
+ self.img_dir = f'{task}_imgs'
+ parser.update(dict(nproc=nproc))
+ dumper.update(dict(task=task, dataset_name=dataset_name))
+ self.parser = DATA_PARSERS.build(parser)
+ self.dumper = DATA_DUMPERS.build(dumper)
+ gather_type = gatherer.pop('type')
+ self.gatherer_args = gatherer
+ if gather_type == 'pair_gather':
+ self.gatherer = self.pair_gather
+ elif gather_type == 'mono_gather':
+ self.gatherer = self.mono_gather
+ else:
+ raise NotImplementedError
+
+ def __call__(self):
+ """Process the data."""
+ # Convert and dump annotations to MMOCR format
+ dataset_config = dict()
+ for split in self.splits:
+ print(f'Parsing {split} split...')
+ # Gather the info such as file names required by parser
+ img_path = osp.join(self.data_root, self.img_dir, split)
+ ann_path = osp.join(self.data_root, 'annotations')
+ gatherer_args = dict(
+ img_path=img_path, ann_path=ann_path, split=split)
+ gatherer_args.update(self.gatherer_args)
+ files = self.gatherer(**gatherer_args)
+ # Convert dataset annotations to MMOCR format
+ samples = self.parser.parse_files(files, split)
+ print(f'Packing {split} annotations...')
+ func = partial(self.pack_instance, split=split)
+ samples = track_parallel_progress(func, samples, nproc=self.nproc)
+ samples = self.add_meta(samples)
+ # Dump annotation files
+ dataset_config[split] = self.dumper.dump(samples, self.data_root,
+ split)
+ self.generate_dataset_config(dataset_config)
+ self.clean()
+
+ def generate_dataset_config(self, dataset_config: Dict) -> None:
+ """Generate dataset config file. Dataset config is a python file that
+ contains the dataset information.
+
+ Examples:
+ Generated dataset config
+ >>> ic15_rec_data_root = 'data/icdar2015/'
+ >>> ic15_rec_train = dict(
+ >>> type='OCRDataset',
+ >>> data_root=ic15_rec_data_root,
+ >>> ann_file='textrecog_train.json',
+ >>> test_mode=False,
+ >>> pipeline=None)
+ >>> ic15_rec_test = dict(
+ >>> type='OCRDataset',
+ >>> data_root=ic15_rec_data_root,
+ >>> ann_file='textrecog_test.json',
+ >>> test_mode=True,
+ >>> pipeline=None)
+
+ Args:
+ dataset_config (Dict): A dict contains the dataset config string of
+ each split.
+ """
+ if self.task == 'kie':
+ # Not supported yet
+ return
+ cfg_path = osp.join(self.config_path, self.task, '_base_', 'datasets',
+ f'{self.dataset_name}.py')
+ if not osp.exists(cfg_path):
+ with open(cfg_path, 'w') as f:
+ f.write(
+ f'{self.dataset_name}_{self.task}_data_root = \'{self.data_root}\'\n' # noqa: E501
+ )
+ for split in self.splits:
+ with open(cfg_path, 'a') as f:
+ f.write(dataset_config[split])
+
+ @abstractmethod
+ def pack_instance(self, sample: Tuple, split: str) -> Dict:
+ """Pack the parsed annotation info to an MMOCR format instance.
+
+ Args:
+            sample (Tuple): A tuple of (img_path, instances).
+ - img_path (str): Path to image file.
+ - instances (Sequence[Dict]): A list of converted annos.
+ split (str): The split of the instance.
+
+ Returns:
+ Dict: An MMOCR format instance.
+ """
+
+ @abstractmethod
+ def add_meta(self, sample: Dict) -> Dict:
+ """Add meta information to the sample.
+
+ Args:
+ sample (Dict): A sample of the dataset.
+
+ Returns:
+ Dict: A sample with meta information.
+ """
+
+ def mono_gather(self, ann_path: str, mapping: str, split: str,
+ **kwargs) -> str:
+ """Gather the dataset file. Specifically for the case that only one
+ annotation file is needed. For example,
+
+ img_001.jpg \
+ img_002.jpg ---> train.json
+ img_003.jpg /
+
+ Args:
+            ann_path (str): Path to the annotations.
+ mapping (str): Mapping rule of the annotation names. For example,
+ "f'{split}.json'" will return 'train.json' when the split is
+ 'train'.
+ split (str): The current split.
+
+ Returns:
+ str: Path to the annotation file.
+ """
+
+ return osp.join(ann_path, eval(mapping))
+
+ def pair_gather(self, img_path: str, suffixes: List, rule: Sequence,
+ **kwargs) -> List[Tuple]:
+ """Gather the dataset files. Specifically for the paired annotations.
+ That is to say, each image has a corresponding annotation file. For
+ example,
+
+ img_1.jpg <---> gt_img_1.txt
+ img_2.jpg <---> gt_img_2.txt
+ img_3.jpg <---> gt_img_3.txt
+
+ Args:
+ img_path (str): Path to the images.
+            suffixes (List[str]): File suffixes used for searching.
+ rule (Sequence): The rule for pairing the files. The
+ first element is the matching pattern for the file, and the
+ second element is the replacement pattern, which should
+ be a regular expression. For example, to map the image
+ name img_1.jpg to the annotation name gt_img_1.txt,
+ the rule is
+ [r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt'] # noqa: W605 E501
+
+ Returns:
+ List[Tuple]: A list of tuples (img_path, ann_path).
+ """
+ files = list()
+ for file in list_files(img_path, suffixes):
+ file2 = re.sub(rule[0], rule[1], osp.basename(file))
+ file2 = file.replace(osp.basename(file), file2)
+ file2 = file2.replace(self.img_dir, 'annotations')
+ files.append((file, file2))
+
+ return files
+
+    def clean(self) -> None:
+        """Remove temporary files or directories listed in ``delete``."""
+        if self.delete is None:
+            return
+        for d in self.delete:
+            delete_file = osp.join(self.data_root, d)
+            if osp.exists(delete_file):
+                shutil.rmtree(delete_file)
+
+
+@DATA_CONVERTERS.register_module()
+class TextDetDataConverter(BaseDataConverter):
+ """Text detection data converter.
+
+ Args:
+ splits (List): A list of splits to be processed.
+ data_root (str): Path to the data root.
+ gatherer (Dict): Config dict for gathering the dataset files.
+ parser (Dict): Config dict for parsing the dataset files.
+ dumper (Dict): Config dict for dumping the dataset files.
+ dataset_name (str): Name of the dataset.
+ nproc (int): Number of processes to process the data.
+ delete (Optional[List]): A list of files to be deleted after
+            conversion. Defaults to ['annotations'].
+ """
+
+ def __init__(self,
+ splits: List,
+ data_root: str,
+ gatherer: Dict,
+ parser: Dict,
+ dumper: Dict,
+ dataset_name: str,
+ nproc: int,
+ delete: List = ['annotations']) -> None:
+ super().__init__(
+ splits=splits,
+ data_root=data_root,
+ gatherer=gatherer,
+ parser=parser,
+ dumper=dumper,
+ dataset_name=dataset_name,
+ nproc=nproc,
+ delete=delete,
+ task='textdet')
+
+ def pack_instance(self,
+ sample: Tuple,
+ split: str,
+ bbox_label: int = 0) -> Dict:
+ """Pack the parsed annotation info to an MMOCR format instance.
+
+ Args:
+            sample (Tuple): A tuple of (img_path, instances).
+ - img_path (str): Path to the image file.
+ - instances (Sequence[Dict]): A list of converted annos. Each
+ element should be a dict with the following keys:
+ - 'poly' or 'box'
+ - 'ignore'
+ - 'bbox_label' (optional)
+ split (str): The split of the instance.
+
+ Returns:
+ Dict: An MMOCR format instance.
+ """
+
+ img_path, instances = sample
+
+ img = mmcv.imread(img_path)
+ h, w = img.shape[:2]
+
+ packed_instances = list()
+ for instance in instances:
+ poly = instance.get('poly', None)
+ box = instance.get('box', None)
+ assert box or poly
+ packed_sample = dict(
+ polygon=poly if poly else list(
+ bbox2poly(box).astype('float64')),
+ bbox=box if box else list(poly2bbox(poly).astype('float64')),
+ bbox_label=bbox_label,
+ ignore=instance['ignore'])
+ packed_instances.append(packed_sample)
+
+ packed_instances = dict(
+ instances=packed_instances,
+ img_path=img_path.replace(self.data_root + '/', ''),
+ height=h,
+ width=w)
+
+ return packed_instances
+
+ def add_meta(self, sample: Dict) -> Dict:
+ meta = {
+ 'metainfo': {
+ 'dataset_type': 'TextDetDataset',
+ 'task_name': 'textdet',
+ 'category': [{
+ 'id': 0,
+ 'name': 'text'
+ }]
+ },
+ 'data_list': sample
+ }
+ return meta
+
+
+@DATA_CONVERTERS.register_module()
+class TextSpottingDataConverter(BaseDataConverter):
+ """Text spotting data converter.
+
+ Args:
+ splits (List): A list of splits to be processed.
+ data_root (str): Path to the data root.
+ gatherer (Dict): Config dict for gathering the dataset files.
+ parser (Dict): Config dict for parsing the dataset files.
+ dumper (Dict): Config dict for dumping the dataset files.
+ dataset_name (str): Name of the dataset.
+ nproc (int): Number of processes to process the data.
+ delete (Optional[List]): A list of files to be deleted after
+ conversion. Defaults to ['annotations'].
+ """
+
+ def __init__(self,
+ splits: List,
+ data_root: str,
+ gatherer: Dict,
+ parser: Dict,
+ dumper: Dict,
+ dataset_name: str,
+ nproc: int,
+ delete: List = ['annotations']) -> None:
+ super().__init__(
+ splits=splits,
+ data_root=data_root,
+ gatherer=gatherer,
+ parser=parser,
+ dumper=dumper,
+ dataset_name=dataset_name,
+ nproc=nproc,
+ delete=delete,
+ task='textspotting')
+ # Textspotting task shares the same images with textdet task
+ self.img_dir = 'textdet_imgs'
+
+ def pack_instance(self,
+ sample: Tuple,
+ split: str,
+ bbox_label: int = 0) -> Dict:
+ """Pack the parsed annotation info to an MMOCR format instance.
+
+ Args:
+            sample (Tuple): A tuple of (img_path, instances).
+ - img_path (str): Path to image file.
+ - instances (Sequence[Dict]): A list of converted annos. Each
+ element should be a dict with the following keys:
+ - 'poly' or 'box'
+ - 'text'
+ - 'ignore'
+ - 'bbox_label' (optional)
+ split (str): The split of the instance.
+
+ Returns:
+ Dict: An MMOCR format instance.
+ """
+
+ img_path, instances = sample
+
+ img = mmcv.imread(img_path)
+ h, w = img.shape[:2]
+
+ packed_instances = list()
+ for instance in instances:
+ assert 'text' in instance, 'Text is not found in the instance.'
+ poly = instance.get('poly', None)
+ box = instance.get('box', None)
+ assert box or poly
+ packed_sample = dict(
+ polygon=poly if poly else list(
+ bbox2poly(box).astype('float64')),
+ bbox=box if box else list(poly2bbox(poly).astype('float64')),
+ bbox_label=bbox_label,
+ ignore=instance['ignore'],
+ text=instance['text'])
+ packed_instances.append(packed_sample)
+
+ packed_instances = dict(
+ instances=packed_instances, img_path=img_path, height=h, width=w)
+
+ return packed_instances
+
+ def add_meta(self, sample: Dict) -> Dict:
+ meta = {
+ 'metainfo': {
+ 'dataset_type': 'TextSpottingDataset',
+ 'task_name': 'textspotting',
+ 'category': [{
+ 'id': 0,
+ 'name': 'text'
+ }]
+ },
+ 'data_list': sample
+ }
+ return meta
+
+
+@DATA_CONVERTERS.register_module()
+class TextRecogDataConverter(BaseDataConverter):
+ """Text recognition data converter.
+
+ Args:
+ splits (List): A list of splits to be processed.
+ data_root (str): Path to the data root.
+ gatherer (Dict): Config dict for gathering the dataset files.
+ parser (Dict): Config dict for parsing the dataset annotations.
+ dumper (Dict): Config dict for dumping the dataset files.
+ dataset_name (str): Name of the dataset.
+ nproc (int): Number of processes to process the data.
+ delete (Optional[List]): A list of files to be deleted after
+            conversion. Defaults to ['annotations'].
+ """
+
+ def __init__(self,
+ splits: List,
+ data_root: str,
+ gatherer: Dict,
+ parser: Dict,
+ dumper: Dict,
+ dataset_name: str,
+ nproc: int,
+ delete: List = ['annotations']):
+ super().__init__(
+ splits=splits,
+ data_root=data_root,
+ gatherer=gatherer,
+ parser=parser,
+ dumper=dumper,
+ dataset_name=dataset_name,
+ nproc=nproc,
+ task='textrecog',
+ delete=delete)
+
+ def pack_instance(self, sample: Tuple, split: str) -> Dict:
+ """Pack the text info to a recognition instance.
+
+ Args:
+            sample (Tuple): A tuple of (img_name, text).
+ split (str): The split of the instance.
+
+ Returns:
+ Dict: The packed instance.
+ """
+
+ img_name, text = sample
+ packed_instance = dict(
+ instances=[dict(text=text)],
+ img_path=osp.join(self.img_dir, split, img_name))
+
+ return packed_instance
+
+ def add_meta(self, sample: Dict) -> Dict:
+ meta = {
+ 'metainfo': {
+ 'dataset_type': 'TextRecogDataset',
+ 'task_name': 'textrecog',
+ 'category': [{
+ 'id': 0,
+ 'name': 'text'
+ }]
+ },
+ 'data_list': sample
+ }
+ return meta
+
+
+@DATA_CONVERTERS.register_module()
+class TextRecogCropConverter(TextRecogDataConverter):
+ """Text recognition crop converter. This converter will crop the text from
+ the original image. The parser used for this Converter should be a TextDet
+ parser.
+
+ Args:
+ splits (List): A list of splits to be processed.
+ data_root (str): Path to the data root.
+ gatherer (Dict): Config dict for gathering the dataset files.
+ parser (Dict): Config dict for parsing the dataset annotations.
+ dumper (Dict): Config dict for dumping the dataset files.
+ dataset_name (str): Name of the dataset.
+ nproc (int): Number of processes to process the data.
+ long_edge_pad_ratio (float): The ratio of padding the long edge of the
+ cropped image. Defaults to 0.1.
+ short_edge_pad_ratio (float): The ratio of padding the short edge of
+ the cropped image. Defaults to 0.05.
+ delete (Optional[List]): A list of files to be deleted after
+            conversion. Defaults to ['annotations'].
+ """
+
+ def __init__(self,
+ splits: List,
+ data_root: str,
+ gatherer: Dict,
+ parser: Dict,
+ dumper: Dict,
+ dataset_name: str,
+ nproc: int,
+ long_edge_pad_ratio: float = 0.1,
+ short_edge_pad_ratio: float = 0.05,
+ delete: List = ['annotations']):
+ super().__init__(
+ splits=splits,
+ data_root=data_root,
+ gatherer=gatherer,
+ parser=parser,
+ dumper=dumper,
+ dataset_name=dataset_name,
+ nproc=nproc,
+ delete=delete)
+ self.ignore = self.parser.ignore
+ self.lepr = long_edge_pad_ratio
+ self.sepr = short_edge_pad_ratio
+        # The crop converter crops the textdet images into patches
+ self.img_dir = 'textdet_imgs'
+ self.cropped_img_dir = 'textrecog_imgs'
+ self.crop_save_path = osp.join(self.data_root, self.cropped_img_dir)
+ mkdir_or_exist(self.crop_save_path)
+ for split in splits:
+ mkdir_or_exist(osp.join(self.crop_save_path, split))
+
+ def pack_instance(self, sample: Tuple, split: str) -> List:
+ """Crop patches from image.
+
+ Args:
+            sample (Tuple): A tuple of (img_path, instances).
+            split (str): The split of the instance.
+
+        Returns:
+ List: The list of cropped patches.
+ """
+
+ def get_box(instance: Dict) -> List:
+ if 'box' in instance:
+ return bbox2poly(instance['box']).tolist()
+ if 'poly' in instance:
+ return bbox2poly(poly2bbox(instance['poly'])).tolist()
+
+ data_list = []
+ img_path, instances = sample
+ img = mmcv.imread(img_path)
+ for i, instance in enumerate(instances):
+ box, text = get_box(instance), instance['text']
+ if text == self.ignore:
+ continue
+ patch = crop_img(img, box, self.lepr, self.sepr)
+ if patch.shape[0] == 0 or patch.shape[1] == 0:
+ continue
+ patch_name = osp.splitext(
+ osp.basename(img_path))[0] + f'_{i}' + osp.splitext(
+ osp.basename(img_path))[1]
+ dst_path = osp.join(self.crop_save_path, split, patch_name)
+ mmcv.imwrite(patch, dst_path)
+ rec_instance = dict(
+ instances=[dict(text=text)],
+ img_path=osp.join(self.cropped_img_dir, split, patch_name))
+ data_list.append(rec_instance)
+
+ return data_list
+
+
+@DATA_CONVERTERS.register_module()
+class WildReceiptConverter(BaseDataConverter):
+ """MMOCR only supports wildreceipt dataset for KIE task now. This converter
+ converts the wildreceipt dataset from close set to open set.
+
+ Args:
+ splits (List): A list of splits to be processed.
+ data_root (str): Path to the data root.
+ gatherer (Dict): Config dict for gathering the dataset files.
+ parser (Dict): Config dict for parsing the dataset annotations.
+ dumper (Dict): Config dict for dumping the dataset files.
+        dataset_name (str): Name of the dataset.
+        nproc (int): Number of processes to process the data.
+        delete (Optional[List]): A list of files to be deleted after
+            conversion. Defaults to None.
+ merge_bg_others (bool): If True, give the same label to "background"
+ class and "others" class. Defaults to True.
+ ignore_idx (int): Index for ``ignore`` class. Defaults to 0.
+ others_idx (int): Index for ``others`` class. Defaults to 25.
+ """
+
+ def __init__(self,
+ splits: List,
+ data_root: str,
+ gatherer: Dict,
+ parser: Dict,
+ dumper: Dict,
+ dataset_name: str,
+ nproc: int,
+ delete: Optional[List] = None,
+ merge_bg_others: bool = False,
+ ignore_idx: int = 0,
+ others_idx: int = 25):
+ self.ignore_idx = ignore_idx
+ self.others_idx = others_idx
+ self.merge_bg_others = merge_bg_others
+ parser.update(dict(ignore=ignore_idx))
+ super().__init__(
+ splits=splits,
+ data_root=data_root,
+ gatherer=gatherer,
+ parser=parser,
+ dumper=dumper,
+ dataset_name=dataset_name,
+ nproc=nproc,
+ task='kie',
+ delete=delete)
+
+ def add_meta(self, samples: List) -> List:
+ """No meta info is required for the wildreceipt dataset."""
+ return samples
+
+ def pack_instance(self, sample: str, split: str):
+ """Pack line-json str of close set to line-json str of open set.
+
+ Args:
+ sample (str): The string to be deserialized to
+ the close set dictionary object.
+ split (str): The split of the instance.
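+
+        Examples:
+            With the default mapping, a close-set annotation with label 2
+            (a key class such as "Store_name_key") is mapped to open-set
+            label 1 (key), the annotation of its paired value class
+            (label 1) is mapped to open-set label 2 (value), and the two
+            share the same ``edge`` id.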
+ """
+ # Two labels at the same index of the following two lists
+ # make up a key-value pair. For example, in wildreceipt,
+ # closeset_key_inds[0] maps to "Store_name_key"
+        # and closeset_value_inds[0] maps to "Store_name_value".
+ closeset_key_inds = list(range(2, self.others_idx, 2))
+ closeset_value_inds = list(range(1, self.others_idx, 2))
+
+ openset_node_label_mapping = {
+ 'bg': 0,
+ 'key': 1,
+ 'value': 2,
+ 'others': 3
+ }
+ if self.merge_bg_others:
+ openset_node_label_mapping['others'] = openset_node_label_mapping[
+ 'bg']
+
+ closeset_obj = json.loads(sample)
+ openset_obj = {
+ 'file_name': closeset_obj['file_name'],
+ 'height': closeset_obj['height'],
+ 'width': closeset_obj['width'],
+ 'annotations': []
+ }
+
+ edge_idx = 1
+ label_to_edge = {}
+ for anno in closeset_obj['annotations']:
+ label = anno['label']
+ if label == self.ignore_idx:
+ anno['label'] = openset_node_label_mapping['bg']
+ anno['edge'] = edge_idx
+ edge_idx += 1
+ elif label == self.others_idx:
+ anno['label'] = openset_node_label_mapping['others']
+ anno['edge'] = edge_idx
+ edge_idx += 1
+ else:
+ edge = label_to_edge.get(label, None)
+ if edge is not None:
+ anno['edge'] = edge
+ if label in closeset_key_inds:
+ anno['label'] = openset_node_label_mapping['key']
+ elif label in closeset_value_inds:
+ anno['label'] = openset_node_label_mapping['value']
+ else:
+ tmp_key = 'key'
+ if label in closeset_key_inds:
+ label_with_same_edge = closeset_value_inds[
+ closeset_key_inds.index(label)]
+ elif label in closeset_value_inds:
+ label_with_same_edge = closeset_key_inds[
+ closeset_value_inds.index(label)]
+ tmp_key = 'value'
+ edge_counterpart = label_to_edge.get(
+ label_with_same_edge, None)
+ if edge_counterpart is not None:
+ anno['edge'] = edge_counterpart
+ else:
+ anno['edge'] = edge_idx
+ edge_idx += 1
+ anno['label'] = openset_node_label_mapping[tmp_key]
+ label_to_edge[label] = anno['edge']
+
+ openset_obj['annotations'] = closeset_obj['annotations']
+
+ return json.dumps(openset_obj, ensure_ascii=False)
diff --git a/mmocr/datasets/preparers/data_obtainer.py b/mmocr/datasets/preparers/data_obtainer.py
new file mode 100644
index 00000000..43906967
--- /dev/null
+++ b/mmocr/datasets/preparers/data_obtainer.py
@@ -0,0 +1,155 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import shutil
+import ssl
+import urllib.request as request
+from typing import Dict, List, Optional, Tuple
+
+from mmengine import mkdir_or_exist
+
+from mmocr.utils import check_integrity, is_archive
+from .data_preparer import DATA_OBTAINERS
+
+ssl._create_default_https_context = ssl._create_unverified_context
+
+
+@DATA_OBTAINERS.register_module()
+class NaiveDataObtainer:
+ """A naive pipeline for obtaining dataset.
+
+ download -> extract -> move
+
+ Args:
+ files (list[dict]): A list of file information.
+ cache_path (str): The path to cache the downloaded files.
+ data_root (str): The root path of the dataset.
+ task (str): The task of the dataset.
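+
+    Examples:
+        An illustrative ``files`` entry (the url, md5 and paths below are
+        placeholders rather than real values):
+
+        >>> files = [
+        >>>     dict(
+        >>>         url='https://example.com/ch4_train_images.zip',
+        >>>         save_name='ic15_textdet_train_img.zip',
+        >>>         md5='<md5 of the archive>',
+        >>>         mapping=[['ic15_textdet_train_img',
+        >>>                   'textdet_imgs/train']])
+        >>> ]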
+ """
+
+ def __init__(self, files: List[Dict], cache_path: str, data_root: str,
+ task: str) -> None:
+ self.files = files
+ self.cache_path = cache_path
+ self.data_root = data_root
+ self.task = task
+ mkdir_or_exist(self.data_root)
+ mkdir_or_exist(osp.join(self.data_root, f'{task}_imgs'))
+ mkdir_or_exist(osp.join(self.data_root, 'annotations'))
+ mkdir_or_exist(self.cache_path)
+
+ def __call__(self):
+ for file in self.files:
+ save_name = file.get('save_name', None)
+ url = file.get('url', None)
+ md5 = file.get('md5', None)
+ download_path = osp.join(
+ self.cache_path,
+ osp.basename(url) if save_name is None else save_name)
+ # Download required files
+ if not check_integrity(download_path, md5):
+ self.download(url=url, dst_path=download_path)
+ # Extract downloaded zip files to data root
+ self.extract(src_path=download_path, dst_path=self.data_root)
+ # Move & Rename dataset files
+ if 'mapping' in file:
+ self.move(mapping=file['mapping'])
+ self.clean()
+
+ def download(self, url: Optional[str], dst_path: str) -> None:
+ """Download file from given url with progress bar.
+
+ Args:
+ url (str): The url to download the file.
+ dst_path (str): The destination path to save the file.
+ """
+
+ def progress(down: float, block: float, size: float) -> None:
+ """Show download progress.
+
+ Args:
+ down (float): Downloaded size.
+ block (float): Block size.
+ size (float): Total size of the file.
+ """
+
+ percent = min(100. * down * block / size, 100)
+ file_name = osp.basename(dst_path)
+ print(f'\rDownloading {file_name}: {percent:.2f}%', end='')
+
+ if url is None and not osp.exists(dst_path):
+ raise FileNotFoundError(
+ 'Direct url is not available for this dataset.'
+ ' Please manually download the required files'
+ ' following the guides.')
+
+ print(f'Start to download {osp.basename(dst_path)}...')
+        print('If you are stuck here for a long time, '
+              'please check your network.')
+ request.urlretrieve(url, dst_path, progress)
+
+ def extract(self,
+ src_path: str,
+ dst_path: str,
+ delete: bool = False) -> None:
+ """Extract zip/tar.gz files.
+
+ Args:
+ src_path (str): Path to the zip file.
+ dst_path (str): Path to the destination folder.
+ delete (bool, optional): Whether to delete the zip file. Defaults
+ to False.
+ """
+
+ if not is_archive(src_path):
+            # Move the file to the destination folder if it is not an archive
+ shutil.move(src_path, dst_path)
+ return
+
+ zip_name = osp.basename(src_path).split('.')[0]
+ if dst_path is None:
+ dst_path = osp.join(osp.dirname(src_path), zip_name)
+ else:
+ dst_path = osp.join(dst_path, zip_name)
+ mkdir_or_exist(dst_path)
+ print(f'Extracting: {osp.basename(src_path)}')
+ if src_path.endswith('.zip'):
+            # zipfile is part of the standard library; no extra install needed
+            import zipfile
+ with zipfile.ZipFile(src_path, 'r') as zip_ref:
+ zip_ref.extractall(dst_path)
+ elif src_path.endswith('.tar.gz') or src_path.endswith('.tar'):
+ if src_path.endswith('.tar.gz'):
+ mode = 'r:gz'
+ elif src_path.endswith('.tar'):
+ mode = 'r:'
+            # tarfile is part of the standard library; no extra install needed
+            import tarfile
+ with tarfile.open(src_path, mode) as tar_ref:
+ tar_ref.extractall(dst_path)
+ if delete:
+ os.remove(src_path)
+
+ def move(self, mapping: List[Tuple[str, str]]) -> None:
+ """Rename and move dataset files one by one.
+
+ Args:
+            mapping (List[Tuple[str, str]]): A list of tuples, each of which
+                contains the source file name and the destination file name.
+ """
+ for src, dst in mapping:
+ src = osp.join(self.data_root, src)
+ dst = osp.join(self.data_root, dst)
+ if osp.exists(src) and not osp.exists(dst):
+ shutil.move(src, dst)
+
+ def clean(self) -> None:
+ """Remove empty dirs."""
+ for root, dirs, files in os.walk(self.data_root, topdown=False):
+ if not files and not dirs:
+ os.rmdir(root)
diff --git a/mmocr/datasets/preparers/data_preparer.py b/mmocr/datasets/preparers/data_preparer.py
new file mode 100644
index 00000000..98642aee
--- /dev/null
+++ b/mmocr/datasets/preparers/data_preparer.py
@@ -0,0 +1,120 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import time
+
+from mmengine import Registry
+from mmengine.config import Config
+
+DATA_OBTAINERS = Registry('data_obtainer')
+DATA_CONVERTERS = Registry('data_converter')
+DATA_PARSERS = Registry('data_parser')
+DATA_DUMPERS = Registry('data_dumper')
+
+
+class DatasetPreparer:
+ """Base class of dataset preparer.
+
+ Dataset preparer is used to prepare dataset for MMOCR. It mainly consists
+ of two steps:
+
+ 1. Obtain the dataset
+ - Download
+ - Extract
+ - Move/Rename
+ 2. Process the dataset
+ - Parse original annotations
+ - Convert to mmocr format
+ - Dump the annotation file
+ - Clean useless files
+
+ After all these steps, the original datasets have been prepared for
+ usage in MMOCR. Check out the dataset format used in MMOCR here:
+ https://mmocr.readthedocs.io/en/dev-1.x/user_guides/dataset_prepare.html
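+
+    Examples:
+        A minimal usage sketch (the dataset config is assumed to live
+        under ``dataset_zoo/icdar2015``):
+
+        >>> preparer = DatasetPreparer('dataset_zoo', 'icdar2015', 'textdet')
+        >>> preparer()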
+ """
+
+ def __init__(self,
+ cfg_path: str,
+ dataset_name: str,
+ task: str = 'textdet',
+ nproc: int = 4) -> None:
+ """Initialization. Load necessary meta info and print license.
+
+ Args:
+ cfg_path (str): Path to dataset config file.
+ dataset_name (str): Dataset name.
+ task (str): Task type. Options are 'textdet', 'textrecog',
+                'textspotting', and 'kie'. Defaults to 'textdet'.
+ nproc (int): Number of parallel processes. Defaults to 4.
+ """
+ cfg_path = osp.join(cfg_path, dataset_name)
+ self.nproc = nproc
+ self.task = task
+ self.dataset_name = dataset_name
+ self.parse_meta(cfg_path)
+ self.parse_cfg(cfg_path)
+
+ def __call__(self):
+ """Prepare the dataset."""
+ if self.with_obtainer:
+ print('Obtaining Dataset...')
+ self.data_obtainer()
+ if self.with_converter:
+ print('Converting Dataset...')
+ self.data_converter()
+
+ def parse_meta(self, cfg_path: str) -> None:
+ """Parse meta file.
+
+ Args:
+ cfg_path (str): Path to meta file.
+ """
+ try:
+ meta = Config.fromfile(osp.join(cfg_path, 'metafile.yml'))
+ except FileNotFoundError:
+ return
+ assert self.task in meta['Data']['Tasks'], \
+ f'Task {self.task} not supported!'
+ # License related
+ if meta['Data']['License']['Type']:
+ print(f"\033[1;33;40mDataset Name: {meta['Name']}")
+ print(f"License Type: {meta['Data']['License']['Type']}")
+ print(f"License Link: {meta['Data']['License']['Link']}")
+ print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m")
+            print(
+                '\033[1;31;43mMMOCR does not own the dataset. By using this '
+                'dataset, you must accept the license provided by its '
+                'owners and cite the corresponding papers appropriately.')
+            print('If you do not agree with the above license, please cancel '
+                'the process immediately by pressing Ctrl+C. Otherwise, '
+                'you are deemed to have accepted the terms and '
+                'conditions.\033[0m')
+ for i in range(5):
+ print(f'{5-i}...')
+ time.sleep(1)
+
+ def parse_cfg(self, cfg_path: str) -> None:
+ """Parse dataset config file.
+
+ Args:
+ cfg_path (str): Path to dataset config file.
+ """
+ cfg_path = osp.join(cfg_path, self.task + '.py')
+ assert osp.exists(cfg_path), f'Config file {cfg_path} not found!'
+ cfg = Config.fromfile(cfg_path)
+
+ if 'data_obtainer' in cfg:
+ cfg.data_obtainer.update(task=self.task)
+ self.data_obtainer = DATA_OBTAINERS.build(cfg.data_obtainer)
+ if 'data_converter' in cfg:
+ cfg.data_converter.update(
+ dict(nproc=self.nproc, dataset_name=self.dataset_name))
+ self.data_converter = DATA_CONVERTERS.build(cfg.data_converter)
+
+ @property
+ def with_obtainer(self) -> bool:
+ """bool: whether the data preparer has an obtainer"""
+ return getattr(self, 'data_obtainer', None) is not None
+
+ @property
+ def with_converter(self) -> bool:
+ """bool: whether the data preparer has an converter"""
+ return getattr(self, 'data_converter', None) is not None
diff --git a/mmocr/datasets/preparers/dumpers/__init__.py b/mmocr/datasets/preparers/dumpers/__init__.py
new file mode 100644
index 00000000..4dc93d9c
--- /dev/null
+++ b/mmocr/datasets/preparers/dumpers/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .dumpers import JsonDumper, WildreceiptOpensetDumper
+
+__all__ = ['JsonDumper', 'WildreceiptOpensetDumper']
diff --git a/mmocr/datasets/preparers/dumpers/dumpers.py b/mmocr/datasets/preparers/dumpers/dumpers.py
new file mode 100644
index 00000000..8cc8d9a1
--- /dev/null
+++ b/mmocr/datasets/preparers/dumpers/dumpers.py
@@ -0,0 +1,73 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Dict, List
+
+import mmengine
+
+from mmocr.utils import list_to_file
+from ..data_preparer import DATA_DUMPERS
+
+
+@DATA_DUMPERS.register_module()
+class JsonDumper:
+
+ def __init__(self, task: str, dataset_name: str) -> None:
+ self.task = task
+ self.dataset_name = dataset_name
+
+ def dump(self, data: Dict, data_root: str, split: str) -> str:
+ """Dump data to json file.
+
+ Args:
+ data (Dict): Data to be dumped.
+ data_root (str): Root directory of data.
+ split (str): Split of data.
+
+ Returns:
+ str: String of dataset config.
+
+ Examples:
+ The returned dataset config
+ >>> ic15_rec_train = dict(
+ >>> type='OCRDataset',
+ >>> data_root=ic15_rec_data_root,
+ >>> ann_file='textrecog_train.json',
+ >>> test_mode=False,
+ >>> pipeline=None)
+ """
+
+ dst_file = osp.join(data_root, f'{self.task}_{split}.json')
+ mmengine.dump(data, dst_file)
+
+        cfg = f'\n{self.dataset_name}_{self.task}_{split} = dict(\n'
+ cfg += ' type=\'OCRDataset\',\n'
+ cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501
+ cfg += f' ann_file=\'{osp.basename(dst_file)}\',\n'
+ if split == 'train' and self.task == 'textdet':
+ cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n'
+ elif split in ['test', 'val']:
+ cfg += ' test_mode=True,\n'
+ cfg += ' pipeline=None)\n'
+
+ return cfg
+
+
+@DATA_DUMPERS.register_module()
+class WildreceiptOpensetDumper:
+
+ def __init__(self, task: str) -> None:
+ self.task = task
+
+ def dump(self, data: List, data_root: str, split: str) -> str:
+ """Dump data to txt file.
+
+ Args:
+ data (List): Data to be dumped.
+ data_root (str): Root directory of data.
+ split (str): Split of data.
+ """
+
+ list_to_file(osp.join(data_root, f'openset_{split}.txt'), data)
+
+ return None
diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py
new file mode 100644
index 00000000..83681eab
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .coco_parser import COCOTextDetAnnParser
+from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
+ ICDARTxtTextRecogAnnParser)
+from .totaltext_parser import TotaltextTextDetAnnParser
+from .wildreceipt_parser import WildreceiptKIEAnnParser
+
+__all__ = [
+ 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
+ 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
+ 'COCOTextDetAnnParser'
+]
diff --git a/mmocr/datasets/preparers/parsers/base.py b/mmocr/datasets/preparers/parsers/base.py
new file mode 100644
index 00000000..4228fa67
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/base.py
@@ -0,0 +1,91 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import abstractmethod
+from functools import partial
+from typing import Dict, List, Optional, Tuple, Union
+
+from mmengine import track_parallel_progress
+
+
+class BaseParser:
+ """Base class for parsing annotations.
+
+ Args:
+ data_root (str, optional): Path to the data root. Defaults to None.
+ nproc (int, optional): Number of processes. Defaults to 1.
+ """
+
+ def __init__(self,
+ data_root: Optional[str] = None,
+ nproc: int = 1) -> None:
+ self.data_root = data_root
+ self.nproc = nproc
+
+ def __call__(self, files: List[Tuple], split: str) -> List:
+ """Parse annotations.
+
+ Args:
+            files (List[Tuple]): A list of tuples of
+ (image_path, annotation_path).
+ split (str): The split of the dataset.
+
+ Returns:
+            List: A list of tuples of (image_path, instances).
+ """
+ samples = self.parse_files(files, split)
+ return samples
+
+ def parse_files(self, files: List[Tuple], split: str) -> List[Tuple]:
+ """Convert annotations to MMOCR format.
+
+ Args:
+            files (List[Tuple]): A list of tuples of
+                (image_path, annotation_path).
+
+ Returns:
+            List[Tuple]: A list of tuples of (image_path, instances).
+ """
+ func = partial(self.parse_file, split=split)
+ samples = track_parallel_progress(func, files, nproc=self.nproc)
+ return samples
+
+ @abstractmethod
+ def parse_file(self, file: Tuple, split: str) -> Tuple:
+ """Convert annotation for a single image.
+
+ Args:
+ file (Tuple): A tuple of path to image and annotation
+ split (str): Current split.
+
+ Returns:
+ Tuple: A tuple of (img_path, instance). Instance is a dict
+ containing parsed annotations, which should contain the
+ following keys:
+ - 'poly' or 'box' (textdet or textspotting)
+ - 'text' (textspotting or textrecog)
+                - 'ignore' (all tasks)
+ """
+ raise NotImplementedError
+
+ def loader(self,
+ file_path: str,
+ separator: str = ',',
+ format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
+ encoding='utf-8') -> Union[Dict, str]:
+ """A basic loader designed for .txt format annotation.
+
+ Args:
+ file_path (str): Path to the txt file.
+ separator (str, optional): Separator of data. Defaults to ','.
+ format (str, optional): Annotation format.
+ Defaults to 'x1,y1,x2,y2,x3,y3,x4,y4,trans'.
+ encoding (str, optional): Encoding format. Defaults to 'utf-8'.
+
+ Yields:
+ Iterator[Union[Dict, str]]: Original text line or a dict containing
+ the information of the text line.
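+
+        Examples:
+            With the default ``separator`` and ``format``, a line such as
+            ``377,117,463,117,465,130,378,130,Genaxis Theatre`` is yielded
+            as:
+
+            >>> {'x1': '377', 'y1': '117', 'x2': '463', 'y2': '117',
+            >>>  'x3': '465', 'y3': '130', 'x4': '378', 'y4': '130',
+            >>>  'trans': 'Genaxis Theatre'}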
+ """
+ keys = format.split(separator)
+ with open(file_path, 'r', encoding=encoding) as f:
+ for line in f.readlines():
+ line = line.strip()
+ if line:
+ yield dict(zip(keys, line.split(separator)))
diff --git a/mmocr/datasets/preparers/parsers/coco_parser.py b/mmocr/datasets/preparers/parsers/coco_parser.py
new file mode 100644
index 00000000..9b1cc859
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/coco_parser.py
@@ -0,0 +1,46 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+from mmdet.datasets.api_wrappers import COCO
+
+from mmocr.datasets.preparers.data_preparer import DATA_PARSERS
+from mmocr.datasets.preparers.parsers.base import BaseParser
+
+
+@DATA_PARSERS.register_module()
+class COCOTextDetAnnParser(BaseParser):
+ """COCO Text Detection Parser.
+
+ Args:
+ data_root (str): The root path of the dataset. Defaults to None.
+ nproc (int): The number of processes to parse the annotation. Defaults
+ to 1.
+ """
+
+ def __init__(self, data_root: str = None, nproc: int = 1) -> None:
+
+ super().__init__(nproc=nproc, data_root=data_root)
+
+    def parse_files(self, files: str, split: str = None) -> List[Tuple]:
+        """Parse annotations from a COCO-format annotation file."""
+ samples = list()
+ coco = COCO(files)
+ img_ids = coco.get_img_ids()
+
+ total_ann_ids = []
+ for img_id in img_ids:
+ img_info = coco.load_imgs([img_id])[0]
+ img_info['img_id'] = img_id
+ img_path = img_info['file_name']
+ ann_ids = coco.get_ann_ids(img_ids=[img_id])
+ ann_info = coco.load_anns(ann_ids)
+ total_ann_ids.extend(ann_ids)
+ instances = list()
+ for ann in ann_info:
+ instances.append(
+ dict(
+ poly=ann['segmentation'][0],
+ text=ann.get('text', None),
+ ignore=ann.get('iscrowd', False)))
+ samples.append((img_path, instances))
+ return samples
diff --git a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py
new file mode 100644
index 00000000..d69eb0d7
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py
@@ -0,0 +1,104 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+from ..data_preparer import DATA_PARSERS
+from .base import BaseParser
+
+
+@DATA_PARSERS.register_module()
+class ICDARTxtTextDetAnnParser(BaseParser):
+ """ICDAR2015 Text Detection Parser.
+
+ The original annotation format of this dataset is stored in txt files,
+    where each line takes the following format:
+ x1, y1, x2, y2, x3, y3, x4, y4, transcription
+
+ Args:
+ separator (str): The separator between each element in a line. Defaults
+ to ','.
+ ignore (str): The text to be ignored. Defaults to '###'.
+ format (str): The format of the annotation. Defaults to
+            'x1,y1,x2,y2,x3,y3,x4,y4,trans'.
+ encoding (str): The encoding of the annotation file. Defaults to
+ 'utf-8-sig'.
+ nproc (int): The number of processes to parse the annotation. Defaults
+ to 1.
+ remove_strs (List[str], Optional): Used to remove redundant strings in
+ the transcription. Defaults to None.
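+
+    Examples:
+        A typical annotation line looks like:
+
+        377,117,463,117,465,130,378,130,Genaxis Theatre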
+ """
+
+ def __init__(self,
+ separator: str = ',',
+ ignore: str = '###',
+ format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
+ encoding: str = 'utf-8-sig',
+ nproc: int = 1,
+ remove_strs: Optional[List[str]] = None) -> None:
+ self.sep = separator
+ self.format = format
+ self.encoding = encoding
+ self.ignore = ignore
+ self.remove_strs = remove_strs
+ super().__init__(nproc=nproc)
+
+ def parse_file(self, file: Tuple, split: str) -> Tuple:
+ """Parse single annotation."""
+ img_file, txt_file = file
+ instances = list()
+ for anno in self.loader(txt_file, self.sep, self.format,
+ self.encoding):
+ anno = list(anno.values())
+ if self.remove_strs is not None:
+ for flag in self.remove_strs:
+ for i in range(len(anno)):
+ if flag in anno[i]:
+ anno[i] = anno[i].replace(flag, '')
+ poly = list(map(float, anno[0:-1]))
+ text = anno[-1]
+ instances.append(
+ dict(poly=poly, text=text, ignore=text == self.ignore))
+
+ return img_file, instances
+
+
+@DATA_PARSERS.register_module()
+class ICDARTxtTextRecogAnnParser(BaseParser):
+ """ICDAR2015 Text Detection Parser.
+
+ The original annotation format of this dataset is stored in txt files,
+ which is formed as the following format:
+ img_path, transcription
+
+ Args:
+ separator (str): The separator between each element in a line. Defaults
+ to ','.
+ ignore (str): The text to be ignored. Defaults to '#'.
+        format (str): The format of the annotation. Defaults to 'img,text'.
+ encoding (str): The encoding of the annotation file. Defaults to
+ 'utf-8-sig'.
+ nproc (int): The number of processes to parse the annotation. Defaults
+ to 1.
+ """
+
+ def __init__(self,
+ separator: str = ',',
+ ignore: str = '#',
+ format: str = 'img,text',
+ encoding: str = 'utf-8-sig',
+ nproc: int = 1) -> None:
+ self.sep = separator
+ self.format = format
+ self.encoding = encoding
+ self.ignore = ignore
+ super().__init__(nproc=nproc)
+
+ def parse_files(self, files: str, split: str) -> List:
+ """Parse annotations."""
+ assert isinstance(files, str)
+ samples = list()
+ for anno in self.loader(
+ file_path=files, format=self.format, encoding=self.encoding):
+ text = anno['text'].strip().replace('"', '')
+ samples.append((anno['img'], text))
+
+ return samples
diff --git a/mmocr/datasets/preparers/parsers/totaltext_parser.py b/mmocr/datasets/preparers/parsers/totaltext_parser.py
new file mode 100644
index 00000000..1a7d65c5
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/totaltext_parser.py
@@ -0,0 +1,103 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import re
+from typing import Dict, Tuple
+
+import yaml
+
+from ..data_preparer import DATA_PARSERS
+from .base import BaseParser
+
+
+@DATA_PARSERS.register_module()
+class TotaltextTextDetAnnParser(BaseParser):
+ """TotalText Text Detection Parser.
+
+ The original annotation format of this dataset is stored in txt files,
+    where each annotation takes the following format:
+ x: [[x1 x2 x3 ... xn]], y: [[y1 y2 y3 ... yn]],
+ ornt: [u'c'], transcriptions: [u'transcription']
+
+ Args:
+ data_root (str): Path to the dataset root.
+        ignore (str): The text of the ignored instances. Defaults to '#'.
+        nproc (int): Number of processes to load the data. Defaults to 1.
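+
+    Examples:
+        A typical annotation (possibly wrapped over several physical lines
+        in the original file) looks like:
+
+        x: [[ 53 120 121 56]], y: [[446 443 456 458]],
+        ornt: [u'h'], transcriptions: [u'PERUNDING']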
+ """
+
+ def __init__(self,
+ data_root: str,
+ ignore: str = '#',
+ nproc: int = 1) -> None:
+ self.ignore = ignore
+ super().__init__(data_root=data_root, nproc=nproc)
+
+ def parse_file(self, file: Tuple, split: str) -> Dict:
+ """Convert single annotation."""
+ img_file, txt_file = file
+ instances = list()
+ for poly, text in self.loader(txt_file):
+ instances.append(
+ dict(poly=poly, text=text, ignore=text == self.ignore))
+
+ return img_file, instances
+
+ def loader(self, file_path: str) -> str:
+ """The annotation of the totaltext dataset may be stored in multiple
+ lines, this loader is designed for this special case.
+
+ Args:
+ file_path (str): Path to the txt file
+
+        Yields:
+ str: Complete annotation of the txt file
+ """
+
+ def parsing_line(line: str) -> Tuple:
+ """Parsing a line of the annotation.
+
+ Args:
+ line (str): A line of the annotation.
+
+ Returns:
+ Tuple: A tuple of (polygon, transcription).
+ """
+ line = '{' + line.replace('[[', '[').replace(']]', ']') + '}'
+ ann_dict = re.sub('([0-9]) +([0-9])', r'\1,\2', line)
+ ann_dict = re.sub('([0-9]) +([ 0-9])', r'\1,\2', ann_dict)
+ ann_dict = re.sub('([0-9]) -([0-9])', r'\1,-\2', ann_dict)
+ ann_dict = ann_dict.replace("[u',']", "[u'#']")
+ ann_dict = yaml.safe_load(ann_dict)
+
+ # polygon
+ xs, ys = ann_dict['x'], ann_dict['y']
+ poly = []
+ for x, y in zip(xs, ys):
+ poly.append(x)
+ poly.append(y)
+ # text
+ text = ann_dict['transcriptions']
+ if len(text) == 0:
+ text = '#'
+ else:
+ word = text[0]
+ if len(text) > 1:
+ for ann_word in text[1:]:
+ word += ',' + ann_word
+ text = str(eval(word))
+
+ return poly, text
+
+ with open(file_path, 'r') as f:
+ for idx, line in enumerate(f):
+ line = line.strip()
+ if idx == 0:
+ tmp_line = line
+ continue
+ if not line.startswith('x:'):
+ tmp_line += ' ' + line
+ continue
+ complete_line = tmp_line
+ tmp_line = line
+ yield parsing_line(complete_line)
+
+ if tmp_line != '':
+ yield parsing_line(tmp_line)
diff --git a/mmocr/datasets/preparers/parsers/wildreceipt_parser.py b/mmocr/datasets/preparers/parsers/wildreceipt_parser.py
new file mode 100644
index 00000000..b1a95236
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/wildreceipt_parser.py
@@ -0,0 +1,92 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+from typing import List, Tuple
+
+from mmocr.utils import list_from_file
+from ..data_preparer import DATA_PARSERS
+from .base import BaseParser
+
+
+@DATA_PARSERS.register_module()
+class WildreceiptTextDetAnnParser(BaseParser):
+ """Wildreceipt Text Detection Parser.
+
+ The original annotation format of this dataset is stored in txt files,
+    where each line is a json string in the following format:
+    {"file_name": "xxx/xxx/xx/xxxx.jpeg",
+     "height": 1200,
+     "width": 1600,
+     "annotations": [
+         {"box": [x1, y1, x2, y2, x3, y3, x4, y4],
+          "text": "xxx",
+          "label": 25}]}
+
+ Args:
+ data_root (str): The root path of the dataset.
+ ignore (int): The label to be ignored. Defaults to 0.
+ nproc (int): The number of processes to parse the annotation. Defaults
+ to 1.
+ """
+
+ def __init__(self,
+ data_root: str,
+ ignore: int = 0,
+ nproc: int = 1) -> None:
+ self.ignore = ignore
+ super().__init__(data_root=data_root, nproc=nproc)
+
+    def parse_files(self, files: str, split: str) -> List[Tuple]:
+        """Parse annotations from the annotation file."""
+ closeset_lines = list_from_file(files)
+ samples = list()
+ for line in closeset_lines:
+ instances = list()
+ line = json.loads(line)
+ img_file = osp.join(self.data_root, line['file_name'])
+ for anno in line['annotations']:
+ poly = anno['box']
+ text = anno['text']
+ label = anno['label']
+ instances.append(
+ dict(poly=poly, text=text, ignore=label == self.ignore))
+ samples.append((img_file, instances))
+
+ return samples
+
+
+@DATA_PARSERS.register_module()
+class WildreceiptKIEAnnParser(BaseParser):
+ """Wildreceipt KIE Parser.
+
+ The original annotation format of this dataset is stored in txt files,
+    where each line is a json string in the following format:
+    {"file_name": "xxx/xxx/xx/xxxx.jpeg",
+     "height": 1200,
+     "width": 1600,
+     "annotations": [
+         {"box": [x1, y1, x2, y2, x3, y3, x4, y4],
+          "text": "xxx",
+          "label": 25}]}
+
+ Args:
+ data_root (str): The root path of the dataset.
+ ignore (int): The label to be ignored. Defaults to 0.
+ nproc (int): The number of processes to parse the annotation. Defaults
+ to 1.
+ """
+
+ def __init__(self,
+ data_root: str,
+ ignore: int = 0,
+ nproc: int = 1) -> None:
+ self.ignore = ignore
+ super().__init__(data_root=data_root, nproc=nproc)
+
+    def parse_files(self, files: str, split: str) -> List[str]:
+        """Parse annotations from the annotation file."""
+ closeset_lines = list_from_file(files)
+
+ return closeset_lines
diff --git a/mmocr/utils/__init__.py b/mmocr/utils/__init__.py
index 2f878107..39dfabef 100644
--- a/mmocr/utils/__init__.py
+++ b/mmocr/utils/__init__.py
@@ -6,7 +6,8 @@ from .check_argument import (equal_len, is_2dlist, is_3dlist, is_none_or_type,
is_type_list, valid_boundary)
from .collect_env import collect_env
from .data_converter_utils import dump_ocr_data, recog_anno_to_imginfo
-from .fileio import list_from_file, list_to_file
+from .fileio import (check_integrity, is_archive, list_files, list_from_file,
+ list_to_file)
from .img_utils import crop_img, warp_img
from .mask_utils import fill_hole
from .parsers import LineJsonParser, LineStrParser
@@ -40,5 +41,6 @@ __all__ = [
'ConfigType', 'DetSampleList', 'RecForwardResults', 'InitConfigType',
'OptConfigType', 'OptDetSampleList', 'OptInitConfigType', 'OptMultiConfig',
'OptRecSampleList', 'RecSampleList', 'MultiConfig', 'OptTensor',
- 'ColorType', 'OptKIESampleList', 'KIESampleList'
+ 'ColorType', 'OptKIESampleList', 'KIESampleList', 'is_archive',
+ 'check_integrity', 'list_files'
]
diff --git a/mmocr/utils/fileio.py b/mmocr/utils/fileio.py
index d5651d84..f7fad4ec 100644
--- a/mmocr/utils/fileio.py
+++ b/mmocr/utils/fileio.py
@@ -1,7 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.
-import os
+import hashlib
+import os.path as osp
+import sys
+import warnings
+from glob import glob
+from typing import List
-import mmengine
+from mmengine import mkdir_or_exist
def list_to_file(filename, lines):
@@ -11,7 +16,7 @@ def list_to_file(filename, lines):
filename (str): The output filename. It will be created/overwritten.
lines (list(str)): Data to be written.
"""
- mmengine.mkdir_or_exist(os.path.dirname(filename))
+ mkdir_or_exist(osp.dirname(filename))
with open(filename, 'w', encoding='utf-8') as fw:
for line in lines:
fw.write(f'{line}\n')
@@ -36,3 +41,69 @@ def list_from_file(filename, encoding='utf-8'):
for line in f:
item_list.append(line.rstrip('\n\r'))
return item_list
+
+
+def is_archive(file_path: str) -> bool:
+ """Check whether the file is a supported archive format.
+
+ Args:
+ file_path (str): Path to the file.
+
+ Returns:
+ bool: Whether the file is an archive.
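+
+    Examples:
+        >>> is_archive('/path/to/ch4_training_images.zip')  # illustrative
+        True
+        >>> is_archive('/path/to/gt.txt')
+        False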
+ """
+
+ suffixes = ['zip', 'tar', 'tar.gz']
+
+ for suffix in suffixes:
+ if file_path.endswith(suffix):
+ return True
+ return False
+
+
+def check_integrity(file_path: str,
+ md5: str,
+ chunk_size: int = 1024 * 1024) -> bool:
+ """Check if the file exist and match to the given md5 code.
+
+ Args:
+ file_path (str): Path to the file.
+        md5 (str): MD5 to be matched. If None, the check is skipped.
+ chunk_size (int, optional): Chunk size. Defaults to 1024*1024.
+
+ Returns:
+ bool: Whether the md5 is matched.
+ """
+ if md5 is None:
+ warnings.warn('MD5 is None, skip the integrity check.')
+ return True
+ if not osp.exists(file_path):
+ return False
+
+ if sys.version_info >= (3, 9):
+ hash = hashlib.md5(usedforsecurity=False)
+ else:
+ hash = hashlib.md5()
+ with open(file_path, 'rb') as f:
+ for chunk in iter(lambda: f.read(chunk_size), b''):
+ hash.update(chunk)
+
+ return hash.hexdigest() == md5
+
+
+def list_files(path: str, suffixes: List) -> List:
+ """Retrieve file list from the path.
+
+ Args:
+ path (str): Path to the directory.
+        suffixes (list[str]): Suffixes to be retrieved.
+
+ Returns:
+ List: List of the files.
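+
+    Examples:
+        A usage sketch (the directory path is illustrative):
+
+        >>> list_files('data/icdar2015/textdet_imgs/train',
+        ...            ['.jpg', '.png'])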
+ """
+
+ file_list = []
+ for suffix in suffixes:
+ file_list.extend(glob(osp.join(path, '*' + suffix)))
+
+ return file_list
diff --git a/tests/data/preparer/dummy/metafile.yml b/tests/data/preparer/dummy/metafile.yml
new file mode 100644
index 00000000..7706ef53
--- /dev/null
+++ b/tests/data/preparer/dummy/metafile.yml
@@ -0,0 +1,24 @@
+Name: Dummy Dataset
+Paper:
+ Title: Dummy Dataset
+ URL: https://github.com/open-mmlab/mmocr
+ Venue: MMOCR
+ Year: 2022
+ BibTeX: ''
+Data:
+ Website: https://github.com/open-mmlab/mmocr
+ Language:
+ - English
+ - Chinese
+ Scene:
+ - Natural Scene
+ Granularity:
+ - Word
+ Tasks:
+ - textdet
+ - textrecog
+ - textspotting
+ License:
+ Type: CC BY 4.0
+ Link: https://creativecommons.org/licenses/by/4.0/
+ Format: .txt
diff --git a/tests/data/preparer/dummy/textdet.py b/tests/data/preparer/dummy/textdet.py
new file mode 100644
index 00000000..2fa11b20
--- /dev/null
+++ b/tests/data/preparer/dummy/textdet.py
@@ -0,0 +1,3 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+data_root = 'tests/data/preparer/dummy'
+cache_path = 'tests/data/preparer/dummy'
diff --git a/tests/test_datasets/test_preparers/test_data_preparer.py b/tests/test_datasets/test_preparers/test_data_preparer.py
new file mode 100644
index 00000000..c531db6f
--- /dev/null
+++ b/tests/test_datasets/test_preparers/test_data_preparer.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+from mmocr.datasets.preparers.data_preparer import DatasetPreparer
+
+
+class TestDataPreparer(unittest.TestCase):
+
+ def setUp(self) -> None:
+ self.cfg_path = 'tests/data/preparer'
+ self.dataset_name = 'dummy'
+
+ def test_dataset_preparer(self):
+ preparer = DatasetPreparer(self.cfg_path, self.dataset_name, 'textdet')
+ preparer()
diff --git a/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py b/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py
new file mode 100644
index 00000000..512acedd
--- /dev/null
+++ b/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+import tempfile
+import unittest
+
+from mmocr.datasets.preparers.dumpers import (JsonDumper,
+ WildreceiptOpensetDumper)
+
+
+class TestDumpers(unittest.TestCase):
+
+ def setUp(self) -> None:
+ self.root = tempfile.TemporaryDirectory()
+
+ def test_json_dumpers(self):
+ task, split = 'textdet', 'train'
+ fake_data = dict(
+ metainfo=dict(
+ dataset_type='TextDetDataset',
+ task_name='textdet',
+ category=[dict(id=0, name='text')]))
+
+ dumper = JsonDumper(task, dataset_name='test')
+ dumper.dump(fake_data, self.root.name, split)
+ with open(osp.join(self.root.name, f'{task}_{split}.json'), 'r') as f:
+ data = json.load(f)
+ self.assertEqual(data, fake_data)
+
+ def test_wildreceipt_dumper(self):
+ task, split = 'kie', 'train'
+ fake_data = ['test1', 'test2']
+
+ dumper = WildreceiptOpensetDumper(task)
+ dumper.dump(fake_data, self.root.name, split)
+ with open(osp.join(self.root.name, f'openset_{split}.txt'), 'r') as f:
+ data = f.read().splitlines()
+ self.assertEqual(data, fake_data)
diff --git a/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py
new file mode 100644
index 00000000..e12820a6
--- /dev/null
+++ b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py
@@ -0,0 +1,55 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+import unittest
+
+from mmocr.datasets.preparers.parsers.icdar_txt_parser import (
+ ICDARTxtTextDetAnnParser, ICDARTxtTextRecogAnnParser)
+from mmocr.utils import list_to_file
+
+
+class TestIC15Parsers(unittest.TestCase):
+
+ def setUp(self) -> None:
+ self.root = tempfile.TemporaryDirectory()
+
+ def _create_dummy_ic15_det(self):
+ fake_anno = [
+ '377,117,463,117,465,130,378,130,Genaxis Theatre',
+ '493,115,519,115,519,131,493,131,[06]',
+ '374,155,409,155,409,170,374,170,###',
+ ]
+ ann_file = osp.join(self.root.name, 'ic15_det.txt')
+ list_to_file(ann_file, fake_anno)
+ return (osp.join(self.root.name, 'ic15_det.jpg'), ann_file)
+
+ def _create_dummy_ic15_recog(self):
+ fake_anno = [
+ 'word_1.png, "Genaxis Theatre"',
+ 'word_2.png, "[06]"',
+ 'word_3.png, "62-03"',
+ ]
+ ann_file = osp.join(self.root.name, 'ic15_recog.txt')
+ list_to_file(ann_file, fake_anno)
+ return ann_file
+
+ def test_textdet_parsers(self):
+ parser = ICDARTxtTextDetAnnParser()
+ file = self._create_dummy_ic15_det()
+ img, instances = parser.parse_file(file, 'train')
+ self.assertEqual(img, file[0])
+ self.assertEqual(len(instances), 3)
+ self.assertIn('poly', instances[0])
+ self.assertIn('text', instances[0])
+ self.assertIn('ignore', instances[0])
+ self.assertEqual(instances[0]['text'], 'Genaxis Theatre')
+ self.assertEqual(instances[2]['ignore'], True)
+
+ def test_textrecog_parsers(self):
+ parser = ICDARTxtTextRecogAnnParser()
+ file = self._create_dummy_ic15_recog()
+ samples = parser.parse_files(file, 'train')
+ self.assertEqual(len(samples), 3)
+ img, text = samples[0]
+ self.assertEqual(img, 'word_1.png')
+ self.assertEqual(text, 'Genaxis Theatre')
diff --git a/tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py
new file mode 100644
index 00000000..713d7fc7
--- /dev/null
+++ b/tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+import unittest
+
+from mmocr.datasets.preparers.parsers.totaltext_parser import \
+ TotaltextTextDetAnnParser
+from mmocr.utils import list_to_file
+
+
+class TestTTParsers(unittest.TestCase):
+
+ def setUp(self) -> None:
+ self.root = tempfile.TemporaryDirectory()
+
+ def _create_dummy_tt_det(self):
+ fake_anno = [
+ "x: [[ 53 120 121 56]], y: [[446 443 456 458]], ornt: [u'h'], transcriptions: [u'PERUNDING']", # noqa: E501
+ "x: [[123 165 166 125]], y: [[443 440 453 455]], ornt: [u'h'], transcriptions: [u'PENILAI']", # noqa: E501
+ "x: [[168 179 179 167]], y: [[439 439 452 453]], ornt: [u'#'], transcriptions: [u'#']", # noqa: E501
+ ]
+ ann_file = osp.join(self.root.name, 'tt_det.txt')
+ list_to_file(ann_file, fake_anno)
+ return (osp.join(self.root.name, 'tt_det.jpg'), ann_file)
+
+ def test_textdet_parsers(self):
+ parser = TotaltextTextDetAnnParser(self.root.name)
+ file = self._create_dummy_tt_det()
+ img, instances = parser.parse_file(file, 'train')
+ self.assertEqual(img, file[0])
+ self.assertEqual(len(instances), 3)
+ self.assertIn('poly', instances[0])
+ self.assertIn('text', instances[0])
+ self.assertIn('ignore', instances[0])
+ self.assertEqual(instances[0]['text'], 'PERUNDING')
+ self.assertEqual(instances[2]['ignore'], True)
diff --git a/tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py
new file mode 100644
index 00000000..f4e5510d
--- /dev/null
+++ b/tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py
@@ -0,0 +1,57 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+import tempfile
+import unittest
+
+from mmocr.datasets.preparers.parsers.wildreceipt_parser import (
+ WildreceiptKIEAnnParser, WildreceiptTextDetAnnParser)
+from mmocr.utils import list_to_file
+
+
+class TestWildReceiptParsers(unittest.TestCase):
+
+ def setUp(self) -> None:
+ self.root = tempfile.TemporaryDirectory()
+ fake_sample = dict(
+ file_name='test.jpg',
+ height=100,
+ width=100,
+ annotations=[
+ dict(
+ box=[
+ 550.0, 190.0, 937.0, 190.0, 937.0, 104.0, 550.0, 104.0
+ ],
+ text='test',
+ label=1,
+ ),
+ dict(
+ box=[
+ 1048.0, 211.0, 1074.0, 211.0, 1074.0, 196.0, 1048.0,
+ 196.0
+ ],
+ text='ATOREMGRTOMMILAZZO',
+ label=0,
+ )
+ ])
+ fake_sample = [json.dumps(fake_sample)]
+ self.anno = osp.join(self.root.name, 'wildreceipt.txt')
+ list_to_file(self.anno, fake_sample)
+
+ def test_textdet_parsers(self):
+ parser = WildreceiptTextDetAnnParser(self.root.name)
+ samples = parser.parse_files(self.anno, 'train')
+ self.assertEqual(len(samples), 1)
+ self.assertEqual(osp.basename(samples[0][0]), 'test.jpg')
+ instances = samples[0][1]
+ self.assertEqual(len(instances), 2)
+ self.assertIn('poly', instances[0])
+ self.assertIn('text', instances[0])
+ self.assertIn('ignore', instances[0])
+ self.assertEqual(instances[0]['text'], 'test')
+ self.assertEqual(instances[1]['ignore'], True)
+
+ def test_kie_parsers(self):
+ parser = WildreceiptKIEAnnParser(self.root.name)
+ samples = parser.parse_files(self.anno, 'train')
+ self.assertEqual(len(samples), 1)
diff --git a/tests/test_utils/test_fileio.py b/tests/test_utils/test_fileio.py
index e3f929db..2a2399d5 100644
--- a/tests/test_utils/test_fileio.py
+++ b/tests/test_utils/test_fileio.py
@@ -1,8 +1,11 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
+import os
import tempfile
+import unittest
-from mmocr.utils import list_from_file, list_to_file
+from mmocr.utils import (check_integrity, is_archive, list_files,
+ list_from_file, list_to_file)
lists = [
[],
@@ -102,3 +105,57 @@ def test_list_from_file():
lines = list(map(str, lines))
assert len(lines) == len(lines2)
assert all(line1 == line2 for line1, line2 in zip(lines, lines2))
+
+
+class TestIsArchive(unittest.TestCase):
+
+ def setUp(self) -> None:
+ self.zip = 'data/annotations_123.zip'
+ self.tar = 'data/img.abc.tar'
+ self.targz = 'data/img12345_.tar.gz'
+ self.rar = '/m/abc/t.rar'
+ self.dir = '/a/b/c/'
+
+ def test_is_archive(self):
+ # test zip
+ self.assertTrue(is_archive(self.zip))
+ # test tar
+ self.assertTrue(is_archive(self.tar))
+ # test tar.gz
+ self.assertTrue(is_archive(self.targz))
+ # test rar
+ self.assertFalse(is_archive(self.rar))
+ # test dir
+ self.assertFalse(is_archive(self.dir))
+
+
+class TestCheckIntegrity(unittest.TestCase):
+
+ def setUp(self) -> None:
+ self.file1 = ('tests/data/det_toy_dataset/instances_test.json',
+ '77b17b0125996af519ef82aaacc8d96b')
+ self.file2 = ('tests/data/det_toy_dataset/imgs/test/img_1.jpg',
+ 'abc123')
+ self.file3 = ('abc/abc.jpg', 'abc123')
+
+ def test_check_integrity(self):
+ file, md5 = self.file1
+ self.assertTrue(check_integrity(file, md5))
+ file, md5 = self.file2
+ self.assertFalse(check_integrity(file, md5))
+ self.assertTrue(check_integrity(file, None))
+ file, md5 = self.file3
+ self.assertFalse(check_integrity(file, md5))
+
+
+class TestListFiles(unittest.TestCase):
+
+ def setUp(self) -> None:
+ self.path = 'tests/data/det_toy_dataset/imgs/test'
+
+    def test_list_files(self):
+        suffix = 'jpg'
+        files = list_files(self.path, [suffix])
+ for file in os.listdir(self.path):
+ if file.endswith(suffix):
+ self.assertIn(os.path.join(self.path, file), files)
diff --git a/tools/dataset_converters/prepare_dataset.py b/tools/dataset_converters/prepare_dataset.py
new file mode 100644
index 00000000..aee8ca9e
--- /dev/null
+++ b/tools/dataset_converters/prepare_dataset.py
@@ -0,0 +1,50 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import os.path as osp
+import warnings
+
+from mmocr.datasets.preparers import DatasetPreparer
+from mmocr.utils import register_all_modules
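+
+# Usage sketch (assumes the corresponding dataset configs exist under
+# ``--dataset-zoo-path``):
+#   python tools/dataset_converters/prepare_dataset.py icdar2015 totaltext \
+#       --task textdet --nproc 8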
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Preparing datasets used in MMOCR.')
+ parser.add_argument(
+ 'datasets',
+        help='A list of the dataset names that you would like to prepare.',
+ nargs='+')
+ parser.add_argument(
+ '--nproc', help='Number of processes to run', default=4, type=int)
+ parser.add_argument(
+ '--task',
+ default='textdet',
+ choices=['textdet', 'textrecog', 'textspotting', 'kie'],
+ help='Task type. Options are "textdet", "textrecog", "textspotting"'
+ ' and "kie".')
+ parser.add_argument(
+ '--dataset-zoo-path',
+ default='./dataset_zoo',
+ help='Path to dataset zoo config files.')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+ register_all_modules()
+ for dataset in args.datasets:
+ if not osp.isdir(osp.join(args.dataset_zoo_path, dataset)):
+ warnings.warn(f'{dataset} is not supported yet. Please check '
+ 'dataset zoo for supported datasets.')
+ continue
+ preparer = DatasetPreparer(
+ cfg_path=args.dataset_zoo_path,
+ dataset_name=dataset,
+ task=args.task,
+ nproc=args.nproc)
+ preparer()
+
+
+if __name__ == '__main__':
+ main()