diff --git a/dataset_zoo/cocotextv2/textdet.py b/dataset_zoo/cocotextv2/textdet.py index c2e2fdf8..f8b5b863 100644 --- a/dataset_zoo/cocotextv2/textdet.py +++ b/dataset_zoo/cocotextv2/textdet.py @@ -1,41 +1,39 @@ data_root = 'data/cocotextv2' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='http://images.cocodataset.org/zips/train2014.zip', - save_name='cocotextv2_train_img.zip', - md5='0da8c0bd3d6becc4dcb32757491aca88', - split=['train', 'val'], - content=['image'], - mapping=[['cocotextv2_train_img/train2014', - 'textdet_imgs/train']]), - dict( - url='https://github.com/bgshih/cocotext/releases/download/dl/' - 'cocotext.v2.zip', - save_name='cocotextv2_annotation.zip', - md5='5e39f7d6f2f11324c6451e63523c440c', - split=['train', 'val'], - content=['annotation'], - mapping=[[ - 'cocotextv2_annotation/cocotext.v2.json', - 'annotations/train.json' - ]]), - ]) - -data_converter = dict( - type='TextDetDataConverter', - splits=['train'], - data_root=data_root, - gatherer=dict(type='mono_gather', train_ann='train.json'), - parser=dict( - type='COCOTextDetAnnParser', - variant='cocotext', - data_root=data_root + '/textdet_imgs/train'), +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='http://images.cocodataset.org/zips/train2014.zip', + save_name='cocotextv2_train_img.zip', + md5='0da8c0bd3d6becc4dcb32757491aca88', + content=['image'], + mapping=[[ + 'cocotextv2_train_img/train2014', 'textdet_imgs/imgs' + ]]), + dict( + url='https://github.com/bgshih/cocotext/releases/download/dl/' + 'cocotext.v2.zip', + save_name='cocotextv2_annotation.zip', + md5='5e39f7d6f2f11324c6451e63523c440c', + content=['annotation'], + mapping=[[ + 'cocotextv2_annotation/cocotext.v2.json', + 'annotations/train.json' + ]]), + ]), + gatherer=dict( + type='MonoGatherer', + ann_name='train.json', + img_dir='textdet_imgs/imgs'), + parser=dict(type='COCOTextDetAnnParser', variant='cocotext'), + packer=dict(type='TextDetPacker'), dumper=dict(type='JsonDumper')) -config_generator = dict(type='TextDetConfigGenerator', data_root=data_root) +val_preparer = train_preparer + +delete = ['annotations', 'cocotextv2_annotation', 'cocotextv2_train_img'] config_generator = dict(type='TextDetConfigGenerator') diff --git a/dataset_zoo/cocotextv2/textrecog.py b/dataset_zoo/cocotextv2/textrecog.py index 212c7e7d..6299aeb9 100644 --- a/dataset_zoo/cocotextv2/textrecog.py +++ b/dataset_zoo/cocotextv2/textrecog.py @@ -1,5 +1,6 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextRecogCropConverter') +_base_.train_preparer.packer.type = 'TextRecogCropPacker' +_base_.val_preparer.packer.type = 'TextRecogCropPacker' config_generator = dict(type='TextRecogConfigGenerator') diff --git a/dataset_zoo/cocotextv2/textspotting.py b/dataset_zoo/cocotextv2/textspotting.py index 88486337..9a9124fc 100644 --- a/dataset_zoo/cocotextv2/textspotting.py +++ b/dataset_zoo/cocotextv2/textspotting.py @@ -1,5 +1,6 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextSpottingDataConverter') +_base_.train_preparer.packer.type = 'TextSpottingPacker' +_base_.val_preparer.packer.type = 'TextSpottingPacker' config_generator = dict(type='TextSpottingConfigGenerator') diff --git a/dataset_zoo/cute80/textrecog.py b/dataset_zoo/cute80/textrecog.py index e4fd1cd9..8da3723f 100644 --- a/dataset_zoo/cute80/textrecog.py +++ b/dataset_zoo/cute80/textrecog.py @@ -2,51 +2,65 @@ # the fixed version as
done in # https://github.com/clovaai/deep-text-recognition-benchmark by default. # If you want to use the original version, please comment out the following -# lines: L31-L38, and uncomment L23-L30, L40-L49. +# lines: L10-L31, and uncomment L33-L63 data_root = 'data/cute80' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='https://download.openmmlab.com/mmocr/data/mixture/ct80/' - 'timage.tar.gz', - save_name='ct80.tar.gz', - md5='9f3b1fe0e76f1fdfc70de3a365603d5e', - split=['test'], - content=['image'], - mapping=[['ct80/timage', 'textrecog_imgs/test']]), - # dict( - # url='https://download.openmmlab.com/mmocr/data/mixture/ct80/' - # 'test_label.txt', - # save_name='ct80_test.txt', - # md5='f679dec62916d3268aff9cd81990d260', - # split=['test'], - # content=['annotation'], - # mapping=[['ct80_test.txt', 'annotations/test.txt']]) - dict( - url='https://download.openmmlab.com/mmocr/data/1.x/recog/ct80/' - 'textrecog_test.json', - save_name='textrecog_test.json', - md5='9c5c79d843b900325e7fd453b318cad9', - split=['test'], - content=['annotation']) - ]) +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://download.openmmlab.com/mmocr/data/mixture/ct80/' + 'timage.tar.gz', + save_name='ct80.tar.gz', + md5='9f3b1fe0e76f1fdfc70de3a365603d5e', + split=['test'], + content=['image'], + mapping=[['ct80/timage', 'textrecog_imgs/test']]), + dict( + url='https://download.openmmlab.com/mmocr/data/1.x/recog/ct80/' + 'textrecog_test.json', + save_name='textrecog_test.json', + md5='9c5c79d843b900325e7fd453b318cad9', + split=['test'], + content=['annotation']) + ])) -# data_converter = dict( -# type='TextRecogDataConverter', -# splits=['test'], -# data_root=data_root, -# gatherer=dict(type='mono_gather', test_ann='test.txt'), +# test_preparer = dict( +# obtainer=dict( +# type='NaiveDataObtainer', +# cache_path=cache_path, +# data_root=data_root, +# files=[ +# dict( +# url='https://download.openmmlab.com/mmocr/data/mixture/ct80/' +# 'timage.tar.gz', +# save_name='ct80.tar.gz', +# md5='9f3b1fe0e76f1fdfc70de3a365603d5e', +# split=['test'], +# content=['image'], +# mapping=[['ct80/timage', 'textrecog_imgs/test']]), +# dict( +# url='https://download.openmmlab.com/mmocr/data/mixture/ct80/' +# 'test_label.txt', +# save_name='ct80_test.txt', +# md5='f679dec62916d3268aff9cd81990d260', +# split=['test'], +# content=['annotation'], +# mapping=[['ct80_test.txt', 'annotations/test.txt']]) +# ]), +# gatherer=dict(type='MonoGatherer', ann_name='test.txt'), # parser=dict( # type='ICDARTxtTextRecogAnnParser', # separator=' ', # format='img text ignore1 ignore2'), -# dumper=dict(type='JsonDumper')) - +# packer=dict(type='TextRecogPacker'), +# dumper=dict(type='JsonDumper'), +# ) +delete = ['ct80'] config_generator = dict( type='TextRecogConfigGenerator', data_root=data_root, train_anns=None) diff --git a/dataset_zoo/funsd/textdet.py b/dataset_zoo/funsd/textdet.py index 8e958096..a3b88ab8 100644 --- a/dataset_zoo/funsd/textdet.py +++ b/dataset_zoo/funsd/textdet.py @@ -1,38 +1,62 @@ data_root = 'data/funsd' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='https://guillaumejaume.github.io/FUNSD/dataset.zip', - save_name='funsd.zip', - md5='e05de47de238aa343bf55d8807d659a9', - split=['train', 'test'], - content=['image', 'annotation'], - 
mapping=[ - ['funsd/dataset/training_data/images', 'textdet_imgs/train'], - ['funsd/dataset/testing_data/images', 'textdet_imgs/test'], - [ - 'funsd/dataset/training_data/annotations', - 'annotations/train' - ], - ['funsd/dataset/testing_data/annotations', 'annotations/test'], - ]), - ]) - -data_converter = dict( - type='TextDetDataConverter', - splits=['train', 'test'], - data_root=data_root, +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://guillaumejaume.github.io/FUNSD/dataset.zip', + save_name='funsd.zip', + md5='e05de47de238aa343bf55d8807d659a9', + content=['image', 'annotation'], + mapping=[ + [ + 'funsd/dataset/training_data/images', + 'textdet_imgs/train' + ], + [ + 'funsd/dataset/training_data/annotations', + 'annotations/train' + ], + ]), + ]), gatherer=dict( - type='pair_gather', - suffixes=['.png'], + type='PairGatherer', + img_suffixes=['.png'], rule=[r'(\w+)\.png', r'\1.json']), parser=dict(type='FUNSDTextDetAnnParser'), + packer=dict(type='TextDetPacker'), dumper=dict(type='JsonDumper'), - delete=['annotations', 'funsd']) +) -config_generator = dict(type='TextDetConfigGenerator', data_root=data_root) +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://guillaumejaume.github.io/FUNSD/dataset.zip', + save_name='funsd.zip', + md5='e05de47de238aa343bf55d8807d659a9', + content=['image', 'annotation'], + mapping=[ + ['funsd/dataset/testing_data/images', 'textdet_imgs/test'], + [ + 'funsd/dataset/testing_data/annotations', + 'annotations/test' + ], + ]), + ]), + gatherer=dict( + type='PairGatherer', + img_suffixes=['.png'], + rule=[r'(\w+)\.png', r'\1.json']), + parser=dict(type='FUNSDTextDetAnnParser'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper'), +) +delete = ['annotations', 'funsd'] +config_generator = dict(type='TextDetConfigGenerator') diff --git a/dataset_zoo/funsd/textrecog.py b/dataset_zoo/funsd/textrecog.py index 212c7e7d..c4436e07 100644 --- a/dataset_zoo/funsd/textrecog.py +++ b/dataset_zoo/funsd/textrecog.py @@ -1,5 +1,9 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextRecogCropConverter') +_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train' +_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test' + +_base_.train_preparer.packer.type = 'TextRecogCropPacker' +_base_.test_preparer.packer.type = 'TextRecogCropPacker' config_generator = dict(type='TextRecogConfigGenerator') diff --git a/dataset_zoo/funsd/textspotting.py b/dataset_zoo/funsd/textspotting.py index 88486337..4681c61a 100644 --- a/dataset_zoo/funsd/textspotting.py +++ b/dataset_zoo/funsd/textspotting.py @@ -1,5 +1,8 @@ _base_ = ['textdet.py'] +_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train' +_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test' -data_converter = dict(type='TextSpottingDataConverter') +_base_.train_preparer.packer.type = 'TextSpottingPacker' +_base_.test_preparer.packer.type = 'TextSpottingPacker' config_generator = dict(type='TextSpottingConfigGenerator') diff --git a/dataset_zoo/icdar2013/textdet.py b/dataset_zoo/icdar2013/textdet.py index 170152c5..10f24d52 100644 --- a/dataset_zoo/icdar2013/textdet.py +++ b/dataset_zoo/icdar2013/textdet.py @@ -1,52 +1,29 @@ data_root = 'data/icdar2013' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - 
url='https://rrc.cvc.uab.es/downloads/' - 'Challenge2_Training_Task12_Images.zip', - save_name='ic13_textdet_train_img.zip', - md5='a443b9649fda4229c9bc52751bad08fb', - split=['train'], - content=['image'], - mapping=[['ic13_textdet_train_img', 'textdet_imgs/train']]), - dict( - url='https://rrc.cvc.uab.es/downloads/' - 'Challenge2_Test_Task12_Images.zip', - save_name='ic13_textdet_test_img.zip', - md5='af2e9f070c4c6a1c7bdb7b36bacf23e3', - split=['test'], - content=['image'], - mapping=[['ic13_textdet_test_img', 'textdet_imgs/test']]), - dict( - url='https://rrc.cvc.uab.es/downloads/' - 'Challenge2_Training_Task1_GT.zip', - save_name='ic13_textdet_train_gt.zip', - md5='f3a425284a66cd67f455d389c972cce4', - split=['train'], - content=['annotation'], - mapping=[['ic13_textdet_train_gt', 'annotations/train']]), - dict( - url='https://rrc.cvc.uab.es/downloads/' - 'Challenge2_Test_Task1_GT.zip', - save_name='ic13_textdet_test_gt.zip', - md5='3191c34cd6ac28b60f5a7db7030190fb', - split=['test'], - content=['annotation'], - mapping=[['ic13_textdet_test_gt', 'annotations/test']]), - ]) - -data_converter = dict( - type='TextDetDataConverter', - splits=['train', 'test'], - data_root=data_root, +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'Challenge2_Training_Task12_Images.zip', + save_name='ic13_textdet_train_img.zip', + md5='a443b9649fda4229c9bc52751bad08fb', + content=['image'], + mapping=[['ic13_textdet_train_img', 'textdet_imgs/train']]), + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'Challenge2_Training_Task1_GT.zip', + save_name='ic13_textdet_train_gt.zip', + md5='f3a425284a66cd67f455d389c972cce4', + content=['annotation'], + mapping=[['ic13_textdet_train_gt', 'annotations/train']]), + ]), gatherer=dict( - type='pair_gather', - suffixes=['.jpg'], + type='PairGatherer', + img_suffixes=['.jpg'], rule=[r'(\w+)\.jpg', r'gt_\1.txt']), parser=dict( type='ICDARTxtTextDetAnnParser', @@ -54,6 +31,45 @@ data_converter = dict( format='x1 y1 x2 y2 trans', separator=' ', mode='xyxy'), - dumper=dict(type='JsonDumper')) + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper'), +) -config_generator = dict(type='TextDetConfigGenerator', data_root=data_root) +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'Challenge2_Test_Task12_Images.zip', + save_name='ic13_textdet_test_img.zip', + md5='af2e9f070c4c6a1c7bdb7b36bacf23e3', + content=['image'], + mapping=[['ic13_textdet_test_img', 'textdet_imgs/test']]), + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'Challenge2_Test_Task1_GT.zip', + save_name='ic13_textdet_test_gt.zip', + md5='3191c34cd6ac28b60f5a7db7030190fb', + content=['annotation'], + mapping=[['ic13_textdet_test_gt', 'annotations/test']]), + ]), + gatherer=dict( + type='PairGatherer', + img_suffixes=['.jpg'], + rule=[r'(\w+)\.jpg', r'gt_\1.txt']), + parser=dict( + type='ICDARTxtTextDetAnnParser', + remove_strs=[',', '"'], + format='x1 y1 x2 y2 trans', + separator=' ', + mode='xyxy'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper'), +) +delete = [ + 'annotations', 'ic13_textdet_train_img', 'ic13_textdet_train_gt', + 'ic13_textdet_test_img', 'ic13_textdet_test_gt' +] +config_generator = dict(type='TextDetConfigGenerator') diff --git a/dataset_zoo/icdar2013/textrecog.py b/dataset_zoo/icdar2013/textrecog.py index f3306edd..23fd969f 100644 --- 
a/dataset_zoo/icdar2013/textrecog.py +++ b/dataset_zoo/icdar2013/textrecog.py @@ -8,87 +8,118 @@ data_root = 'data/icdar2013' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='https://rrc.cvc.uab.es/downloads/' - 'Challenge2_Training_Task3_Images_GT.zip', - save_name='ic13_textrecog_train_img_gt.zip', - md5='6f0dbc823645968030878df7543f40a4', - split=['train'], - content=['image'], - mapping=[[ - 'ic13_textrecog_train_img_gt/gt.txt', 'annotations/train.txt' - ], ['ic13_textrecog_train_img_gt', 'textrecog_imgs/train']]), - dict( - url='https://rrc.cvc.uab.es/downloads/' - 'Challenge2_Test_Task3_Images.zip', - save_name='ic13_textrecog_test_img.zip', - md5='3206778eebb3a5c5cc15c249010bf77f', - split=['test'], - content=['image'], - mapping=[['ic13_textrecog_test_img', 'textrecog_imgs/test']]), - dict( - url='https://download.openmmlab.com/mmocr/data/1.x/recog/' - 'icdar_2013/train_labels.json', - save_name='ic13_train_labels.json', - md5='008fcd0056e72c4cf3064fb4d1fce81b', - split=['train'], - content=['annotation'], - mapping=[['ic13_train_labels.json', 'textrecog_train.json']]), - # Note that we offer two versions of test set annotations as follows. - # Please choose one of them to download and comment the other. By - # default, we use the second one. - # 1. The original official annotation, which contains 1095 test - # samples. - # dict( - # url='https://rrc.cvc.uab.es/downloads/' - # 'Challenge2_Test_Task3_GT.txt', - # save_name='ic13_textrecog_test_gt.txt', - # md5='2634060ed8fe6e7a4a9b8d68785835a1', - # split=['test'], - # content=['annotation'], - # mapping=[['ic13_textrecog_test_gt.txt', 'annotations/test.txt']]), # noqa - # 2. The widely-used version for academic purpose, which filters out - # words with non-alphanumeric characters. This version contains 1015 - # test samples. - dict( - url='https://download.openmmlab.com/mmocr/data/1.x/recog/' - 'icdar_2013/textrecog_test_1015.json', - save_name='textrecog_test.json', - md5='68fdd818f63df8b93dc952478952009a', - split=['test'], - content=['annotation'], - ), - # 3. The 857 version further pruned words shorter than 3 characters. - dict( - url='https://download.openmmlab.com/mmocr/data/1.x/recog/' - 'icdar_2013/textrecog_test_857.json', - save_name='textrecog_test_857.json', - md5='3bed3985b0c51a989ad4006f6de8352b', - split=['test'], - content=['annotation'], - ), - ]) +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'Challenge2_Training_Task3_Images_GT.zip', + save_name='ic13_textrecog_train_img_gt.zip', + md5='6f0dbc823645968030878df7543f40a4', + content=['image'], + mapping=[ + # ['ic13_textrecog_train_img_gt/gt.txt', + # 'annotations/train.txt'], + ['ic13_textrecog_train_img_gt', 'textrecog_imgs/train'] + ]), + dict( + url='https://download.openmmlab.com/mmocr/data/1.x/recog/' + 'icdar_2013/train_labels.json', + save_name='ic13_train_labels.json', + md5='008fcd0056e72c4cf3064fb4d1fce81b', + content=['annotation'], + mapping=[['ic13_train_labels.json', 'textrecog_train.json']]), + ])) -# Uncomment the data converter if you want to use the original 1095 version. 
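Every rewritten config in this diff wires each split through the same five stages: an `obtainer` downloads and unpacks the raw files, a `gatherer` matches images to annotation files, a `parser` turns raw annotations into samples, a `packer` converts them into MMOCR's unified format, and a `dumper` writes the result. A minimal sketch of how one split's preparer dict could be driven is below; the `REGISTRY`, `build`, and `run_split` helpers are illustrative assumptions, not the actual `DatasetPreparer` API:

```python
from typing import Any, Callable, Dict

# Hypothetical registry: in MMOCR the components are registered classes
# under mmocr.datasets.preparers; this stand-in just maps names to callables.
REGISTRY: Dict[str, Callable[..., Any]] = {}


def build(cfg: dict) -> Any:
    """Construct a component from a config dict keyed by 'type'."""
    cfg = dict(cfg)
    return REGISTRY[cfg.pop('type')](**cfg)


def run_split(preparer: dict, split: str) -> None:
    """Run one split's pipeline in the order the configs wire it up."""
    build(preparer['obtainer'])()                         # download, unzip, move
    img_paths, ann_paths = build(preparer['gatherer'])()  # match imgs and anns
    samples = build(preparer['parser'])(img_paths, ann_paths)
    packed = build(preparer['packer'])(samples)           # -> unified format
    build(preparer['dumper'])(packed, split)              # e.g. textdet_train.json
```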
-# data_converter = dict( -# type='TextRecogDataConverter', -# splits=['train', 'test'], -# data_root=data_root, -# gatherer=dict( -# type='mono_gather', train_ann='train.txt', test_ann='test.txt'), +# Note that we offer two versions of test set annotations as follows. Please +# choose one of them to download and comment the other. By default, we use the +# second one. +# 1. The original official annotation, which contains 1095 test +# samples. + +# Uncomment the test_preparer if you want to use the original 1095 version. + +# test_preparer = dict( +# obtainer=dict( +# type='NaiveDataObtainer', +# cache_path=cache_path, +# files=[ +# dict( +# url='https://rrc.cvc.uab.es/downloads/' +# 'Challenge2_Test_Task3_Images.zip', +# save_name='ic13_textrecog_test_img.zip', +# md5='3206778eebb3a5c5cc15c249010bf77f', +# split=['test'], +# content=['image'], +# mapping=[['ic13_textrecog_test_img', +# 'textrecog_imgs/test']]), +# dict( +# url='https://rrc.cvc.uab.es/downloads/' +# 'Challenge2_Test_Task3_GT.txt', +# save_name='ic13_textrecog_test_gt.txt', +# md5='2634060ed8fe6e7a4a9b8d68785835a1', +# split=['test'], +# content=['annotation'], +# mapping=[[ +# 'ic13_textrecog_test_gt.txt', 'annotations/test.txt' +# ]]), # noqa +# # The 857 version further pruned words shorter than 3 characters. +# dict( +# url='https://download.openmmlab.com/mmocr/data/1.x/recog/' +# 'icdar_2013/textrecog_test_857.json', +# save_name='textrecog_test_857.json', +# md5='3bed3985b0c51a989ad4006f6de8352b', +# split=['test'], +# content=['annotation'], +# ), +# ]), +# gatherer=dict(type='MonoGatherer', ann_name='test.txt'), # parser=dict( # type='ICDARTxtTextRecogAnnParser', separator=', ', # format='img, text'), # noqa -# dumper=dict(type='JsonDumper')) +# packer=dict(type='TextRecogPacker'), +# dumper=dict(type='JsonDumper'), +# ) + +# 2. The widely-used version for academic purpose, which filters +# out words with non-alphanumeric characters. This version contains +# 1015 test samples. +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'Challenge2_Test_Task3_Images.zip', + save_name='ic13_textrecog_test_img.zip', + md5='3206778eebb3a5c5cc15c249010bf77f', + split=['test'], + content=['image'], + mapping=[['ic13_textrecog_test_img', 'textrecog_imgs/test']]), + dict( + url='https://download.openmmlab.com/mmocr/data/1.x/recog/' + 'icdar_2013/textrecog_test_1015.json', + save_name='textrecog_test.json', + md5='68fdd818f63df8b93dc952478952009a', + split=['test'], + content=['annotation'], + ), + # The 857 version further pruned words shorter than 3 characters.
+ dict( + url='https://download.openmmlab.com/mmocr/data/1.x/recog/' + 'icdar_2013/textrecog_test_857.json', + save_name='textrecog_test_857.json', + md5='3bed3985b0c51a989ad4006f6de8352b', + split=['test'], + content=['annotation'], + ), + ])) config_generator = dict( type='TextRecogConfigGenerator', - data_root=data_root, test_anns=[ dict(ann_file='textrecog_test.json'), dict(dataset_postfix='857', ann_file='textrecog_test_857.json') diff --git a/dataset_zoo/icdar2013/textspotting.py b/dataset_zoo/icdar2013/textspotting.py index 88486337..9e7fde0e 100644 --- a/dataset_zoo/icdar2013/textspotting.py +++ b/dataset_zoo/icdar2013/textspotting.py @@ -1,5 +1,8 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextSpottingDataConverter') +_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train' +_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test' +_base_.train_preparer.packer.type = 'TextSpottingPacker' +_base_.test_preparer.packer.type = 'TextSpottingPacker' config_generator = dict(type='TextSpottingConfigGenerator') diff --git a/dataset_zoo/icdar2015/textdet.py b/dataset_zoo/icdar2015/textdet.py index cfae76d1..612f59f4 100644 --- a/dataset_zoo/icdar2015/textdet.py +++ b/dataset_zoo/icdar2015/textdet.py @@ -1,53 +1,60 @@ data_root = 'data/icdar2015' cache_path = 'data/cache' - -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='https://rrc.cvc.uab.es/downloads/ch4_training_images.zip', - save_name='ic15_textdet_train_img.zip', - md5='c51cbace155dcc4d98c8dd19d378f30d', - split=['train'], - content=['image'], - mapping=[['ic15_textdet_train_img', 'textdet_imgs/train']]), - dict( - url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip', - save_name='ic15_textdet_test_img.zip', - md5='97e4c1ddcf074ffcc75feff2b63c35dd', - split=['test'], - content=['image'], - mapping=[['ic15_textdet_test_img', 'textdet_imgs/test']]), - dict( - url='https://rrc.cvc.uab.es/downloads/' - 'ch4_training_localization_transcription_gt.zip', - save_name='ic15_textdet_train_gt.zip', - md5='3bfaf1988960909014f7987d2343060b', - split=['train'], - content=['annotation'], - mapping=[['ic15_textdet_train_gt', 'annotations/train']]), - dict( - url='https://rrc.cvc.uab.es/downloads/' - 'Challenge4_Test_Task4_GT.zip', - save_name='ic15_textdet_test_gt.zip', - md5='8bce173b06d164b98c357b0eb96ef430', - split=['test'], - content=['annotation'], - mapping=[['ic15_textdet_test_gt', 'annotations/test']]), - ]) - -data_converter = dict( - type='TextDetDataConverter', - splits=['train', 'test'], - data_root=data_root, +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/ch4_training_images.zip', + save_name='ic15_textdet_train_img.zip', + md5='c51cbace155dcc4d98c8dd19d378f30d', + content=['image'], + mapping=[['ic15_textdet_train_img', 'textdet_imgs/train']]), + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'ch4_training_localization_transcription_gt.zip', + save_name='ic15_textdet_train_gt.zip', + md5='3bfaf1988960909014f7987d2343060b', + content=['annotation'], + mapping=[['ic15_textdet_train_gt', 'annotations/train']]), + ]), gatherer=dict( - type='pair_gather', - suffixes=['.jpg', '.JPG'], + type='PairGatherer', + img_suffixes=['.jpg', '.JPG'], rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']), parser=dict(type='ICDARTxtTextDetAnnParser', encoding='utf-8-sig'), + packer=dict(type='TextDetPacker'), dumper=dict(type='JsonDumper'), - 
delete=['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img']) +) -config_generator = dict(type='TextDetConfigGenerator', data_root=data_root) +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip', + save_name='ic15_textdet_test_img.zip', + md5='97e4c1ddcf074ffcc75feff2b63c35dd', + content=['image'], + mapping=[['ic15_textdet_test_img', 'textdet_imgs/test']]), + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'Challenge4_Test_Task4_GT.zip', + save_name='ic15_textdet_test_gt.zip', + md5='8bce173b06d164b98c357b0eb96ef430', + content=['annotation'], + mapping=[['ic15_textdet_test_gt', 'annotations/test']]), + ]), + gatherer=dict( + type='PairGatherer', + img_suffixes=['.jpg', '.JPG'], + rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']), + parser=dict(type='ICDARTxtTextDetAnnParser', encoding='utf-8-sig'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper'), +) + +config_generator = dict(type='TextDetConfigGenerator') +delete = ['annotations', 'ic15_textdet_test_img', 'ic15_textdet_train_img'] diff --git a/dataset_zoo/icdar2015/textrecog.py b/dataset_zoo/icdar2015/textrecog.py index 69372bfb..daecdf90 100644 --- a/dataset_zoo/icdar2015/textrecog.py +++ b/dataset_zoo/icdar2015/textrecog.py @@ -4,61 +4,66 @@ data_root = 'data/icdar2015' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='https://rrc.cvc.uab.es/downloads/' - 'ch4_training_word_images_gt.zip', - save_name='ic15_textrecog_train_img_gt.zip', - md5='600caf8c6a64a3dcf638839820edcca9', - split=['train'], - content=['image', 'annotation'], - mapping=[[ - 'ic15_textrecog_train_img_gt/gt.txt', 'annotations/train.txt' - ], ['ic15_textrecog_train_img_gt', 'textrecog_imgs/train']]), - dict( - url='https://rrc.cvc.uab.es/downloads/ch4_test_word_images_gt.zip', - save_name='ic15_textrecog_test_img.zip', - md5='d7a71585f4cc69f89edbe534e7706d5d', - split=['test'], - content=['image'], - mapping=[['ic15_textrecog_test_img', 'textrecog_imgs/test']]), - dict( - url='https://rrc.cvc.uab.es/downloads/' - 'Challenge4_Test_Task3_GT.txt', - save_name='ic15_textrecog_test_gt.txt', - md5='d7a71585f4cc69f89edbe534e7706d5d', - split=['test'], - content=['annotation'], - mapping=[['ic15_textrecog_test_gt.txt', 'annotations/test.txt']]), - # 3. 
The 1811 version discards non-alphanumeric character images and - # some extremely rotated, perspective-shifted, and curved images for - # evaluation - dict( - url='https://download.openmmlab.com/mmocr/data/1.x/recog/' - 'icdar_2015/textrecog_test_1811.json', - save_name='textrecog_test_1811.json', - md5='8d218ef1c37540ea959e22eeabc79ae4', - split=['test'], - content=['annotation'], - ), - ]) - -data_converter = dict( - type='TextRecogDataConverter', - splits=['train', 'test'], - data_root=data_root, - gatherer=dict( - type='mono_gather', train_ann='train.txt', test_ann='test.txt'), +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'ch4_training_word_images_gt.zip', + save_name='ic15_textrecog_train_img_gt.zip', + md5='600caf8c6a64a3dcf638839820edcca9', + content=['image', 'annotation'], + mapping=[[ + 'ic15_textrecog_train_img_gt/gt.txt', + 'annotations/train.txt' + ], ['ic15_textrecog_train_img_gt', 'textrecog_imgs/train']]), + ]), + gatherer=dict(type='MonoGatherer', ann_name='train.txt'), parser=dict(type='ICDARTxtTextRecogAnnParser', encoding='utf-8-sig'), + packer=dict(type='TextRecogPacker'), dumper=dict(type='JsonDumper')) +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'ch4_test_word_images_gt.zip', + save_name='ic15_textrecog_test_img.zip', + md5='d7a71585f4cc69f89edbe534e7706d5d', + content=['image'], + mapping=[['ic15_textrecog_test_img', 'textrecog_imgs/test']]), + dict( + url='https://rrc.cvc.uab.es/downloads/' + 'Challenge4_Test_Task3_GT.txt', + save_name='ic15_textrecog_test_gt.txt', + md5='d7a71585f4cc69f89edbe534e7706d5d', + content=['annotation'], + mapping=[[ + 'ic15_textrecog_test_gt.txt', 'annotations/test.txt' + ]]), + # 3. 
The 1811 version discards non-alphanumeric character images + # and some extremely rotated, perspective-shifted, and curved + # images for evaluation + dict( + url='https://download.openmmlab.com/mmocr/data/1.x/recog/' + 'icdar_2015/textrecog_test_1811.json', + save_name='textrecog_test_1811.json', + md5='8d218ef1c37540ea959e22eeabc79ae4', + content=['annotation'], + ), + ]), + gatherer=dict(type='MonoGatherer', ann_name='test.txt'), + parser=dict(type='ICDARTxtTextRecogAnnParser', encoding='utf-8-sig'), + packer=dict(type='TextRecogPacker'), + dumper=dict(type='JsonDumper')) +delete = ['annotations'] config_generator = dict( type='TextRecogConfigGenerator', - data_root=data_root, test_anns=[ dict(ann_file='textrecog_test.json'), dict(dataset_postfix='1811', ann_file='textrecog_test_1811.json') diff --git a/dataset_zoo/icdar2015/textspotting.py b/dataset_zoo/icdar2015/textspotting.py index 88486337..fd9307f1 100644 --- a/dataset_zoo/icdar2015/textspotting.py +++ b/dataset_zoo/icdar2015/textspotting.py @@ -1,5 +1,7 @@ _base_ = ['textdet.py'] - -data_converter = dict(type='TextSpottingDataConverter') +_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train' +_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test' +_base_.train_preparer.packer.type = 'TextSpottingPacker' +_base_.test_preparer.packer.type = 'TextSpottingPacker' config_generator = dict(type='TextSpottingConfigGenerator') diff --git a/dataset_zoo/iiit5k/textrecog.py b/dataset_zoo/iiit5k/textrecog.py index ead178a5..872bf5b0 100644 --- a/dataset_zoo/iiit5k/textrecog.py +++ b/dataset_zoo/iiit5k/textrecog.py @@ -1,50 +1,64 @@ data_root = 'data/iiit5k' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='http://cvit.iiit.ac.in/projects/SceneTextUnderstanding/' - 'IIIT5K-Word_V3.0.tar.gz', - save_name='IIIT5K.tar.gz', - md5='56781bc327d22066aa1c239ee788fd46', - split=['test', 'train'], - content=['image'], - mapping=[['IIIT5K/IIIT5K/test', 'textrecog_imgs/test'], - ['IIIT5K/IIIT5K/train', 'textrecog_imgs/train']]), - dict( - url='https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/' - 'test_label.txt', - save_name='iiit5k_test.txt', - md5='82ecfa34a28d59284d1914dc906f5380', - split=['test'], - content=['annotation'], - mapping=[['iiit5k_test.txt', 'annotations/test.txt']]), - dict( - url='https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/' - 'train_label.txt', - save_name='iiit5k_train.txt', - md5='f4731ce1eadc259532c2834266e5126d', - split=['train'], - content=['annotation'], - mapping=[['iiit5k_train.txt', 'annotations/train.txt']]), - ]) - -data_converter = dict( - type='TextRecogDataConverter', - splits=['train', 'test'], - data_root=data_root, - gatherer=dict( - type='mono_gather', train_ann='train.txt', test_ann='test.txt'), +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='http://cvit.iiit.ac.in/projects/SceneTextUnderstanding/' + 'IIIT5K-Word_V3.0.tar.gz', + save_name='IIIT5K.tar.gz', + md5='56781bc327d22066aa1c239ee788fd46', + content=['image'], + mapping=[['IIIT5K/IIIT5K/train', 'textrecog_imgs/train']]), + dict( + url='https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/' + 'train_label.txt', + save_name='iiit5k_train.txt', + md5='f4731ce1eadc259532c2834266e5126d', + content=['annotation'], + mapping=[['iiit5k_train.txt', 'annotations/train.txt']]) + ]), + gatherer=dict(type='MonoGatherer', ann_name='train.txt'), parser=dict( 
type='ICDARTxtTextRecogAnnParser', encoding='utf-8', separator=' ', format='img text'), + packer=dict(type='TextRecogPacker'), dumper=dict(type='JsonDumper'), - delete=['annotations', 'IIIT5K']) +) -config_generator = dict(type='TextRecogConfigGenerator', data_root=data_root) +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='http://cvit.iiit.ac.in/projects/SceneTextUnderstanding/' + 'IIIT5K-Word_V3.0.tar.gz', + save_name='IIIT5K.tar.gz', + md5='56781bc327d22066aa1c239ee788fd46', + content=['image'], + mapping=[['IIIT5K/IIIT5K/test', 'textrecog_imgs/test']]), + dict( + url='https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/' + 'test_label.txt', + save_name='iiit5k_test.txt', + md5='82ecfa34a28d59284d1914dc906f5380', + content=['annotation'], + mapping=[['iiit5k_test.txt', 'annotations/test.txt']]) + ]), + gatherer=dict(type='MonoGatherer', ann_name='test.txt'), + parser=dict( + type='ICDARTxtTextRecogAnnParser', + encoding='utf-8', + separator=' ', + format='img text'), + packer=dict(type='TextRecogPacker'), + dumper=dict(type='JsonDumper'), +) +delete = ['annotations', 'IIIT5K'] +config_generator = dict(type='TextRecogConfigGenerator') diff --git a/dataset_zoo/naf/textdet.py b/dataset_zoo/naf/textdet.py index 25261b71..73470c1e 100644 --- a/dataset_zoo/naf/textdet.py +++ b/dataset_zoo/naf/textdet.py @@ -1,17 +1,15 @@ data_root = 'data/naf' cache_path = 'data/cache' -data_obtainer = dict( +obtainer = dict( type='NaiveDataObtainer', cache_path=cache_path, - data_root=data_root, files=[ dict( url='https://github.com/herobd/NAF_dataset/releases/' 'download/v1.0/labeled_images.tar.gz', save_name='naf_image.tar.gz', md5='6521cdc25c313a1f2928a16a77ad8f29', - split=['train', 'test', 'val'], content=['image'], mapping=[['naf_image/labeled_images', 'temp_images/']]), dict( @@ -19,7 +17,6 @@ data_obtainer = dict( 'refs/heads/master.zip', save_name='naf_anno.zip', md5='abf5af6266cc527d772231751bc884b3', - split=['train', 'test', 'val'], content=['annotation'], mapping=[ [ @@ -33,17 +30,21 @@ data_obtainer = dict( ]), ]) -data_converter = dict( - type='TextDetDataConverter', - splits=['train', 'test', 'val'], - data_root=data_root, - gatherer=dict(type='naf_gather'), - parser=dict(type='NAFAnnParser', data_root=data_root, det=True), - delete=['temp_images', 'data_split.json', 'annotations', 'naf_anno'], +train_preparer = dict( + obtainer=obtainer, + gatherer=dict(type='NAFGatherer'), + parser=dict(type='NAFAnnParser', det=True), + packer=dict(type='TextDetPacker'), dumper=dict(type='JsonDumper'), - nproc=1) +) +test_preparer = train_preparer + +val_preparer = train_preparer + +delete = [ + 'temp_images', 'data_split.json', 'annotations', 'naf_anno', 'naf_image' +] config_generator = dict( type='TextDetConfigGenerator', - data_root=data_root, val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')]) diff --git a/dataset_zoo/naf/textrecog.py b/dataset_zoo/naf/textrecog.py index 943bd8cd..915bdbc2 100644 --- a/dataset_zoo/naf/textrecog.py +++ b/dataset_zoo/naf/textrecog.py @@ -4,16 +4,15 @@ # not to use them for recognition and text spotting. 
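The task-specific configs such as `naf/textrecog.py` below no longer restate the whole pipeline: they inherit `textdet.py` through `_base_` and patch single nested keys with mmengine's dotted `_base_.` syntax. A quick way to check what a patched config resolves to, using `funsd/textrecog.py` from earlier in this diff (assumes mmengine is installed and the snippet runs from the repo root):

```python
from mmengine import Config

# Resolve the child config together with its textdet.py base.
cfg = Config.fromfile('dataset_zoo/funsd/textrecog.py')

print(cfg.train_preparer.packer.type)       # 'TextRecogCropPacker'   (patched)
print(cfg.train_preparer.gatherer.img_dir)  # 'textdet_imgs/train'    (patched)
print(cfg.train_preparer.parser.type)       # 'FUNSDTextDetAnnParser' (inherited)
```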
_base_ = ['textdet.py'] -data_root = 'data/naf' - -data_converter = dict( - type='TextRecogCropConverter', - parser=dict( - type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'], - det=False), - delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations']) - +_base_.train_preparer.parser.update(dict(ignore=['¿', '§'], det=False)) +_base_.test_preparer.parser.update(dict(ignore=['¿', '§'], det=False)) +_base_.val_preparer.parser.update(dict(ignore=['¿', '§'], det=False)) +_base_.train_preparer.packer.type = 'TextRecogCropPacker' +_base_.test_preparer.packer.type = 'TextRecogCropPacker' +_base_.val_preparer.packer.type = 'TextRecogCropPacker' +_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train' +_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test' +_base_.val_preparer.gatherer.img_dir = 'textdet_imgs/val' config_generator = dict( type='TextRecogConfigGenerator', - data_root=data_root, val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')]) diff --git a/dataset_zoo/naf/textspotting.py b/dataset_zoo/naf/textspotting.py index 97b50b4a..65d3ec1a 100644 --- a/dataset_zoo/naf/textspotting.py +++ b/dataset_zoo/naf/textspotting.py @@ -4,15 +4,16 @@ # not to use them for recognition and text spotting. _base_ = ['textdet.py'] -data_root = 'data/naf' -data_converter = dict( - type='TextSpottingDataConverter', - parser=dict( - type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'], - det=False), - delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations']) +_base_.train_preparer.parser.update(dict(ignore=['¿', '§'], det=False)) +_base_.test_preparer.parser.update(dict(ignore=['¿', '§'], det=False)) +_base_.val_preparer.parser.update(dict(ignore=['¿', '§'], det=False)) +_base_.train_preparer.packer.type = 'TextSpottingPacker' +_base_.test_preparer.packer.type = 'TextSpottingPacker' +_base_.val_preparer.packer.type = 'TextSpottingPacker' +_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train' +_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test' +_base_.val_preparer.gatherer.img_dir = 'textdet_imgs/val' config_generator = dict( type='TextSpottingConfigGenerator', - data_root=data_root, val_anns=[dict(ann_file='textspotting_val.json', dataset_postfix='')]) diff --git a/dataset_zoo/sroie/textdet.py b/dataset_zoo/sroie/textdet.py index 78bb399a..8b994ea8 100644 --- a/dataset_zoo/sroie/textdet.py +++ b/dataset_zoo/sroie/textdet.py @@ -1,55 +1,64 @@ data_root = 'data/sroie' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='https://download.openmmlab.com/mmocr/data/' - 'sroie/0325updated.task1train(626p).zip', - save_name='0325updated.task1train(626p).zip', - md5='16137490f6865caac75772b9111d348c', - split=['train'], - content=['image', 'annotation'], - mapping=[[ - '0325updated/0325updated.task1train(626p)/*.jpg', - 'textdet_imgs/train' - ], - [ - '0325updated/0325updated.task1train(626p)/*.txt', - 'annotations/train' - ]]), - dict( - url='https://download.openmmlab.com/mmocr/data/' - 'sroie/task1&2_test(361p).zip', - save_name='task1&2_test(361p).zip', - md5='1bde54705db0995c57a6e34cce437fea', - split=['test'], - content=['image'], - mapping=[[ - 'task1&2_test(361p)/fulltext_test(361p)', 'textdet_imgs/test' - ]]), - dict( - url='https://download.openmmlab.com/mmocr/data/sroie/text.zip', - save_name='text.zip', - md5='8c534653f252ff4d3943fa27a956a74b', - split=['test'], - content=['annotation'], - mapping=[['text', 'annotations/test']]), - 
]) - -data_converter = dict( - type='TextDetDataConverter', - splits=['train', 'test'], - data_root=data_root, +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://download.openmmlab.com/mmocr/data/' + 'sroie/0325updated.task1train(626p).zip', + save_name='0325updated.task1train(626p).zip', + md5='16137490f6865caac75772b9111d348c', + content=['image', 'annotation'], + mapping=[[ + '0325updated/0325updated.task1train(626p)/*.jpg', + 'textdet_imgs/train' + ], + [ + '0325updated/0325updated.task1train(626p)/*.txt', + 'annotations/train' + ]]) + ]), gatherer=dict( - type='pair_gather', - suffixes=['.jpg'], + type='PairGatherer', + img_suffixes=['.jpg'], rule=[r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']), parser=dict(type='SROIETextDetAnnParser', encoding='utf-8-sig'), + packer=dict(type='TextDetPacker'), dumper=dict(type='JsonDumper'), - delete=['text', 'task1&2_test(361p)', '0325updated', 'annotations']) +) -config_generator = dict(type='TextDetConfigGenerator', data_root=data_root) +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://download.openmmlab.com/mmocr/data/' + 'sroie/task1&2_test(361p).zip', + save_name='task1&2_test(361p).zip', + md5='1bde54705db0995c57a6e34cce437fea', + content=['image'], + mapping=[[ + 'task1&2_test(361p)/fulltext_test(361p)', + 'textdet_imgs/test' + ]]), + dict( + url='https://download.openmmlab.com/mmocr/data/sroie/text.zip', + save_name='text.zip', + md5='8c534653f252ff4d3943fa27a956a74b', + content=['annotation'], + mapping=[['text', 'annotations/test']]), + ]), + gatherer=dict( + type='PairGatherer', + img_suffixes=['.jpg'], + rule=[r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']), + parser=dict(type='SROIETextDetAnnParser', encoding='utf-8-sig'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper'), +) +delete = ['text', 'task1&2_test(361p)', '0325updated', 'annotations'] +config_generator = dict(type='TextDetConfigGenerator') diff --git a/dataset_zoo/sroie/textrecog.py b/dataset_zoo/sroie/textrecog.py index 212c7e7d..07d03dfb 100644 --- a/dataset_zoo/sroie/textrecog.py +++ b/dataset_zoo/sroie/textrecog.py @@ -1,5 +1,8 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextRecogCropConverter') +_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train' +_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test' +_base_.train_preparer.packer.type = 'TextRecogCropPacker' +_base_.test_preparer.packer.type = 'TextRecogCropPacker' config_generator = dict(type='TextRecogConfigGenerator') diff --git a/dataset_zoo/sroie/textspotting.py b/dataset_zoo/sroie/textspotting.py index 88486337..9e7fde0e 100644 --- a/dataset_zoo/sroie/textspotting.py +++ b/dataset_zoo/sroie/textspotting.py @@ -1,5 +1,8 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextSpottingDataConverter') +_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train' +_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test' +_base_.train_preparer.packer.type = 'TextSpottingPacker' +_base_.test_preparer.packer.type = 'TextSpottingPacker' config_generator = dict(type='TextSpottingConfigGenerator') diff --git a/dataset_zoo/svt/textdet.py b/dataset_zoo/svt/textdet.py index 4a72cd1b..703851a9 100644 --- a/dataset_zoo/svt/textdet.py +++ b/dataset_zoo/svt/textdet.py @@ -1,30 +1,44 @@ data_root = 'data/svt' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - 
dict( - url='http://www.iapr-tc11.org/dataset/SVT/svt.zip', - save_name='svt.zip', - md5='42d19160010d990ae6223b14f45eff88', - split=['train', 'test'], - content=['image', 'annotations'], - mapping=[['svt/svt1/train.xml', 'annotations/train.xml'], - ['svt/svt1/test.xml', 'annotations/test.xml'], - ['svt/svt1/img', 'textdet_imgs/img']]), - ]) - -data_converter = dict( - type='TextDetDataConverter', - splits=['train', 'test'], - data_root=data_root, +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='http://www.iapr-tc11.org/dataset/SVT/svt.zip', + save_name='svt.zip', + md5='42d19160010d990ae6223b14f45eff88', + content=['image', 'annotations'], + mapping=[['svt/svt1/train.xml', 'annotations/train.xml'], + ['svt/svt1/img', 'textdet_imgs/img']]), + ]), gatherer=dict( - type='mono_gather', train_ann='train.xml', test_ann='test.xml'), - parser=dict(type='SVTTextDetAnnParser', data_root=data_root), + type='MonoGatherer', ann_name='train.xml', img_dir='textdet_imgs/img'), + parser=dict(type='SVTTextDetAnnParser'), + packer=dict(type='TextDetPacker'), dumper=dict(type='JsonDumper'), - delete=['annotations', 'svt']) +) -config_generator = dict(type='TextDetConfigGenerator', data_root=data_root) +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='http://www.iapr-tc11.org/dataset/SVT/svt.zip', + save_name='svt.zip', + md5='42d19160010d990ae6223b14f45eff88', + content=['image', 'annotations'], + mapping=[['svt/svt1/test.xml', 'annotations/test.xml'], + ['svt/svt1/img', 'textdet_imgs/img']]), + ]), + gatherer=dict( + type='MonoGatherer', ann_name='test.xml', img_dir='textdet_imgs/img'), + parser=dict(type='SVTTextDetAnnParser'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper'), +) +delete = ['annotations', 'svt'] +config_generator = dict(type='TextDetConfigGenerator') diff --git a/dataset_zoo/svt/textrecog.py b/dataset_zoo/svt/textrecog.py index 212c7e7d..eff1abe3 100644 --- a/dataset_zoo/svt/textrecog.py +++ b/dataset_zoo/svt/textrecog.py @@ -1,5 +1,6 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextRecogCropConverter') +_base_.train_preparer.packer.type = 'TextRecogCropPacker' +_base_.test_preparer.packer.type = 'TextRecogCropPacker' config_generator = dict(type='TextRecogConfigGenerator') diff --git a/dataset_zoo/svt/textspotting.py b/dataset_zoo/svt/textspotting.py index 88486337..9a9124fc 100644 --- a/dataset_zoo/svt/textspotting.py +++ b/dataset_zoo/svt/textspotting.py @@ -1,5 +1,6 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextSpottingDataConverter') +_base_.train_preparer.packer.type = 'TextSpottingPacker' +_base_.test_preparer.packer.type = 'TextSpottingPacker' config_generator = dict(type='TextSpottingConfigGenerator') diff --git a/dataset_zoo/svtp/textrecog.py b/dataset_zoo/svtp/textrecog.py index 484dffea..bffe23f4 100644 --- a/dataset_zoo/svtp/textrecog.py +++ b/dataset_zoo/svtp/textrecog.py @@ -1,29 +1,23 @@ data_root = 'data/svtp' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='https://download.openmmlab.com/mmocr/data/svtp.zip', - save_name='svtp.zip', - md5='4232b46c81ba99eea6d057dcb06b8f75', - split=['test'], - content=['image', 'annotation'], - mapping=[['svtp/par1', 'textrecog_imgs/test'], - ['svtp/gt.txt', 'annotations/test.txt']]), - ]) - -data_converter = dict( - type='TextRecogDataConverter', - 
splits=['test'], - data_root=data_root, - gatherer=dict(type='mono_gather', test_ann='test.txt'), +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://download.openmmlab.com/mmocr/data/svtp.zip', + save_name='svtp.zip', + md5='4232b46c81ba99eea6d057dcb06b8f75', + content=['image', 'annotation'], + mapping=[['svtp/par1', 'textrecog_imgs/test'], + ['svtp/gt.txt', 'annotations/test.txt']]), + ]), + gatherer=dict(type='MonoGatherer', ann_name='test.txt'), parser=dict( type='ICDARTxtTextRecogAnnParser', separator=' ', format='img text'), - dumper=dict(type='JsonDumper'), - delete=['svtp', 'annotations']) - -config_generator = dict(type='TextRecogConfigGenerator', data_root=data_root) + packer=dict(type='TextRecogPacker'), + dumper=dict(type='JsonDumper')) +delete = ['svtp', 'annotations'] +config_generator = dict(type='TextRecogConfigGenerator') diff --git a/dataset_zoo/textocr/textdet.py b/dataset_zoo/textocr/textdet.py index 056489f9..482e293f 100644 --- a/dataset_zoo/textocr/textdet.py +++ b/dataset_zoo/textocr/textdet.py @@ -1,52 +1,67 @@ data_root = 'data/textocr' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='https://dl.fbaipublicfiles.com/textvqa/images/' - 'train_val_images.zip', - save_name='textocr_textdet_train_val_img.zip', - md5='d12dd8098899044e4ae1af34db7ecfef', - split=['train', 'val'], - content=['image'], - mapping=[[ - 'textocr_textdet_train_val_img/train_images', - 'textdet_imgs/train' - ]]), - dict( - url='https://dl.fbaipublicfiles.com/textvqa/data/textocr/' - 'TextOCR_0.1_train.json', - save_name='textocr_textdet_train.json', - md5='0f8ba1beefd2ca4d08a4f82bcbe6cfb4', - split=['train'], - content=['annotation'], - mapping=[['textocr_textdet_train.json', - 'annotations/train.json']]), - dict( - url='https://dl.fbaipublicfiles.com/textvqa/data/textocr/' - 'TextOCR_0.1_val.json', - save_name='textocr_textdet_val.json', - md5='fb151383ea7b3c530cde9ef0d5c08347', - split=['val'], - content=['annotation'], - mapping=[['textocr_textdet_val.json', 'annotations/val.json']]), - ]) - -data_converter = dict( - type='TextDetDataConverter', - splits=['train', 'val'], - data_root=data_root, +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://dl.fbaipublicfiles.com/textvqa/images/' + 'train_val_images.zip', + save_name='textocr_textdet_img.zip', + md5='d12dd8098899044e4ae1af34db7ecfef', + content=['image'], + mapping=[[ + 'textocr_textdet_img/train_images', 'textdet_imgs/images' + ]]), + dict( + url='https://dl.fbaipublicfiles.com/textvqa/data/textocr/' + 'TextOCR_0.1_train.json', + save_name='textocr_textdet_train.json', + md5='0f8ba1beefd2ca4d08a4f82bcbe6cfb4', + content=['annotation'], + mapping=[[ + 'textocr_textdet_train.json', 'annotations/train.json' + ]]), + ]), gatherer=dict( - type='mono_gather', train_ann='train.json', val_ann='val.json'), - parser=dict( - type='COCOTextDetAnnParser', - variant='textocr', - data_root=data_root + '/textdet_imgs/'), - dumper=dict(type='JsonDumper'), - delete=['annotations', 'textocr_textdet_train_val_img']) + type='MonoGatherer', + ann_name='train.json', + img_dir='textdet_imgs/images'), + parser=dict(type='COCOTextDetAnnParser', variant='textocr'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper')) -config_generator = dict(type='TextDetConfigGenerator', data_root=data_root) 
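TextOCR's train and val preparers (the latter follows) intentionally list the same `train_val_images.zip` archive with identical `save_name` and `md5`: both splits' images ship in one download, and a shared `cache_path` lets the second preparer reuse the already-verified archive rather than fetch it twice. A hedged sketch of the kind of cache check this relies on; `need_download` is an illustrative helper, not `NaiveDataObtainer`'s actual internals:

```python
import hashlib
import os.path as osp


def need_download(cache_path: str, save_name: str, md5: str) -> bool:
    """Return False when an md5-verified copy already sits in the cache."""
    cached = osp.join(cache_path, save_name)
    if not osp.exists(cached):
        return True
    digest = hashlib.md5()
    with open(cached, 'rb') as f:
        # Hash in 1 MiB chunks so multi-GB archives never load into memory.
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest() != md5  # re-download on checksum mismatch
```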
+val_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://dl.fbaipublicfiles.com/textvqa/images/' + 'train_val_images.zip', + save_name='textocr_textdet_img.zip', + md5='d12dd8098899044e4ae1af34db7ecfef', + content=['image'], + mapping=[[ + 'textocr_textdet_img/train_images', 'textdet_imgs/images' + ]]), + dict( + url='https://dl.fbaipublicfiles.com/textvqa/data/textocr/' + 'TextOCR_0.1_val.json', + save_name='textocr_textdet_val.json', + md5='fb151383ea7b3c530cde9ef0d5c08347', + content=['annotation'], + mapping=[['textocr_textdet_val.json', + 'annotations/val.json']]), + ]), + gatherer=dict( + type='MonoGatherer', + ann_name='val.json', + img_dir='textdet_imgs/images'), + parser=dict(type='COCOTextDetAnnParser', variant='textocr'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper')) +delete = ['annotations', 'textocr_textdet_img'] +config_generator = dict(type='TextDetConfigGenerator') diff --git a/dataset_zoo/textocr/textrecog.py b/dataset_zoo/textocr/textrecog.py index 212c7e7d..6299aeb9 100644 --- a/dataset_zoo/textocr/textrecog.py +++ b/dataset_zoo/textocr/textrecog.py @@ -1,5 +1,6 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextRecogCropConverter') +_base_.train_preparer.packer.type = 'TextRecogCropPacker' +_base_.val_preparer.packer.type = 'TextRecogCropPacker' config_generator = dict(type='TextRecogConfigGenerator') diff --git a/dataset_zoo/textocr/textspotting.py b/dataset_zoo/textocr/textspotting.py index 88486337..b9e5a1d0 100644 --- a/dataset_zoo/textocr/textspotting.py +++ b/dataset_zoo/textocr/textspotting.py @@ -1,5 +1,6 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextSpottingDataConverter') +_base_.train_preparer.packer.type = 'TextSpottingPacker' +_base_.val_preparer.packer.type = 'TextSpottingPacker' config_generator = dict(type='TextSpottingConfigGenerator') diff --git a/dataset_zoo/totaltext/textdet.py b/dataset_zoo/totaltext/textdet.py index eee0edcd..58aa986e 100644 --- a/dataset_zoo/totaltext/textdet.py +++ b/dataset_zoo/totaltext/textdet.py @@ -1,41 +1,62 @@ data_root = 'data/totaltext' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='https://universityofadelaide.box.com/shared/static/' - '8xro7hnvb0sqw5e5rxm73tryc59j6s43.zip', - save_name='totaltext.zip', - md5='5b56d71a4005a333cf200ff35ce87f75', - split=['train', 'test'], - content=['image'], - mapping=[['totaltext/Images/Train', 'textdet_imgs/train'], - ['totaltext/Images/Test', 'textdet_imgs/test']]), - dict( - url='https://universityofadelaide.box.com/shared/static/' - '2vmpvjb48pcrszeegx2eznzc4izan4zf.zip', - save_name='txt_format.zip', - md5='53377a83420b4a0244304467512134e8', - split=['train', 'test'], - content=['annotation'], - mapping=[['txt_format/Train', 'annotations/train'], - ['txt_format/Test', 'annotations/test']]), - ]) - -data_converter = dict( - type='TextDetDataConverter', - splits=['train', 'test'], - data_root=data_root, +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://universityofadelaide.box.com/shared/static/' + '8xro7hnvb0sqw5e5rxm73tryc59j6s43.zip', + save_name='totaltext.zip', + md5='5b56d71a4005a333cf200ff35ce87f75', + content=['image'], + mapping=[['totaltext/Images/Train', 'textdet_imgs/train']]), + dict( + url='https://universityofadelaide.box.com/shared/static/' + 
'2vmpvjb48pcrszeegx2eznzc4izan4zf.zip', + save_name='txt_format.zip', + md5='53377a83420b4a0244304467512134e8', + content=['annotation'], + mapping=[['txt_format/Train', 'annotations/train']]), + ]), gatherer=dict( - type='pair_gather', - suffixes=['.jpg', '.JPG'], + type='PairGatherer', + img_suffixes=['.jpg', '.JPG'], rule=[r'img(\d+)\.([jJ][pP][gG])', r'poly_gt_img\1.txt']), - parser=dict(type='TotaltextTextDetAnnParser', data_root=data_root), + parser=dict(type='TotaltextTextDetAnnParser'), + packer=dict(type='TextDetPacker'), dumper=dict(type='JsonDumper'), - delete=['totaltext', 'txt_format', 'annotations']) +) -config_generator = dict(type='TextDetConfigGenerator', data_root=data_root) +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://universityofadelaide.box.com/shared/static/' + '8xro7hnvb0sqw5e5rxm73tryc59j6s43.zip', + save_name='totaltext.zip', + md5='5b56d71a4005a333cf200ff35ce87f75', + content=['image'], + mapping=[['totaltext/Images/Test', 'textdet_imgs/test']]), + dict( + url='https://universityofadelaide.box.com/shared/static/' + '2vmpvjb48pcrszeegx2eznzc4izan4zf.zip', + save_name='txt_format.zip', + md5='53377a83420b4a0244304467512134e8', + content=['annotation'], + mapping=[['txt_format/Test', 'annotations/test']]), + ]), + gatherer=dict( + type='PairGatherer', + img_suffixes=['.jpg', '.JPG'], + rule=[r'img(\d+)\.([jJ][pP][gG])', r'poly_gt_img\1.txt']), + parser=dict(type='TotaltextTextDetAnnParser'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper'), +) +delete = ['totaltext', 'txt_format', 'annotations'] +config_generator = dict(type='TextDetConfigGenerator') diff --git a/dataset_zoo/totaltext/textrecog.py b/dataset_zoo/totaltext/textrecog.py index 212c7e7d..07d03dfb 100644 --- a/dataset_zoo/totaltext/textrecog.py +++ b/dataset_zoo/totaltext/textrecog.py @@ -1,5 +1,8 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextRecogCropConverter') +_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train' +_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test' +_base_.train_preparer.packer.type = 'TextRecogCropPacker' +_base_.test_preparer.packer.type = 'TextRecogCropPacker' config_generator = dict(type='TextRecogConfigGenerator') diff --git a/dataset_zoo/totaltext/textspotting.py b/dataset_zoo/totaltext/textspotting.py index 88486337..9e7fde0e 100644 --- a/dataset_zoo/totaltext/textspotting.py +++ b/dataset_zoo/totaltext/textspotting.py @@ -1,5 +1,8 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextSpottingDataConverter') +_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train' +_base_.test_preparer.gatherer.img_dir = 'textdet_imgs/test' +_base_.train_preparer.packer.type = 'TextSpottingPacker' +_base_.test_preparer.packer.type = 'TextSpottingPacker' config_generator = dict(type='TextSpottingConfigGenerator') diff --git a/dataset_zoo/wildreceipt/kie.py b/dataset_zoo/wildreceipt/kie.py index f7309e0d..fc900bed 100644 --- a/dataset_zoo/wildreceipt/kie.py +++ b/dataset_zoo/wildreceipt/kie.py @@ -1,35 +1,71 @@ data_root = 'data/wildreceipt' cache_path = 'data/cache' -data_obtainer = dict( - type='NaiveDataObtainer', - cache_path=cache_path, - data_root=data_root, - files=[ - dict( - url='https://download.openmmlab.com/mmocr/data/wildreceipt.tar', - save_name='wildreceipt.tar', - md5='2a2c4a1b4777fb4fe185011e17ad46ae', - split=['train', 'test'], - content=['image', 'annotation'], - mapping=[ - ['wildreceipt/wildreceipt/class_list.txt', 'class_list.txt'], - 
['wildreceipt/wildreceipt/dict.txt', 'dict.txt'], - ['wildreceipt/wildreceipt/test.txt', 'test.txt'], - ['wildreceipt/wildreceipt/train.txt', 'train.txt'], - ['wildreceipt/wildreceipt/image_files', 'image_files'], - ]), - ]) - -data_converter = dict( - type='WildReceiptConverter', - splits=['train', 'test'], - data_root=data_root, +train_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://download.openmmlab.com/mmocr/data/' + 'wildreceipt.tar', + save_name='wildreceipt.tar', + md5='2a2c4a1b4777fb4fe185011e17ad46ae', + content=['image', 'annotation'], + mapping=[ + [ + 'wildreceipt/wildreceipt/class_list.txt', + 'class_list.txt' + ], + ['wildreceipt/wildreceipt/dict.txt', 'dict.txt'], + [ + 'wildreceipt/wildreceipt/train.txt', + 'annotations/train.txt' + ], + [ + 'wildreceipt/wildreceipt/image_files/*/*/*.*', + 'image_files' + ], + ]), + ]), gatherer=dict( - type='mono_gather', - train_ann='train.txt', - test_ann='test.txt', - ann_path=data_root), - parser=dict(type='WildreceiptKIEAnnParser', data_root=data_root), + type='MonoGatherer', ann_name='train.txt', img_dir='image_files'), + parser=dict(type='WildreceiptKIEAnnParser'), + packer=dict(type='WildReceiptPacker'), dumper=dict(type='WildreceiptOpensetDumper'), - delete=['wildreceipt']) +) + +test_preparer = dict( + obtainer=dict( + type='NaiveDataObtainer', + cache_path=cache_path, + files=[ + dict( + url='https://download.openmmlab.com/mmocr/data/' + 'wildreceipt.tar', + save_name='wildreceipt.tar', + md5='2a2c4a1b4777fb4fe185011e17ad46ae', + content=['image', 'annotation'], + mapping=[ + [ + 'wildreceipt/wildreceipt/class_list.txt', + 'class_list.txt' + ], + ['wildreceipt/wildreceipt/dict.txt', 'dict.txt'], + [ + 'wildreceipt/wildreceipt/test.txt', + 'annotations/test.txt' + ], + [ + 'wildreceipt/wildreceipt/image_files/*/*/*.*', + 'image_files' + ], + ]), + ]), + gatherer=dict( + type='MonoGatherer', img_dir='image_files', ann_name='test.txt'), + parser=dict(type='WildreceiptKIEAnnParser'), + packer=dict(type='WildReceiptPacker'), + dumper=dict(type='WildreceiptOpensetDumper'), +) +delete = ['wildreceipt', 'annotations'] diff --git a/dataset_zoo/wildreceipt/textdet.py b/dataset_zoo/wildreceipt/textdet.py index 21678e07..ec6b359a 100644 --- a/dataset_zoo/wildreceipt/textdet.py +++ b/dataset_zoo/wildreceipt/textdet.py @@ -1,9 +1,14 @@ _base_ = ['kie.py'] -data_converter = dict( - type='TextDetDataConverter', - parser=dict(type='WildreceiptTextDetAnnParser'), - dumper=dict(type='JsonDumper')) +_base_.train_preparer.update( + dict( + parser=dict(type='WildreceiptTextDetAnnParser'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper'))) +_base_.test_preparer.update( + dict( + parser=dict(type='WildreceiptTextDetAnnParser'), + packer=dict(type='TextDetPacker'), + dumper=dict(type='JsonDumper'))) -config_generator = dict( - type='TextRecogConfigGenerator', data_root=_base_.data_root) +config_generator = dict(type='TextDetConfigGenerator') diff --git a/dataset_zoo/wildreceipt/textrecog.py b/dataset_zoo/wildreceipt/textrecog.py index 6cf80f0d..db54cb5e 100644 --- a/dataset_zoo/wildreceipt/textrecog.py +++ b/dataset_zoo/wildreceipt/textrecog.py @@ -1,4 +1,15 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextRecogCropConverter') +_base_.train_preparer.update( + dict( + parser=dict(type='WildreceiptTextDetAnnParser'), + packer=dict(type='TextRecogCropPacker'), + dumper=dict(type='JsonDumper'))) + +_base_.test_preparer.update( + dict( + 
parser=dict(type='WildreceiptTextDetAnnParser'), + packer=dict(type='TextRecogCropPacker'), + dumper=dict(type='JsonDumper'))) + config_generator = dict(type='TextRecogConfigGenerator') diff --git a/dataset_zoo/wildreceipt/textspotting.py b/dataset_zoo/wildreceipt/textspotting.py index 88486337..9a9124fc 100644 --- a/dataset_zoo/wildreceipt/textspotting.py +++ b/dataset_zoo/wildreceipt/textspotting.py @@ -1,5 +1,6 @@ _base_ = ['textdet.py'] -data_converter = dict(type='TextSpottingDataConverter') +_base_.train_preparer.packer.type = 'TextSpottingPacker' +_base_.test_preparer.packer.type = 'TextSpottingPacker' config_generator = dict(type='TextSpottingConfigGenerator') diff --git a/mmocr/datasets/preparers/__init__.py b/mmocr/datasets/preparers/__init__.py index 0425dba8..e2323e32 100644 --- a/mmocr/datasets/preparers/__init__.py +++ b/mmocr/datasets/preparers/__init__.py @@ -1,17 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .config_generator import (TextDetConfigGenerator, - TextRecogConfigGenerator, - TextSpottingConfigGenerator) -from .data_converter import (TextDetDataConverter, TextRecogDataConverter, - TextSpottingDataConverter, WildReceiptConverter) -from .data_obtainer import NaiveDataObtainer +from .config_generators import * # noqa from .data_preparer import DatasetPreparer from .dumpers import * # noqa +from .gatherers import * # noqa +from .obtainers import * # noqa +from .packers import * # noqa from .parsers import * # noqa -__all__ = [ - 'DatasetPreparer', 'NaiveDataObtainer', 'TextDetDataConverter', - 'TextRecogDataConverter', 'TextSpottingDataConverter', - 'WildReceiptConverter', 'TextDetConfigGenerator', - 'TextRecogConfigGenerator', 'TextSpottingConfigGenerator' -] +__all__ = ['DatasetPreparer'] diff --git a/mmocr/datasets/preparers/config_generator.py b/mmocr/datasets/preparers/config_generator.py deleted file mode 100644 index 73766149..00000000 --- a/mmocr/datasets/preparers/config_generator.py +++ /dev/null @@ -1,374 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -from abc import abstractmethod -from typing import Dict, List, Optional - -from mmengine import mkdir_or_exist - -from .data_preparer import CFG_GENERATORS - - -class BaseDatasetConfigGenerator: - """Base class for dataset config generator. - - Args: - data_root (str): The root path of the dataset. - task (str): The task of the dataset. - dataset_name (str): The name of the dataset. - overwrite_cfg (bool): Whether to overwrite the dataset config file if - it already exists. If False, config generator will not generate new - config for datasets whose configs are already in base. - train_anns (List[Dict], optional): A list of train annotation files - to appear in the base configs. Defaults to None. - Each element is typically a dict with the following fields: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to - None. - val_anns (List[Dict], optional): A list of val annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to None. - test_anns (List[Dict], optional): A list of test annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to None. - config_path (str): Path to the configs. Defaults to 'configs/'. 
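
# [Editor's note] A sketch of how the `_base_` overrides used in the
# wildreceipt textdet/textrecog configs above resolve once mmengine loads
# the file; the path is illustrative and assumes the dataset_zoo layout
# is on disk.
from mmengine.config import Config

cfg = Config.fromfile('dataset_zoo/wildreceipt/textrecog.py')
print(cfg.train_preparer.packer.type)  # -> 'TextRecogCropPacker'
print(cfg.train_preparer.parser.type)  # -> 'WildreceiptTextDetAnnParser'
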
- """ - - def __init__( - self, - data_root: str, - task: str, - dataset_name: str, - overwrite_cfg: bool = False, - train_anns: Optional[List[Dict]] = None, - val_anns: Optional[List[Dict]] = None, - test_anns: Optional[List[Dict]] = None, - config_path: str = 'configs/', - ) -> None: - self.config_path = config_path - self.data_root = data_root - self.task = task - self.dataset_name = dataset_name - self.overwrite_cfg = overwrite_cfg - self._prepare_anns(train_anns, val_anns, test_anns) - - def _prepare_anns(self, train_anns: Optional[List[Dict]], - val_anns: Optional[List[Dict]], - test_anns: Optional[List[Dict]]) -> None: - """Preprocess input arguments and stores these information into - ``self.anns``. - - ``self.anns`` is a dict that maps the name of a dataset config variable - to a dict, which contains the following fields: - - ann_file (str): The path to the annotation file relative to - data_root. - - split (str): The split the annotation belongs to. Usually - it can be 'train', 'val' and 'test'. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to - None. - """ - self.anns = {} - for split, ann_list in zip(('train', 'val', 'test'), - (train_anns, val_anns, test_anns)): - if ann_list is None: - continue - if not isinstance(ann_list, list): - raise ValueError(f'{split}_anns must be either a list or' - ' None!') - for ann_dict in ann_list: - assert 'ann_file' in ann_dict - if ann_dict.get('dataset_postfix', ''): - key = f'{self.dataset_name}_{ann_dict["dataset_postfix"]}_{self.task}_{split}' # noqa - else: - key = f'{self.dataset_name}_{self.task}_{split}' - ann_dict['split'] = split - if key in self.anns: - raise ValueError( - f'Duplicate dataset variable {key} found! ' - 'Please use different dataset_postfix to avoid ' - 'conflict.') - self.anns[key] = ann_dict - - def __call__(self) -> None: - """Generates the base dataset config.""" - - dataset_config = self._gen_dataset_config() - - cfg_path = osp.join(self.config_path, self.task, '_base_', 'datasets', - f'{self.dataset_name}.py') - if osp.exists(cfg_path) and not self.overwrite_cfg: - print(f'{cfg_path} found, skipping.') - return - mkdir_or_exist(osp.dirname(cfg_path)) - with open(cfg_path, 'w') as f: - f.write( - f'{self.dataset_name}_{self.task}_data_root = \'{self.data_root}\'\n' # noqa: E501 - ) - f.write(dataset_config) - - @abstractmethod - def _gen_dataset_config(self) -> str: - """Generate a full dataset config based on the annotation file - dictionary. - - Returns: - str: The generated dataset config. - """ - - -@CFG_GENERATORS.register_module() -class TextDetConfigGenerator(BaseDatasetConfigGenerator): - """Text detection config generator. - - Args: - data_root (str): The root path of the dataset. - dataset_name (str): The name of the dataset. - overwrite_cfg (bool): Whether to overwrite the dataset config file if - it already exists. If False, config generator will not generate new - config for datasets whose configs are already in base. - train_anns (List[Dict], optional): A list of train annotation files - to appear in the base configs. Defaults to - ``[dict(file='textdet_train.json', dataset_postfix='')]``. - Each element is typically a dict with the following fields: - - ann_file (str): The path to the annotation file relative to - data_root. 
- - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to - None. - val_anns (List[Dict], optional): A list of val annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to []. - test_anns (List[Dict], optional): A list of test annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to ``[dict(file='textdet_test.json')]``. - config_path (str): Path to the configs. Defaults to 'configs/'. - """ - - def __init__( - self, - data_root: str, - dataset_name: str, - overwrite_cfg: bool = False, - train_anns: Optional[List[Dict]] = [ - dict(ann_file='textdet_train.json', dataset_postfix='') - ], - val_anns: Optional[List[Dict]] = [], - test_anns: Optional[List[Dict]] = [ - dict(ann_file='textdet_test.json', dataset_postfix='') - ], - config_path: str = 'configs/', - ) -> None: - super().__init__( - data_root=data_root, - task='textdet', - overwrite_cfg=overwrite_cfg, - dataset_name=dataset_name, - train_anns=train_anns, - val_anns=val_anns, - test_anns=test_anns, - config_path=config_path, - ) - - def _gen_dataset_config(self) -> str: - """Generate a full dataset config based on the annotation file - dictionary. - - Args: - ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps - a config variable name (such as icdar2015_textrecog_train) to - its corresponding annotation information dict. Each dict - contains following keys: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults - to None. - - split (str): The split the annotation belongs to. Usually - it can be 'train', 'val' and 'test'. - - Returns: - str: The generated dataset config. - """ - cfg = '' - for key_name, ann_dict in self.anns.items(): - cfg += f'\n{key_name} = dict(\n' - cfg += ' type=\'OCRDataset\',\n' - cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 - cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' - if ann_dict['split'] == 'train': - cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' # noqa: E501 - elif ann_dict['split'] in ['test', 'val']: - cfg += ' test_mode=True,\n' - cfg += ' pipeline=None)\n' - return cfg - - -@CFG_GENERATORS.register_module() -class TextRecogConfigGenerator(BaseDatasetConfigGenerator): - """Text recognition config generator. - - Args: - data_root (str): The root path of the dataset. - dataset_name (str): The name of the dataset. - overwrite_cfg (bool): Whether to overwrite the dataset config file if - it already exists. If False, config generator will not generate new - config for datasets whose configs are already in base. - train_anns (List[Dict], optional): A list of train annotation files - to appear in the base configs. Defaults to - ``[dict(file='textrecog_train.json'), dataset_postfix='']``. - Each element is typically a dict with the following fields: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. 
If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to - None. - val_anns (List[Dict], optional): A list of val annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to []. - test_anns (List[Dict], optional): A list of test annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to ``[dict(file='textrecog_test.json')]``. - config_path (str): Path to the configs. Defaults to 'configs/'. - - Example: - It generates a dataset config like: - >>> ic15_rec_data_root = 'data/icdar2015/' - >>> icdar2015_textrecog_train = dict( - >>> type='OCRDataset', - >>> data_root=ic15_rec_data_root, - >>> ann_file='textrecog_train.json', - >>> test_mode=False, - >>> pipeline=None) - >>> icdar2015_textrecog_test = dict( - >>> type='OCRDataset', - >>> data_root=ic15_rec_data_root, - >>> ann_file='textrecog_test.json', - >>> test_mode=True, - >>> pipeline=None) - """ - - def __init__( - self, - data_root: str, - dataset_name: str, - overwrite_cfg: bool = False, - train_anns: Optional[List[Dict]] = [ - dict(ann_file='textrecog_train.json', dataset_postfix='') - ], - val_anns: Optional[List[Dict]] = [], - test_anns: Optional[List[Dict]] = [ - dict(ann_file='textrecog_test.json', dataset_postfix='') - ], - config_path: str = 'configs/', - ) -> None: - super().__init__( - data_root=data_root, - task='textrecog', - overwrite_cfg=overwrite_cfg, - dataset_name=dataset_name, - train_anns=train_anns, - val_anns=val_anns, - test_anns=test_anns, - config_path=config_path) - - def _gen_dataset_config(self) -> str: - """Generate a full dataset config based on the annotation file - dictionary. - - Args: - ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps - a config variable name (such as icdar2015_textrecog_train) to - its corresponding annotation information dict. Each dict - contains following keys: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults - to None. - - split (str): The split the annotation belongs to. Usually - it can be 'train', 'val' and 'test'. - - Returns: - str: The generated dataset config. - """ - cfg = '' - for key_name, ann_dict in self.anns.items(): - cfg += f'\n{key_name} = dict(\n' - cfg += ' type=\'OCRDataset\',\n' - cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 - cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' - if ann_dict['split'] in ['test', 'val']: - cfg += ' test_mode=True,\n' - cfg += ' pipeline=None)\n' - return cfg - - -@CFG_GENERATORS.register_module() -class TextSpottingConfigGenerator(TextDetConfigGenerator): - """Text spotting config generator. - - Args: - data_root (str): The root path of the dataset. - dataset_name (str): The name of the dataset. - overwrite_cfg (bool): Whether to overwrite the dataset config file if - it already exists. If False, config generator will not generate new - config for datasets whose configs are already in base. - train_anns (List[Dict], optional): A list of train annotation files - to appear in the base configs. Defaults to - ``[dict(file='textspotting_train.json', dataset_postfix='')]``. 
- Each element is typically a dict with the following fields: - - ann_file (str): The path to the annotation file relative to - data_root. - - dataset_postfix (str, optional): Affects the postfix of the - resulting variable in the generated config. If specified, the - dataset variable will be named in the form of - ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to - None. - val_anns (List[Dict], optional): A list of val annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to []. - test_anns (List[Dict], optional): A list of test annotation files - to appear in the base configs, similar to ``train_anns``. Defaults - to ``[dict(file='textspotting_test.json')]``. - config_path (str): Path to the configs. Defaults to 'configs/'. - """ - - def __init__( - self, - data_root: str, - dataset_name: str, - overwrite_cfg: bool = False, - train_anns: Optional[List[Dict]] = [ - dict(ann_file='textspotting_train.json', dataset_postfix='') - ], - val_anns: Optional[List[Dict]] = [], - test_anns: Optional[List[Dict]] = [ - dict(ann_file='textspotting_test.json', dataset_postfix='') - ], - config_path: str = 'configs/', - ) -> None: - BaseDatasetConfigGenerator.__init__( - self, - data_root=data_root, - task='textspotting', - overwrite_cfg=overwrite_cfg, - dataset_name=dataset_name, - train_anns=train_anns, - val_anns=val_anns, - test_anns=test_anns, - config_path=config_path, - ) diff --git a/mmocr/datasets/preparers/config_generators/__init__.py b/mmocr/datasets/preparers/config_generators/__init__.py new file mode 100644 index 00000000..8e884c6d --- /dev/null +++ b/mmocr/datasets/preparers/config_generators/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseDatasetConfigGenerator +from .textdet_config_generator import TextDetConfigGenerator +from .textrecog_config_generator import TextRecogConfigGenerator +from .textspotting_config_generator import TextSpottingConfigGenerator + +__all__ = [ + 'BaseDatasetConfigGenerator', 'TextDetConfigGenerator', + 'TextRecogConfigGenerator', 'TextSpottingConfigGenerator' +] diff --git a/mmocr/datasets/preparers/config_generators/base.py b/mmocr/datasets/preparers/config_generators/base.py new file mode 100644 index 00000000..6139ba4f --- /dev/null +++ b/mmocr/datasets/preparers/config_generators/base.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from abc import abstractmethod +from typing import Dict, List, Optional + +from mmengine import mkdir_or_exist + + +class BaseDatasetConfigGenerator: + """Base class for dataset config generator. + + Args: + data_root (str): The root path of the dataset. + task (str): The task of the dataset. + dataset_name (str): The name of the dataset. + overwrite_cfg (bool): Whether to overwrite the dataset config file if + it already exists. If False, config generator will not generate new + config for datasets whose configs are already in base. + train_anns (List[Dict], optional): A list of train annotation files + to appear in the base configs. Defaults to None. + Each element is typically a dict with the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. 
+ val_anns (List[Dict], optional): A list of val annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to None. + test_anns (List[Dict], optional): A list of test annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to None. + config_path (str): Path to the configs. Defaults to 'configs/'. + """ + + def __init__( + self, + data_root: str, + task: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = None, + val_anns: Optional[List[Dict]] = None, + test_anns: Optional[List[Dict]] = None, + config_path: str = 'configs/', + ) -> None: + self.config_path = config_path + self.data_root = data_root + self.task = task + self.dataset_name = dataset_name + self.overwrite_cfg = overwrite_cfg + self._prepare_anns(train_anns, val_anns, test_anns) + + def _prepare_anns(self, train_anns: Optional[List[Dict]], + val_anns: Optional[List[Dict]], + test_anns: Optional[List[Dict]]) -> None: + """Preprocess input arguments and stores these information into + ``self.anns``. + + ``self.anns`` is a dict that maps the name of a dataset config variable + to a dict, which contains the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - split (str): The split the annotation belongs to. Usually + it can be 'train', 'val' and 'test'. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. + """ + self.anns = {} + for split, ann_list in zip(('train', 'val', 'test'), + (train_anns, val_anns, test_anns)): + if ann_list is None: + continue + if not isinstance(ann_list, list): + raise ValueError(f'{split}_anns must be either a list or' + ' None!') + for ann_dict in ann_list: + assert 'ann_file' in ann_dict + if ann_dict.get('dataset_postfix', ''): + key = f'{self.dataset_name}_{ann_dict["dataset_postfix"]}_{self.task}_{split}' # noqa + else: + key = f'{self.dataset_name}_{self.task}_{split}' + ann_dict['split'] = split + if key in self.anns: + raise ValueError( + f'Duplicate dataset variable {key} found! ' + 'Please use different dataset_postfix to avoid ' + 'conflict.') + self.anns[key] = ann_dict + + def __call__(self) -> None: + """Generates the base dataset config.""" + + dataset_config = self._gen_dataset_config() + + cfg_path = osp.join(self.config_path, self.task, '_base_', 'datasets', + f'{self.dataset_name}.py') + if osp.exists(cfg_path) and not self.overwrite_cfg: + print(f'{cfg_path} found, skipping.') + return + mkdir_or_exist(osp.dirname(cfg_path)) + with open(cfg_path, 'w') as f: + f.write( + f'{self.dataset_name}_{self.task}_data_root = \'{self.data_root}\'\n' # noqa: E501 + ) + f.write(dataset_config) + + @abstractmethod + def _gen_dataset_config(self) -> str: + """Generate a full dataset config based on the annotation file + dictionary. + + Returns: + str: The generated dataset config. + """ diff --git a/mmocr/datasets/preparers/config_generators/textdet_config_generator.py b/mmocr/datasets/preparers/config_generators/textdet_config_generator.py new file mode 100644 index 00000000..56aff50d --- /dev/null +++ b/mmocr/datasets/preparers/config_generators/textdet_config_generator.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
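
# [Editor's note] Sketch of the variable-naming and output-path scheme
# implemented by BaseDatasetConfigGenerator above (_prepare_anns and
# __call__); the concrete values are illustrative.
import os.path as osp

dataset_name, task, split, dataset_postfix = 'totaltext', 'textdet', 'train', ''
key = (f'{dataset_name}_{dataset_postfix}_{task}_{split}'
       if dataset_postfix else f'{dataset_name}_{task}_{split}')
cfg_path = osp.join('configs/', task, '_base_', 'datasets', f'{dataset_name}.py')
print(key)       # -> totaltext_textdet_train
print(cfg_path)  # -> configs/textdet/_base_/datasets/totaltext.py
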
+from typing import Dict, List, Optional + +from ..data_preparer import CFG_GENERATORS +from .base import BaseDatasetConfigGenerator + + +@CFG_GENERATORS.register_module() +class TextDetConfigGenerator(BaseDatasetConfigGenerator): + """Text detection config generator. + + Args: + data_root (str): The root path of the dataset. + dataset_name (str): The name of the dataset. + overwrite_cfg (bool): Whether to overwrite the dataset config file if + it already exists. If False, config generator will not generate new + config for datasets whose configs are already in base. + train_anns (List[Dict], optional): A list of train annotation files + to appear in the base configs. Defaults to + ``[dict(file='textdet_train.json', dataset_postfix='')]``. + Each element is typically a dict with the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. + val_anns (List[Dict], optional): A list of val annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to []. + test_anns (List[Dict], optional): A list of test annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to ``[dict(file='textdet_test.json')]``. + config_path (str): Path to the configs. Defaults to 'configs/'. + """ + + def __init__( + self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='textdet_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='textdet_test.json', dataset_postfix='') + ], + config_path: str = 'configs/', + ) -> None: + super().__init__( + data_root=data_root, + task='textdet', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path, + ) + + def _gen_dataset_config(self) -> str: + """Generate a full dataset config based on the annotation file + dictionary. + + Args: + ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps + a config variable name (such as icdar2015_textrecog_train) to + its corresponding annotation information dict. Each dict + contains following keys: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults + to None. + - split (str): The split the annotation belongs to. Usually + it can be 'train', 'val' and 'test'. + + Returns: + str: The generated dataset config. 
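
# [Editor's note] What TextDetConfigGenerator._gen_dataset_config (body
# below) ultimately emits into the generated base config; dataset name
# and data_root are illustrative. The data_root assignment is written
# first by BaseDatasetConfigGenerator.__call__.
totaltext_textdet_data_root = 'data/totaltext'

totaltext_textdet_train = dict(
    type='OCRDataset',
    data_root=totaltext_textdet_data_root,
    ann_file='textdet_train.json',
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=None)
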
+ """ + cfg = '' + for key_name, ann_dict in self.anns.items(): + cfg += f'\n{key_name} = dict(\n' + cfg += ' type=\'OCRDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' + if ann_dict['split'] == 'train': + cfg += ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' # noqa: E501 + elif ann_dict['split'] in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + return cfg diff --git a/mmocr/datasets/preparers/config_generators/textrecog_config_generator.py b/mmocr/datasets/preparers/config_generators/textrecog_config_generator.py new file mode 100644 index 00000000..613c8848 --- /dev/null +++ b/mmocr/datasets/preparers/config_generators/textrecog_config_generator.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + +from ..data_preparer import CFG_GENERATORS +from .base import BaseDatasetConfigGenerator + + +@CFG_GENERATORS.register_module() +class TextRecogConfigGenerator(BaseDatasetConfigGenerator): + """Text recognition config generator. + + Args: + data_root (str): The root path of the dataset. + dataset_name (str): The name of the dataset. + overwrite_cfg (bool): Whether to overwrite the dataset config file if + it already exists. If False, config generator will not generate new + config for datasets whose configs are already in base. + train_anns (List[Dict], optional): A list of train annotation files + to appear in the base configs. Defaults to + ``[dict(file='textrecog_train.json'), dataset_postfix='']``. + Each element is typically a dict with the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. + val_anns (List[Dict], optional): A list of val annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to []. + test_anns (List[Dict], optional): A list of test annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to ``[dict(file='textrecog_test.json')]``. + config_path (str): Path to the configs. Defaults to 'configs/'. 
+ + Example: + It generates a dataset config like: + >>> ic15_rec_data_root = 'data/icdar2015/' + >>> icdar2015_textrecog_train = dict( + >>> type='OCRDataset', + >>> data_root=ic15_rec_data_root, + >>> ann_file='textrecog_train.json', + >>> test_mode=False, + >>> pipeline=None) + >>> icdar2015_textrecog_test = dict( + >>> type='OCRDataset', + >>> data_root=ic15_rec_data_root, + >>> ann_file='textrecog_test.json', + >>> test_mode=True, + >>> pipeline=None) + """ + + def __init__( + self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='textrecog_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='textrecog_test.json', dataset_postfix='') + ], + config_path: str = 'configs/', + ) -> None: + super().__init__( + data_root=data_root, + task='textrecog', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path) + + def _gen_dataset_config(self) -> str: + """Generate a full dataset config based on the annotation file + dictionary. + + Args: + ann_dict (dict[str, dict(str, str)]): A nested dictionary that maps + a config variable name (such as icdar2015_textrecog_train) to + its corresponding annotation information dict. Each dict + contains following keys: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults + to None. + - split (str): The split the annotation belongs to. Usually + it can be 'train', 'val' and 'test'. + + Returns: + str: The generated dataset config. + """ + cfg = '' + for key_name, ann_dict in self.anns.items(): + cfg += f'\n{key_name} = dict(\n' + cfg += ' type=\'OCRDataset\',\n' + cfg += ' data_root=' + f'{self.dataset_name}_{self.task}_data_root,\n' # noqa: E501 + cfg += f' ann_file=\'{ann_dict["ann_file"]}\',\n' + if ann_dict['split'] in ['test', 'val']: + cfg += ' test_mode=True,\n' + cfg += ' pipeline=None)\n' + return cfg diff --git a/mmocr/datasets/preparers/config_generators/textspotting_config_generator.py b/mmocr/datasets/preparers/config_generators/textspotting_config_generator.py new file mode 100644 index 00000000..58556915 --- /dev/null +++ b/mmocr/datasets/preparers/config_generators/textspotting_config_generator.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + +from ..data_preparer import CFG_GENERATORS +from .base import BaseDatasetConfigGenerator +from .textdet_config_generator import TextDetConfigGenerator + + +@CFG_GENERATORS.register_module() +class TextSpottingConfigGenerator(TextDetConfigGenerator): + """Text spotting config generator. + + Args: + data_root (str): The root path of the dataset. + dataset_name (str): The name of the dataset. + overwrite_cfg (bool): Whether to overwrite the dataset config file if + it already exists. If False, config generator will not generate new + config for datasets whose configs are already in base. + train_anns (List[Dict], optional): A list of train annotation files + to appear in the base configs. Defaults to + ``[dict(file='textspotting_train.json', dataset_postfix='')]``. 
+ Each element is typically a dict with the following fields: + - ann_file (str): The path to the annotation file relative to + data_root. + - dataset_postfix (str, optional): Affects the postfix of the + resulting variable in the generated config. If specified, the + dataset variable will be named in the form of + ``{dataset_name}_{dataset_postfix}_{task}_{split}``. Defaults to + None. + val_anns (List[Dict], optional): A list of val annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to []. + test_anns (List[Dict], optional): A list of test annotation files + to appear in the base configs, similar to ``train_anns``. Defaults + to ``[dict(file='textspotting_test.json')]``. + config_path (str): Path to the configs. Defaults to 'configs/'. + """ + + def __init__( + self, + data_root: str, + dataset_name: str, + overwrite_cfg: bool = False, + train_anns: Optional[List[Dict]] = [ + dict(ann_file='textspotting_train.json', dataset_postfix='') + ], + val_anns: Optional[List[Dict]] = [], + test_anns: Optional[List[Dict]] = [ + dict(ann_file='textspotting_test.json', dataset_postfix='') + ], + config_path: str = 'configs/', + ) -> None: + BaseDatasetConfigGenerator.__init__( + self, + data_root=data_root, + task='textspotting', + overwrite_cfg=overwrite_cfg, + dataset_name=dataset_name, + train_anns=train_anns, + val_anns=val_anns, + test_anns=test_anns, + config_path=config_path, + ) diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py deleted file mode 100644 index 62f1bc5a..00000000 --- a/mmocr/datasets/preparers/data_converter.py +++ /dev/null @@ -1,752 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os -import os.path as osp -import re -import shutil -from abc import abstractmethod -from functools import partial -from typing import Dict, List, Optional, Sequence, Tuple - -import mmcv -from mmengine import mkdir_or_exist, track_parallel_progress - -from mmocr.utils import bbox2poly, crop_img, list_files, poly2bbox, warp_img -from .data_preparer import DATA_CONVERTERS, DATA_DUMPERS, DATA_PARSERS - - -class BaseDataConverter: - """Base class for data processor. - - Args: - splits (List): A list of splits to be processed. - data_root (str): Path to the data root. - gatherer (Dict): Config dict for gathering the dataset files. - parser (Dict): Config dict for parsing the dataset files. - dumper (Dict): Config dict for dumping the dataset files. - nproc (int): Number of processes to process the data. - task (str): Task of the dataset. - dataset_name (str): Dataset name. - delete (Optional[List]): A list of files to be deleted after - conversion. - """ - - def __init__(self, - splits: List, - data_root: str, - gatherer: Dict, - parser: Dict, - dumper: Dict, - nproc: int, - task: str, - dataset_name: str, - delete: Optional[List] = None, - config_path: str = 'configs/'): - assert isinstance(nproc, int) and nproc > 0, \ - 'nproc must be a positive integer.' 
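
# [Editor's note] Minimal usage sketch of the TextSpottingConfigGenerator
# defined above; data_root/dataset_name are illustrative. __call__ skips
# writing if the target file already exists and overwrite_cfg is False.
from mmocr.datasets.preparers.config_generators import \
    TextSpottingConfigGenerator

gen = TextSpottingConfigGenerator(
    data_root='data/totaltext', dataset_name='totaltext')
gen()  # -> configs/textspotting/_base_/datasets/totaltext.py
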
- self.splits = splits - self.data_root = data_root - self.nproc = nproc - self.task = task - self.dataset_name = dataset_name - self.delete = delete - self.config_path = config_path - self.img_dir = f'{task}_imgs' - parser.update(dict(nproc=nproc)) - dumper.update(dict(task=task)) - self.parser = DATA_PARSERS.build(parser) - self.dumper = DATA_DUMPERS.build(dumper) - gather_type = gatherer.pop('type') - self.gatherer_args = gatherer - if gather_type == 'pair_gather': - self.gatherer = self.pair_gather - elif gather_type == 'mono_gather': - self.gatherer = self.mono_gather - elif gather_type == 'naf_gather': - self.gatherer = self.naf_gather - else: - raise NotImplementedError - - def __call__(self): - """Process the data. - - Returns: - Dict: A dict that maps each split to the path of the annotation - files. - """ - # Convert and dump annotations to MMOCR format - for self.current_split in self.splits: - print(f'Parsing {self.current_split} split...') - # Gather the info such as file names required by parser - img_path = osp.join(self.data_root, self.img_dir, - self.current_split) - ann_path = osp.join(self.data_root, 'annotations') - gatherer_args = dict(img_path=img_path, ann_path=ann_path) - gatherer_args.update(self.gatherer_args) - files = self.gatherer(**gatherer_args) - # Convert dataset annotations to MMOCR format - samples = self.parser.parse_files(files, self.current_split) - print(f'Packing {self.current_split} annotations...') - func = partial(self.pack_instance, split=self.current_split) - samples = track_parallel_progress(func, samples, nproc=self.nproc) - samples = self.add_meta(samples) - # Dump annotation files - self.dumper.dump(samples, self.data_root, self.current_split) - self.clean() - - @abstractmethod - def pack_instance(self, sample: Tuple, split: str) -> Dict: - """Pack the parsed annotation info to an MMOCR format instance. - - Args: - sample (Tuple): A tuple of (img_file, ann_file). - - img_path (str): Path to image file. - - instances (Sequence[Dict]): A list of converted annos. - split (str): The split of the instance. - - Returns: - Dict: An MMOCR format instance. - """ - - @abstractmethod - def add_meta(self, sample: List) -> Dict: - """Add meta information to the sample. - - Args: - sample (List): A list of samples of the dataset. - - Returns: - Dict: A dict contains the meta information and samples. - """ - - def mono_gather(self, - ann_path: str, - train_ann: Optional[str] = None, - val_ann: Optional[str] = None, - test_ann: Optional[str] = None, - **kwargs) -> str: - """Gather the dataset file. Specifically for the case that only one - annotation file is needed. For example, - - img_001.jpg \ - img_002.jpg ---> train.json - img_003.jpg / - - Args: - anno_path (str): Path to the annotations. - train_ann (str, optional): The annotation file name of the train - split in the original dataset. Defaults to None. - val_ann (str, optional): The annotation file name of the val split - in the original dataset. Defaults to None. - test_ann (str, optional): The annotation file name of the test - split in the original dataset. Defaults to None. - - Returns: - str: Path to the annotation file. - """ - - ann_file = eval(f'{self.current_split}_ann') - if ann_file is None: - raise ValueError( - f'{self.current_split}_ann must be specified in gatherer!') - return osp.join(ann_path, ann_file) - - def pair_gather(self, img_path: str, suffixes: List, rule: Sequence, - **kwargs) -> List[Tuple]: - """Gather the dataset files. Specifically for the paired annotations. 
- That is to say, each image has a corresponding annotation file. For - example, - - img_1.jpg <---> gt_img_1.txt - img_2.jpg <---> gt_img_2.txt - img_3.jpg <---> gt_img_3.txt - - Args: - img_path (str): Path to the images. - suffixes (List[str]): File suffixes that used for searching. - rule (Sequence): The rule for pairing the files. The - first element is the matching pattern for the file, and the - second element is the replacement pattern, which should - be a regular expression. For example, to map the image - name img_1.jpg to the annotation name gt_img_1.txt, - the rule is - [r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt'] # noqa: W605 E501 - - Returns: - List[Tuple]: A list of tuples (img_path, ann_path). - """ - files = list() - for file in list_files(img_path, suffixes): - if not re.match(rule[0], osp.basename(file)): - continue - file2 = re.sub(rule[0], rule[1], osp.basename(file)) - file2 = file.replace(osp.basename(file), file2) - file2 = file2.replace(self.img_dir, 'annotations') - files.append((file, file2)) - - return files - - def naf_gather(self, img_path: str, ann_path: str, - **kwargs) -> List[Tuple]: - """Gather the dataset file from NAF dataset. Specifically for the case - that there is a split file that contains the names of different splits. - For example, - - img_001.jpg train: img_001.jpg - img_002.jpg ---> data_split.json ---> test: img_002.jpg - img_003.jpg val: img_003.jpg - - Args: - img_path (str): Path to the images. - anno_path (str): Path to the annotations. - Returns: - List[Tuple]: A list of tuples (img_path, ann_path). - """ - split_file = osp.join(self.data_root, 'data_split.json') - with open(split_file, 'r') as f: - split_data = json.load(f) - files = [] - # Rename the key - split_data['val'] = split_data.pop('valid') - if not osp.exists(img_path): - os.makedirs(img_path) - for groups in split_data[self.current_split]: - for img_name in split_data[self.current_split][groups]: - src_img = osp.join(self.data_root, 'temp_images', img_name) - dst_img = osp.join(img_path, img_name) - if not osp.exists(src_img): - Warning(f'{src_img} does not exist!') - continue - # move the image to the new path - shutil.move(src_img, dst_img) - ann = osp.join(ann_path, img_name.replace('.jpg', '.json')) - files.append((dst_img, ann)) - return files - - def clean(self) -> None: - for d in self.delete: - delete_file = osp.join(self.data_root, d) - if osp.exists(delete_file): - if osp.isdir(delete_file): - shutil.rmtree(delete_file) - else: - os.remove(delete_file) - - -@DATA_CONVERTERS.register_module() -class TextDetDataConverter(BaseDataConverter): - """Text detection data converter. - - Args: - splits (List): A list of splits to be processed. - data_root (str): Path to the data root. - gatherer (Dict): Config dict for gathering the dataset files. - parser (Dict): Config dict for parsing the dataset files. - dumper (Dict): Config dict for dumping the dataset files. - dataset_name (str): Name of the dataset. - nproc (int): Number of processes to process the data. - delete (Optional[List]): A list of files to be deleted after - conversion. Defaults to ['annotations]. 
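
# [Editor's note] Shape of the `data_split.json` consumed by naf_gather
# above; the 'valid' key is renamed to 'val' before lookup, and images
# are nested one level under group keys. Content is illustrative.
split_data = {
    'train': {'group_0': ['img_001.jpg']},
    'valid': {'group_0': ['img_003.jpg']},
    'test': {'group_0': ['img_002.jpg']},
}
split_data['val'] = split_data.pop('valid')
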
- """ - - def __init__(self, - splits: List, - data_root: str, - gatherer: Dict, - parser: Dict, - dumper: Dict, - dataset_name: str, - nproc: int, - delete: List = ['annotations']) -> None: - super().__init__( - splits=splits, - data_root=data_root, - gatherer=gatherer, - parser=parser, - dumper=dumper, - dataset_name=dataset_name, - nproc=nproc, - delete=delete, - task='textdet') - - def pack_instance(self, - sample: Tuple, - split: str, - bbox_label: int = 0) -> Dict: - """Pack the parsed annotation info to an MMOCR format instance. - - Args: - sample (Tuple): A tuple of (img_file, instances). - - img_path (str): Path to the image file. - - instances (Sequence[Dict]): A list of converted annos. Each - element should be a dict with the following keys: - - 'poly' or 'box' - - 'ignore' - - 'bbox_label' (optional) - split (str): The split of the instance. - - Returns: - Dict: An MMOCR format instance. - """ - - img_path, instances = sample - - img = mmcv.imread(img_path) - h, w = img.shape[:2] - - packed_instances = list() - for instance in instances: - poly = instance.get('poly', None) - box = instance.get('box', None) - assert box or poly - packed_sample = dict( - polygon=poly if poly else list( - bbox2poly(box).astype('float64')), - bbox=box if box else list(poly2bbox(poly).astype('float64')), - bbox_label=bbox_label, - ignore=instance['ignore']) - packed_instances.append(packed_sample) - - packed_instances = dict( - instances=packed_instances, - img_path=img_path.replace(self.data_root + '/', ''), - height=h, - width=w) - - return packed_instances - - def add_meta(self, sample: List) -> Dict: - meta = { - 'metainfo': { - 'dataset_type': 'TextDetDataset', - 'task_name': 'textdet', - 'category': [{ - 'id': 0, - 'name': 'text' - }] - }, - 'data_list': sample - } - return meta - - -@DATA_CONVERTERS.register_module() -class TextSpottingDataConverter(BaseDataConverter): - """Text spotting data converter. - - Args: - splits (List): A list of splits to be processed. - data_root (str): Path to the data root. - gatherer (Dict): Config dict for gathering the dataset files. - parser (Dict): Config dict for parsing the dataset files. - dumper (Dict): Config dict for dumping the dataset files. - dataset_name (str): Name of the dataset. - nproc (int): Number of processes to process the data. - delete (Optional[List]): A list of files to be deleted after - conversion. Defaults to ['annotations']. - """ - - def __init__(self, - splits: List, - data_root: str, - gatherer: Dict, - parser: Dict, - dumper: Dict, - dataset_name: str, - nproc: int, - delete: List = ['annotations']) -> None: - super().__init__( - splits=splits, - data_root=data_root, - gatherer=gatherer, - parser=parser, - dumper=dumper, - dataset_name=dataset_name, - nproc=nproc, - delete=delete, - task='textspotting') - # Textspotting task shares the same images with textdet task - self.img_dir = 'textdet_imgs' - - def pack_instance(self, - sample: Tuple, - split: str, - bbox_label: int = 0) -> Dict: - """Pack the parsed annotation info to an MMOCR format instance. - - Args: - sample (Tuple): A tuple of (img_file, ann_file). - - img_path (str): Path to image file. - - instances (Sequence[Dict]): A list of converted annos. Each - element should be a dict with the following keys: - - 'poly' or 'box' - - 'text' - - 'ignore' - - 'bbox_label' (optional) - split (str): The split of the instance. - - Returns: - Dict: An MMOCR format instance. 
- """ - - img_path, instances = sample - - img = mmcv.imread(img_path) - h, w = img.shape[:2] - - packed_instances = list() - for instance in instances: - assert 'text' in instance, 'Text is not found in the instance.' - poly = instance.get('poly', None) - box = instance.get('box', None) - assert box or poly - packed_sample = dict( - polygon=poly if poly else list( - bbox2poly(box).astype('float64')), - bbox=box if box else list(poly2bbox(poly).astype('float64')), - bbox_label=bbox_label, - ignore=instance['ignore'], - text=instance['text']) - packed_instances.append(packed_sample) - - packed_instances = dict( - instances=packed_instances, - img_path=img_path.replace(self.data_root + '/', ''), - height=h, - width=w) - - return packed_instances - - def add_meta(self, sample: List) -> Dict: - meta = { - 'metainfo': { - 'dataset_type': 'TextSpottingDataset', - 'task_name': 'textspotting', - 'category': [{ - 'id': 0, - 'name': 'text' - }] - }, - 'data_list': sample - } - return meta - - -@DATA_CONVERTERS.register_module() -class TextRecogDataConverter(BaseDataConverter): - """Text recognition data converter. - - Args: - splits (List): A list of splits to be processed. - data_root (str): Path to the data root. - gatherer (Dict): Config dict for gathering the dataset files. - parser (Dict): Config dict for parsing the dataset annotations. - dumper (Dict): Config dict for dumping the dataset files. - dataset_name (str): Name of the dataset. - nproc (int): Number of processes to process the data. - delete (Optional[List]): A list of files to be deleted after - conversion. Defaults to ['annotations]. - """ - - def __init__(self, - splits: List, - data_root: str, - gatherer: Dict, - parser: Dict, - dumper: Dict, - dataset_name: str, - nproc: int, - delete: List = ['annotations']): - super().__init__( - splits=splits, - data_root=data_root, - gatherer=gatherer, - parser=parser, - dumper=dumper, - dataset_name=dataset_name, - nproc=nproc, - task='textrecog', - delete=delete) - - def pack_instance(self, sample: Tuple, split: str) -> Dict: - """Pack the text info to a recognition instance. - - Args: - samples (Tuple): A tuple of (img_name, text). - split (str): The split of the instance. - - Returns: - Dict: The packed instance. - """ - - img_name, text = sample - packed_instance = dict( - instances=[dict(text=text)], - img_path=osp.join(self.img_dir, split, osp.basename(img_name))) - - return packed_instance - - def add_meta(self, sample: List) -> Dict: - meta = { - 'metainfo': { - 'dataset_type': 'TextRecogDataset', - 'task_name': 'textrecog' - }, - 'data_list': sample - } - return meta - - -@DATA_CONVERTERS.register_module() -class TextRecogCropConverter(TextRecogDataConverter): - """Text recognition crop converter. This converter will crop the text from - the original image. The parser used for this Converter should be a TextDet - parser. - - Args: - splits (List): A list of splits to be processed. - data_root (str): Path to the data root. - gatherer (Dict): Config dict for gathering the dataset files. - parser (Dict): Config dict for parsing the dataset annotations. - dumper (Dict): Config dict for dumping the dataset files. - dataset_name (str): Name of the dataset. - nproc (int): Number of processes to process the data. - crop_with_warp (bool): Whether to crop the text from the original image - using opencv warpPerspective. - jitter (bool): (Applicable when crop_with_warp=True) - Whether to jitter the box. 
- jitter_ratio_x (float): (Applicable when crop_with_warp=True) - Horizontal jitter ratio relative to the height. - jitter_ratio_y (float): (Applicable when crop_with_warp=True) - Vertical jitter ratio relative to the height. - long_edge_pad_ratio (float): (Applicable when crop_with_warp=False) - The ratio of padding the long edge of the cropped image. - Defaults to 0.1. - short_edge_pad_ratio (float): (Applicable when crop_with_warp=False) - The ratio of padding the short edge of the cropped image. - Defaults to 0.05. - delete (Optional[List]): A list of files to be deleted after - conversion. Defaults to ['annotations]. - """ - - def __init__(self, - splits: List, - data_root: str, - gatherer: Dict, - parser: Dict, - dumper: Dict, - dataset_name: str, - nproc: int, - crop_with_warp: bool = False, - jitter: bool = False, - jitter_ratio_x: float = 0.0, - jitter_ratio_y: float = 0.0, - long_edge_pad_ratio: float = 0.0, - short_edge_pad_ratio: float = 0.0, - delete: List = ['annotations']): - super().__init__( - splits=splits, - data_root=data_root, - gatherer=gatherer, - parser=parser, - dumper=dumper, - dataset_name=dataset_name, - nproc=nproc, - delete=delete) - self.crop_with_warp = crop_with_warp - self.jitter = jitter - self.jrx = jitter_ratio_x - self.jry = jitter_ratio_y - self.lepr = long_edge_pad_ratio - self.sepr = short_edge_pad_ratio - # Crop converter crops the images of textdet to patches - self.img_dir = 'textdet_imgs' - self.cropped_img_dir = 'textrecog_imgs' - self.crop_save_path = osp.join(self.data_root, self.cropped_img_dir) - mkdir_or_exist(self.crop_save_path) - for split in splits: - mkdir_or_exist(osp.join(self.crop_save_path, split)) - - def pack_instance(self, sample: Tuple, split: str) -> List: - """Crop patches from image. - - Args: - samples (Tuple): A tuple of (img_name, text). - split (str): The split of the instance. - - Return: - List: The list of cropped patches. - """ - - def get_box(instance: Dict) -> List: - if 'box' in instance: - return bbox2poly(instance['box']).tolist() - if 'poly' in instance: - return bbox2poly(poly2bbox(instance['poly'])).tolist() - - def get_poly(instance: Dict) -> List: - if 'poly' in instance: - return instance['poly'] - if 'box' in instance: - return bbox2poly(instance['box']).tolist() - - data_list = [] - img_path, instances = sample - img = mmcv.imread(img_path) - for i, instance in enumerate(instances): - if instance['ignore']: - continue - if self.crop_with_warp: - poly = get_poly(instance) - patch = warp_img(img, poly, self.jitter, self.jrx, self.jry) - else: - box = get_box(instance) - patch = crop_img(img, box, self.lepr, self.sepr) - if patch.shape[0] == 0 or patch.shape[1] == 0: - continue - text = instance['text'] - patch_name = osp.splitext( - osp.basename(img_path))[0] + f'_{i}' + osp.splitext( - osp.basename(img_path))[1] - dst_path = osp.join(self.crop_save_path, split, patch_name) - mmcv.imwrite(patch, dst_path) - rec_instance = dict( - instances=[dict(text=text)], - img_path=osp.join(self.cropped_img_dir, split, patch_name)) - data_list.append(rec_instance) - - return data_list - - def add_meta(self, sample: List) -> Dict: - # Since the TextRecogCropConverter packs all of the patches in a single - # image into a list, we need to flatten the list. - sample = [item for sublist in sample for item in sublist] - return super().add_meta(sample) - - -@DATA_CONVERTERS.register_module() -class WildReceiptConverter(BaseDataConverter): - """MMOCR only supports wildreceipt dataset for KIE task now. 
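
# [Editor's note] Sketch of the patch naming used in
# TextRecogCropConverter.pack_instance above: the i-th kept instance of
# an image becomes `<stem>_<i><ext>`. The path is illustrative.
import os.path as osp

img_path, i = 'data/totaltext/textdet_imgs/train/img_3.jpg', 0
stem, ext = osp.splitext(osp.basename(img_path))
print(stem + f'_{i}' + ext)  # -> img_3_0.jpg
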
This converter - converts the wildreceipt dataset from close set to open set. - - Args: - splits (List): A list of splits to be processed. - data_root (str): Path to the data root. - gatherer (Dict): Config dict for gathering the dataset files. - parser (Dict): Config dict for parsing the dataset annotations. - dumper (Dict): Config dict for dumping the dataset files. - nproc (int): Number of processes to process the data. - delete (Optional[List]): A list of files to be deleted after - conversion. Defaults to ['annotations]. - merge_bg_others (bool): If True, give the same label to "background" - class and "others" class. Defaults to True. - ignore_idx (int): Index for ``ignore`` class. Defaults to 0. - others_idx (int): Index for ``others`` class. Defaults to 25. - """ - - def __init__(self, - splits: List, - data_root: str, - gatherer: Dict, - parser: Dict, - dumper: Dict, - dataset_name: str, - nproc: int, - delete: Optional[List] = None, - merge_bg_others: bool = False, - ignore_idx: int = 0, - others_idx: int = 25): - self.ignore_idx = ignore_idx - self.others_idx = others_idx - self.merge_bg_others = merge_bg_others - parser.update(dict(ignore=ignore_idx)) - super().__init__( - splits=splits, - data_root=data_root, - gatherer=gatherer, - parser=parser, - dumper=dumper, - dataset_name=dataset_name, - nproc=nproc, - task='kie', - delete=delete) - - def add_meta(self, samples: List) -> List: - """No meta info is required for the wildreceipt dataset.""" - return samples - - def pack_instance(self, sample: str, split: str): - """Pack line-json str of close set to line-json str of open set. - - Args: - sample (str): The string to be deserialized to - the close set dictionary object. - split (str): The split of the instance. - """ - # Two labels at the same index of the following two lists - # make up a key-value pair. For example, in wildreceipt, - # closeset_key_inds[0] maps to "Store_name_key" - # and closeset_value_inds[0] maps to "Store_addr_value". 
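
# [Editor's note] With the default others_idx=25, the lists built below
# are closeset_key_inds = [2, 4, ..., 24] and closeset_value_inds =
# [1, 3, ..., 23]; a key label 2k pairs with the value label 2k-1, and
# the pair is assigned a shared `edge` id further down so openset KIE
# models can recover the key-value links.
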
- closeset_key_inds = list(range(2, self.others_idx, 2)) - closeset_value_inds = list(range(1, self.others_idx, 2)) - - openset_node_label_mapping = { - 'bg': 0, - 'key': 1, - 'value': 2, - 'others': 3 - } - if self.merge_bg_others: - openset_node_label_mapping['others'] = openset_node_label_mapping[ - 'bg'] - - closeset_obj = json.loads(sample) - openset_obj = { - 'file_name': closeset_obj['file_name'], - 'height': closeset_obj['height'], - 'width': closeset_obj['width'], - 'annotations': [] - } - - edge_idx = 1 - label_to_edge = {} - for anno in closeset_obj['annotations']: - label = anno['label'] - if label == self.ignore_idx: - anno['label'] = openset_node_label_mapping['bg'] - anno['edge'] = edge_idx - edge_idx += 1 - elif label == self.others_idx: - anno['label'] = openset_node_label_mapping['others'] - anno['edge'] = edge_idx - edge_idx += 1 - else: - edge = label_to_edge.get(label, None) - if edge is not None: - anno['edge'] = edge - if label in closeset_key_inds: - anno['label'] = openset_node_label_mapping['key'] - elif label in closeset_value_inds: - anno['label'] = openset_node_label_mapping['value'] - else: - tmp_key = 'key' - if label in closeset_key_inds: - label_with_same_edge = closeset_value_inds[ - closeset_key_inds.index(label)] - elif label in closeset_value_inds: - label_with_same_edge = closeset_key_inds[ - closeset_value_inds.index(label)] - tmp_key = 'value' - edge_counterpart = label_to_edge.get( - label_with_same_edge, None) - if edge_counterpart is not None: - anno['edge'] = edge_counterpart - else: - anno['edge'] = edge_idx - edge_idx += 1 - anno['label'] = openset_node_label_mapping[tmp_key] - label_to_edge[label] = anno['edge'] - - openset_obj['annotations'] = closeset_obj['annotations'] - - return json.dumps(openset_obj, ensure_ascii=False) diff --git a/mmocr/datasets/preparers/data_preparer.py b/mmocr/datasets/preparers/data_preparer.py index 0bca6777..89384b77 100644 --- a/mmocr/datasets/preparers/data_preparer.py +++ b/mmocr/datasets/preparers/data_preparer.py @@ -1,32 +1,39 @@ # Copyright (c) OpenMMLab. All rights reserved. +import copy +import os import os.path as osp -import time +import shutil +from typing import List, Optional, Union from mmengine import Registry -from mmengine.config import Config +from mmocr.utils.typing_utils import ConfigType, OptConfigType + +DATA_PREPARERS = Registry('data preparer') DATA_OBTAINERS = Registry('data_obtainer') -DATA_CONVERTERS = Registry('data_converter') +DATA_GATHERERS = Registry('data_gatherer') DATA_PARSERS = Registry('data_parser') DATA_DUMPERS = Registry('data_dumper') +DATA_PACKERS = Registry('data_packer') CFG_GENERATORS = Registry('cfg_generator') +@DATA_PREPARERS.register_module() class DatasetPreparer: """Base class of dataset preparer. Dataset preparer is used to prepare dataset for MMOCR. It mainly consists of three steps: - - 1. Obtain the dataset + 1. For each split: + - Obtain the dataset - Download - Extract - Move/Rename - 2. Process the dataset - - Parse original annotations - - Convert to mmocr format - - Dump the annotation file - - Clean useless files + - Gather the dataset + - Parse the dataset + - Pack the dataset to MMOCR format + - Dump the dataset + 2. Delete useless files 3. Generate the base config for this dataset After all these steps, the original datasets have been prepared for @@ -34,106 +41,169 @@ class DatasetPreparer: https://mmocr.readthedocs.io/en/dev-1.x/user_guides/dataset_prepare.html Args: - cfg_path (str): Path to dataset config file. 
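
# [Editor's note] One openset line as serialized by pack_instance above;
# file name, boxes and texts are illustrative. Labels follow
# openset_node_label_mapping (0=bg, 1=key, 2=value, 3=others), and the
# paired key/value annotations share edge id 3 here.
import json

line = json.dumps(
    {
        'file_name': 'image_files/Image_1/0/receipt_0.jpeg',
        'height': 1200,
        'width': 800,
        'annotations': [
            dict(box=[10, 10, 120, 10, 120, 30, 10, 30],
                 text='Subtotal:', label=1, edge=3),
            dict(box=[130, 10, 200, 10, 200, 30, 130, 30],
                 text='$1.99', label=2, edge=3),
        ]
    },
    ensure_ascii=False)
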
+ data_root (str): Root directory of data. dataset_name (str): Dataset name. task (str): Task type. Options are 'textdet', 'textrecog', 'textspotter', and 'kie'. Defaults to 'textdet'. nproc (int): Number of parallel processes. Defaults to 4. - overwrite_cfg (bool): Whether to overwrite the dataset config file if - it already exists. If False, Dataset Preparer will not generate new - config for datasets whose configs are already in base. + train_preparer (OptConfigType): cfg for train data prepare. It contains + the following keys: + - obtainer: cfg for data obtainer. + - gatherer: cfg for data gatherer. + - parser: cfg for data parser. + - packer: cfg for data packer. + - dumper: cfg for data dumper. + Defaults to None. + test_preparer (OptConfigType): cfg for test data prepare. Defaults to + None. + val_preparer (OptConfigType): cfg for val data prepare. Defaults to + None. + config_generator (OptConfigType): cfg for config generator. Defaults to + None. + delete (list[str], optional): List of files to be deleted. + Defaults to None. """ def __init__(self, - cfg_path: str, - dataset_name: str, + data_root: str, + dataset_name: str = '', task: str = 'textdet', nproc: int = 4, - overwrite_cfg: bool = False) -> None: - cfg_path = osp.join(cfg_path, dataset_name) + train_preparer: OptConfigType = None, + test_preparer: OptConfigType = None, + val_preparer: OptConfigType = None, + config_generator: OptConfigType = None, + delete: Optional[List[str]] = None) -> None: + self.data_root = data_root self.nproc = nproc self.task = task self.dataset_name = dataset_name - self.overwrite_cfg = overwrite_cfg - self.parse_meta(cfg_path) - self.parse_cfg(cfg_path) + self.train_preparer = train_preparer + self.test_preparer = test_preparer + self.val_preparer = val_preparer + self.config_generator = config_generator + self.delete = delete - def __call__(self): + def run(self, splits: Union[str, List] = ['train', 'test', 'val']) -> None: """Prepare the dataset.""" - if self.with_obtainer: - print('Obtaining Dataset...') - self.data_obtainer() - if self.with_converter: - print('Converting Dataset...') - self.data_converter() - if self.with_config_generator: - print('Generating base configs...') - self.config_generator() + if isinstance(splits, str): + splits = [splits] + assert set(splits).issubset(set(['train', 'test', + 'val'])), 'Invalid split name' + for split in splits: + self.loop(split, getattr(self, f'{split}_preparer')) + self.clean() + self.generate_config() - def parse_meta(self, cfg_path: str) -> None: - """Parse meta file. + @classmethod + def from_file(cls, cfg: ConfigType) -> 'DatasetPreparer': + """Create a DataPreparer from config file. Args: - cfg_path (str): Path to meta file. + cfg (ConfigType): A config used for building runner. Keys of + ``cfg`` can see :meth:`__init__`. + + Returns: + Runner: A DatasetPreparer build from ``cfg``. """ - try: - meta = Config.fromfile(osp.join(cfg_path, 'metafile.yml')) - except FileNotFoundError: + + cfg = copy.deepcopy(cfg) + data_preparer = cls( + data_root=cfg['data_root'], + dataset_name=cfg.get('dataset_name', ''), + task=cfg.get('task', 'textdet'), + nproc=cfg.get('nproc', 4), + train_preparer=cfg.get('train_preparer', None), + test_preparer=cfg.get('test_preparer', None), + val_preparer=cfg.get('val_preparer', None), + delete=cfg.get('delete', None), + config_generator=cfg.get('config_generator', None)) + return data_preparer + + def loop(self, split: str, cfg: ConfigType) -> None: + """Loop over the dataset. 
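
# [Editor's note] End-to-end usage sketch of the refactored preparer via
# the from_file/run methods above; the config path is illustrative, and
# dataset_name/task are set by hand here for the sketch.
from mmengine.config import Config

from mmocr.datasets.preparers import DatasetPreparer

cfg = Config.fromfile('dataset_zoo/totaltext/textdet.py')
cfg.dataset_name = 'totaltext'
cfg.task = 'textdet'
preparer = DatasetPreparer.from_file(cfg)
preparer.run(splits=['train', 'test'])
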
+
+        Args:
+            split (str): The split of the dataset.
+            cfg (ConfigType): A config used for building obtainer, gatherer,
+                parser, packer and dumper.
+        """
+        if cfg is None:
             return

-        assert self.task in meta['Data']['Tasks'], \
-            f'Task {self.task} not supported!'
-        # License related
-        if meta['Data']['License']['Type']:
-            print(f"\033[1;33;40mDataset Name: {meta['Name']}")
-            print(f"License Type: {meta['Data']['License']['Type']}")
-            print(f"License Link: {meta['Data']['License']['Link']}")
-            print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m")
-            print(
-                '\033[1;31;43mMMOCR does not own the dataset. Using this '
-                'dataset you must accept the license provided by the owners, '
-                'and cite the corresponding papers appropriately.')
-            print('If you do not agree with the above license, please cancel '
-                  'the progress immediately by pressing ctrl+c. Otherwise, '
-                  'you are deemed to accept the terms and conditions.\033[0m')
-            for i in range(5):
-                print(f'{5-i}...')
-                time.sleep(1)

-    def parse_cfg(self, cfg_path: str) -> None:
-        """Parse dataset config file.
+        # build obtainer and run it
+        obtainer = cfg.get('obtainer', None)
+        if obtainer:
+            print(f'Obtaining {split} Dataset...')
+            obtainer.setdefault('task', self.task)
+            obtainer.setdefault('data_root', self.data_root)
+            obtainer = DATA_OBTAINERS.build(obtainer)
+            obtainer()

-        Args:
-            cfg_path (str): Path to dataset config file.
-        """
-        cfg_path = osp.join(cfg_path, self.task + '.py')
-        assert osp.exists(cfg_path), f'Config file {cfg_path} not found!'
-        cfg = Config.fromfile(cfg_path)
+        # build gatherer
+        gatherer = cfg.get('gatherer', None)
+        parser = cfg.get('parser', None)
+        packer = cfg.get('packer', None)
+        dumper = cfg.get('dumper', None)
+        related = [gatherer, parser, packer, dumper]
+        if all(item is None for item in related):  # no data process
+            return
+        if not all(item is not None for item in related):
+            raise ValueError('gatherer, parser, packer and dumper should be '
+                             'either all None or all set')

-        if 'data_obtainer' in cfg:
-            cfg.data_obtainer.update(task=self.task)
-            self.data_obtainer = DATA_OBTAINERS.build(cfg.data_obtainer)
-        if 'data_converter' in cfg:
-            cfg.data_converter.update(
-                dict(nproc=self.nproc, dataset_name=self.dataset_name))
-            self.data_converter = DATA_CONVERTERS.build(cfg.data_converter)
-        if 'config_generator' in cfg:
-            cfg.config_generator.update(
-                dict(
-                    dataset_name=self.dataset_name,
-                    overwrite_cfg=self.overwrite_cfg))
-            self.config_generator = CFG_GENERATORS.build(cfg.config_generator)
+        print(f'Gathering {split} Dataset...')
+        gatherer.setdefault('split', split)
+        gatherer.setdefault('data_root', self.data_root)
+        gatherer.setdefault('ann_dir', 'annotations')
+        gatherer.setdefault('img_dir', osp.join(f'{self.task}_imgs', split))

-    @property
-    def with_obtainer(self) -> bool:
-        """bool: whether the data preparer has an obtainer"""
-        return getattr(self, 'data_obtainer', None) is not None
+        gatherer = DATA_GATHERERS.build(gatherer)
+        img_paths, ann_paths = gatherer()

-    @property
-    def with_converter(self) -> bool:
-        """bool: whether the data preparer has a converter"""
-        return getattr(self, 'data_converter', None) is not None
+        # build parser
+        print(f'Parsing {split} Images and Annotations...')
+        parser.setdefault('split', split)
+        parser.setdefault('nproc', self.nproc)
+        parser = DATA_PARSERS.build(parser)
+        # Convert dataset annotations to MMOCR format
+        samples = parser(img_paths, ann_paths)

-    @property
-    def with_config_generator(self) -> bool:
-        """bool: whether the data preparer has a config generator"""
-        return getattr(self, 'config_generator', None) is not None
+        # build packer
+        print(f'Packing {split} Annotations...')
+        packer.setdefault('split', split)
+        packer.setdefault('nproc', self.nproc)
+        packer.setdefault('data_root', self.data_root)
+        packer = DATA_PACKERS.build(packer)
+        samples = packer(samples)
+
+        # build dumper
+        print(f'Dumping {split} Annotations...')
+        # Dump annotation files
+        dumper.setdefault('task', self.task)
+        dumper.setdefault('split', split)
+        dumper.setdefault('data_root', self.data_root)
+        dumper = DATA_DUMPERS.build(dumper)
+        dumper(samples)
+
+    def generate_config(self):
+        """Build the config generator, if any, and generate base configs."""
+        if self.config_generator is None:
+            return
+        self.config_generator.setdefault('dataset_name', self.dataset_name)
+        self.config_generator.setdefault('data_root', self.data_root)
+        config_generator = CFG_GENERATORS.build(self.config_generator)
+        print('Generating base configs...')
+        config_generator()
+
+    def clean(self) -> None:
+        """Remove the files and directories listed in ``self.delete``."""
+        if self.delete is None:
+            return
+        for d in self.delete:
+            delete_file = osp.join(self.data_root, d)
+            if osp.exists(delete_file):
+                if osp.isdir(delete_file):
+                    shutil.rmtree(delete_file)
+                else:
+                    os.remove(delete_file)
diff --git a/mmocr/datasets/preparers/dumpers/__init__.py b/mmocr/datasets/preparers/dumpers/__init__.py
index 4dc93d9c..1a73468e 100644
--- a/mmocr/datasets/preparers/dumpers/__init__.py
+++ b/mmocr/datasets/preparers/dumpers/__init__.py
@@ -1,4 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from .dumpers import JsonDumper, WildreceiptOpensetDumper
+from .base import BaseDumper
+from .json_dumper import JsonDumper
+from .wild_receipt_openset_dumper import WildreceiptOpensetDumper

-__all__ = ['JsonDumper', 'WildreceiptOpensetDumper']
+__all__ = ['BaseDumper', 'JsonDumper', 'WildreceiptOpensetDumper']
diff --git a/mmocr/datasets/preparers/dumpers/base.py b/mmocr/datasets/preparers/dumpers/base.py
new file mode 100644
index 00000000..3b4416a8
--- /dev/null
+++ b/mmocr/datasets/preparers/dumpers/base.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Any
+
+
+class BaseDumper:
+    """Base class for data dumpers.
+
+    Args:
+        task (str): Task type. Options are 'textdet', 'textrecog',
+            'textspotter', and 'kie'. It is usually set automatically and
+            users do not need to set it manually in config file in most
+            cases.
+        split (str): It's the partition of the dataset. Options are 'train',
+            'val' or 'test'. It is usually set automatically and users do not
+            need to set it manually in config file in most cases.
+        data_root (str): The root directory of the image and
+            annotation. It is usually set automatically and users do not need
+            to set it manually in config file in most cases.
+    """
+
+    def __init__(self, task: str, split: str, data_root: str) -> None:
+        self.task = task
+        self.split = split
+        self.data_root = data_root
+
+    def __call__(self, data: Any) -> None:
+        """Call function.
+
+        Args:
+            data (Any): Data to be dumped.
+        """
+        self.dump(data)
+
+    def dump(self, data: Any) -> None:
+        raise NotImplementedError
diff --git a/mmocr/datasets/preparers/dumpers/dumpers.py b/mmocr/datasets/preparers/dumpers/dumpers.py
deleted file mode 100644
index f00b36da..00000000
--- a/mmocr/datasets/preparers/dumpers/dumpers.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import os.path as osp -from typing import Dict, List - -import mmengine - -from mmocr.utils import list_to_file -from ..data_preparer import DATA_DUMPERS - - -@DATA_DUMPERS.register_module() -class JsonDumper: - - def __init__(self, task: str) -> None: - self.task = task - - def dump(self, data: Dict, data_root: str, split: str) -> None: - """Dump data to json file. - - Args: - data (Dict): Data to be dumped. - data_root (str): Root directory of data. - split (str): Split of data. - cfg_path (str): Path to configs. Defaults to 'configs/'. - """ - - filename = f'{self.task}_{split}.json' - dst_file = osp.join(data_root, filename) - mmengine.dump(data, dst_file) - - -@DATA_DUMPERS.register_module() -class WildreceiptOpensetDumper: - - def __init__(self, task: str) -> None: - self.task = task - - def dump(self, data: List, data_root: str, split: str): - """Dump data to txt file. - - Args: - data (List): Data to be dumped. - data_root (str): Root directory of data. - split (str): Split of data. - """ - - filename = f'openset_{split}.txt' - dst_file = osp.join(data_root, filename) - list_to_file(dst_file, data) diff --git a/mmocr/datasets/preparers/dumpers/json_dumper.py b/mmocr/datasets/preparers/dumpers/json_dumper.py new file mode 100644 index 00000000..cbe6d5ba --- /dev/null +++ b/mmocr/datasets/preparers/dumpers/json_dumper.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Dict + +import mmengine + +from ..data_preparer import DATA_DUMPERS +from .base import BaseDumper + + +@DATA_DUMPERS.register_module() +class JsonDumper(BaseDumper): + """Dumper for json file.""" + + def dump(self, data: Dict) -> None: + """Dump data to json file. + + Args: + data (Dict): Data to be dumped. + """ + + filename = f'{self.task}_{self.split}.json' + dst_file = osp.join(self.data_root, filename) + mmengine.dump(data, dst_file) diff --git a/mmocr/datasets/preparers/dumpers/wild_receipt_openset_dumper.py b/mmocr/datasets/preparers/dumpers/wild_receipt_openset_dumper.py new file mode 100644 index 00000000..494bc00b --- /dev/null +++ b/mmocr/datasets/preparers/dumpers/wild_receipt_openset_dumper.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import List + +from mmocr.utils import list_to_file +from ..data_preparer import DATA_DUMPERS +from .base import BaseDumper + + +@DATA_DUMPERS.register_module() +class WildreceiptOpensetDumper(BaseDumper): + + def dump(self, data: List): + """Dump data to txt file. + + Args: + data (List): Data to be dumped. + """ + + filename = f'openset_{self.split}.txt' + dst_file = osp.join(self.data_root, filename) + list_to_file(dst_file, data) diff --git a/mmocr/datasets/preparers/gatherers/__init__.py b/mmocr/datasets/preparers/gatherers/__init__.py new file mode 100644 index 00000000..9a05c797 --- /dev/null +++ b/mmocr/datasets/preparers/gatherers/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from .base import BaseGatherer +from .mono_gatherer import MonoGatherer +from .naf_gatherer import NAFGatherer +from .pair_gatherer import PairGatherer + +__all__ = ['BaseGatherer', 'MonoGatherer', 'PairGatherer', 'NAFGatherer'] diff --git a/mmocr/datasets/preparers/gatherers/base.py b/mmocr/datasets/preparers/gatherers/base.py new file mode 100644 index 00000000..f982a1a5 --- /dev/null +++ b/mmocr/datasets/preparers/gatherers/base.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
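+# A gatherer collects, for a single split, the image paths and annotation
+# paths that the parser will consume. Subclasses return either
+# (img_dir, ann_path) when a single file holds all annotations, or
+# (img_paths, ann_paths) when annotations are stored per image.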
+import os.path as osp
+from typing import List, Optional, Tuple, Union
+
+
+class BaseGatherer:
+    """Base class for gatherer.
+
+    Note: Gatherer assumes that all the annotation files are in the same
+    directory and all the image files are in the same directory.
+
+    Args:
+        img_dir (str): The directory of the images. It is usually set
+            automatically to ``f'{task}_imgs/{split}'`` and users do not need
+            to set it manually in config file in most cases. When the image
+            files are not in the ``f'{task}_imgs/{split}'`` directory, users
+            should set it. Defaults to ''.
+        ann_dir (str): The directory of the annotation files. It is usually
+            set automatically to 'annotations' and users do not need to set it
+            manually in config file in most cases. When the annotation files
+            are not in the 'annotations' directory, users should set it.
+            Defaults to 'annotations'.
+        split (str, optional): The split to gather. It's the partition of the
+            dataset. Options are 'train', 'val' or 'test'. It is usually set
+            automatically and users do not need to set it manually in config
+            file in most cases. Defaults to None.
+        data_root (str, optional): The root directory of the image and
+            annotation. It is usually set automatically and users do not need
+            to set it manually in config file in most cases. Defaults to None.
+    """
+
+    def __init__(self,
+                 img_dir: str = '',
+                 ann_dir: str = 'annotations',
+                 split: Optional[str] = None,
+                 data_root: Optional[str] = None) -> None:
+        self.split = split
+        self.data_root = data_root
+        self.ann_dir = osp.join(data_root, ann_dir)
+        self.img_dir = osp.join(data_root, img_dir)
+
+    def __call__(self) -> Union[Tuple[List[str], List[str]], Tuple[str, str]]:
+        """The return value of the gatherer is a tuple of two lists or
+        strings.
+
+        The first element is the list of image paths or the directory of the
+        images. The second element is the list of annotation paths or the path
+        of the annotation file which contains all the annotations.
+        """
+        raise NotImplementedError
diff --git a/mmocr/datasets/preparers/gatherers/mono_gatherer.py b/mmocr/datasets/preparers/gatherers/mono_gatherer.py
new file mode 100644
index 00000000..0db6f854
--- /dev/null
+++ b/mmocr/datasets/preparers/gatherers/mono_gatherer.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Tuple
+
+from ..data_preparer import DATA_GATHERERS
+from .base import BaseGatherer
+
+
+@DATA_GATHERERS.register_module()
+class MonoGatherer(BaseGatherer):
+    """Gather the dataset file. Specifically for the case that only one
+    annotation file is needed. For example,
+
+            img_001.jpg \
+            img_002.jpg ---> train.json
+            img_003.jpg /
+
+    Args:
+        ann_name (str): The name of the annotation file.
+    """
+
+    def __init__(self, ann_name: str, **kwargs) -> None:
+        super().__init__(**kwargs)
+
+        self.ann_name = ann_name
+
+    def __call__(self) -> Tuple[str, str]:
+        """
+        Returns:
+            tuple(str, str): The directory of the images and the path of the
+                annotation file.
+        """
+
+        return (self.img_dir, osp.join(self.ann_dir, self.ann_name))
diff --git a/mmocr/datasets/preparers/gatherers/naf_gatherer.py b/mmocr/datasets/preparers/gatherers/naf_gatherer.py
new file mode 100644
index 00000000..ac0ab307
--- /dev/null
+++ b/mmocr/datasets/preparers/gatherers/naf_gatherer.py
@@ -0,0 +1,66 @@
+# Copyright (c) OpenMMLab. All rights reserved.
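+# NAF ships one JSON split file describing all splits instead of per-split
+# directories, so this gatherer reads that file and moves each split's
+# images out of a shared temporary directory before pairing them with
+# their JSON annotations.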
+import json
+import os
+import os.path as osp
+import shutil
+import warnings
+from typing import List, Tuple
+
+from ..data_preparer import DATA_GATHERERS
+from .base import BaseGatherer
+
+
+@DATA_GATHERERS.register_module()
+class NAFGatherer(BaseGatherer):
+    """Gather the dataset file from the NAF dataset. Specifically for the
+    case that there is a split file that contains the names of different
+    splits. For example,
+
+        img_001.jpg                           train: img_001.jpg
+        img_002.jpg ---> split_file     --->  test: img_002.jpg
+        img_003.jpg                           val: img_003.jpg
+
+    Args:
+        split_file (str, optional): The name of the split file. Defaults to
+            "data_split.json".
+        temp_dir (str, optional): The directory of the temporary images.
+            Defaults to "temp_images".
+    """
+
+    def __init__(self,
+                 split_file: str = 'data_split.json',
+                 temp_dir: str = 'temp_images',
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.temp_dir = temp_dir
+        self.split_file = split_file
+
+    def __call__(self) -> Tuple[List[str], List[str]]:
+        """
+        Returns:
+            tuple(list[str], list[str]): The list of image paths and the list
+                of annotation paths.
+        """
+
+        split_file = osp.join(self.data_root, self.split_file)
+        with open(split_file, 'r') as f:
+            split_data = json.load(f)
+        img_list = list()
+        ann_list = list()
+        # Rename the key 'valid' to 'val' to keep split names consistent
+        split_data['val'] = split_data.pop('valid')
+        if not osp.exists(self.img_dir):
+            os.makedirs(self.img_dir)
+        current_split_data = split_data[self.split]
+        for groups in current_split_data:
+            for img_name in current_split_data[groups]:
+                src_img = osp.join(self.data_root, self.temp_dir, img_name)
+                dst_img = osp.join(self.img_dir, img_name)
+                if not osp.exists(src_img):
+                    warnings.warn(f'{src_img} does not exist!')
+                    continue
+                # Move the image to its split directory
+                shutil.move(src_img, dst_img)
+                ann = osp.join(self.ann_dir, img_name.replace('.jpg', '.json'))
+                img_list.append(dst_img)
+                ann_list.append(ann)
+        return img_list, ann_list
diff --git a/mmocr/datasets/preparers/gatherers/pair_gatherer.py b/mmocr/datasets/preparers/gatherers/pair_gatherer.py
new file mode 100644
index 00000000..b86187b0
--- /dev/null
+++ b/mmocr/datasets/preparers/gatherers/pair_gatherer.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import re
+from typing import List, Optional, Tuple
+
+from mmocr.utils import list_files
+from ..data_preparer import DATA_GATHERERS
+from .base import BaseGatherer
+
+
+@DATA_GATHERERS.register_module()
+class PairGatherer(BaseGatherer):
+    """Gather the dataset files. Specifically for the paired annotations.
+    That is to say, each image has a corresponding annotation file. For
+    example,
+
+            img_1.jpg <---> gt_img_1.txt
+            img_2.jpg <---> gt_img_2.txt
+            img_3.jpg <---> gt_img_3.txt
+
+    Args:
+        img_suffixes (List[str]): File suffixes used for searching.
+        rule (Sequence): The rule for pairing the files. The first element is
+            the matching pattern for the file, and the second element is the
+            replacement pattern, which should be a regular expression. For
+            example, to map the image name img_1.jpg to the annotation name
+            gt_img_1.txt, the rule is
+            [r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']  # noqa: W605 E501
+
+    Note: PairGatherer assumes that each split annotation file is in the
+    corresponding split directory. For example, all the train annotation
+    files are in {ann_dir}/train.
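+
+    A minimal config sketch (the suffixes and the rule below are only
+    illustrative, assuming ICDAR-style file names):
+
+    .. code-block:: python
+
+        gatherer = dict(
+            type='PairGatherer',
+            img_suffixes=['.jpg'],
+            rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt'])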
+    """
+
+    def __init__(self,
+                 img_suffixes: Optional[List[str]] = None,
+                 rule: Optional[List[str]] = None,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.rule = rule
+        self.img_suffixes = img_suffixes
+        # ann_dir = {ann_root}/{ann_dir}/{split}
+        self.ann_dir = osp.join(self.ann_dir, self.split)
+
+    def __call__(self) -> Tuple[List[str], List[str]]:
+        """
+        Returns:
+            tuple(list[str], list[str]): The list of image paths and the list
+                of annotation paths.
+        """
+
+        img_list = list()
+        ann_list = list()
+        for img_path in list_files(self.img_dir, self.img_suffixes):
+            if not re.match(self.rule[0], osp.basename(img_path)):
+                continue
+            ann_name = re.sub(self.rule[0], self.rule[1],
+                              osp.basename(img_path))
+            ann_path = osp.join(self.ann_dir, ann_name)
+            img_list.append(img_path)
+            ann_list.append(ann_path)
+
+        return img_list, ann_list
diff --git a/mmocr/datasets/preparers/obtainers/__init__.py b/mmocr/datasets/preparers/obtainers/__init__.py
new file mode 100644
index 00000000..55d484d9
--- /dev/null
+++ b/mmocr/datasets/preparers/obtainers/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .naive_data_obtainer import NaiveDataObtainer
+
+__all__ = ['NaiveDataObtainer']
diff --git a/mmocr/datasets/preparers/data_obtainer.py b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py
similarity index 83%
rename from mmocr/datasets/preparers/data_obtainer.py
rename to mmocr/datasets/preparers/obtainers/naive_data_obtainer.py
index 98ffdfd1..e2a650a6 100644
--- a/mmocr/datasets/preparers/data_obtainer.py
+++ b/mmocr/datasets/preparers/obtainers/naive_data_obtainer.py
@@ -10,7 +10,7 @@ from typing import Dict, List, Optional, Tuple
 from mmengine import mkdir_or_exist

 from mmocr.utils import check_integrity, is_archive
-from .data_preparer import DATA_OBTAINERS
+from ..data_preparer import DATA_OBTAINERS

 ssl._create_default_https_context = ssl._create_unverified_context
@@ -24,8 +24,12 @@ class NaiveDataObtainer:
     Args:
         files (list[dict]): A list of file information.
         cache_path (str): The path to cache the downloaded files.
-        data_root (str): The root path of the dataset.
-        task (str): The task of the dataset.
+        data_root (str): The root path of the dataset. It is usually set
+            automatically and users do not need to set it manually in config
+            file in most cases.
+        task (str): The task of the dataset. It is usually set automatically
+            and users do not need to set it manually in config file
+            in most cases.

     def __init__(self, files: List[Dict], cache_path: str, data_root: str,
@@ -114,6 +118,23 @@ class NaiveDataObtainer:
             dst_path = osp.join(osp.dirname(src_path), zip_name)
         else:
             dst_path = osp.join(dst_path, zip_name)
+
+        extracted = False
+        if osp.exists(dst_path):
+            name = set(os.listdir(dst_path))
+            if '.finish' in name:
+                extracted = True
+            elif len(name) > 0:
+                while True:
+                    c = input(f'{dst_path} already exists when extracting '
+                              f'{zip_name}. Do you want to extract it '
+                              'again? (y/n) ')
+                    if c.lower() in ['y', 'n']:
+                        extracted = c.lower() == 'n'
+                        break
+        if extracted:
+            open(osp.join(dst_path, '.finish'), 'w').close()
+            print(f'{zip_name} has already been extracted. Skip.')
+            return
         mkdir_or_exist(dst_path)
         print(f'Extracting: {osp.basename(src_path)}')
         if src_path.endswith('.zip'):
@@ -136,6 +157,8 @@ class NaiveDataObtainer:
                 'Please install tarfile by running "pip install tarfile".')
             with tarfile.open(src_path, mode) as tar_ref:
                 tar_ref.extractall(dst_path)
+
+        open(osp.join(dst_path, '.finish'), 'w').close()
         if delete:
             os.remove(src_path)
diff --git a/mmocr/datasets/preparers/packers/__init__.py b/mmocr/datasets/preparers/packers/__init__.py
new file mode 100644
index 00000000..78eb55dc
--- /dev/null
+++ b/mmocr/datasets/preparers/packers/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base import BasePacker
+from .textdet_packer import TextDetPacker
+from .textrecog_packer import TextRecogCropPacker, TextRecogPacker
+from .textspotting_packer import TextSpottingPacker
+from .wildreceipt_packer import WildReceiptPacker
+
+__all__ = [
+    'BasePacker', 'TextDetPacker', 'TextRecogPacker', 'TextRecogCropPacker',
+    'TextSpottingPacker', 'WildReceiptPacker'
+]
diff --git a/mmocr/datasets/preparers/packers/base.py b/mmocr/datasets/preparers/packers/base.py
new file mode 100644
index 00000000..4826fd32
--- /dev/null
+++ b/mmocr/datasets/preparers/packers/base.py
@@ -0,0 +1,57 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import abstractmethod
+from typing import Dict, List, Tuple
+
+from mmengine import track_parallel_progress
+
+
+class BasePacker:
+    """Base class for packing the parsed annotation info to MMOCR format.
+
+    Args:
+        data_root (str): The root path of the dataset. It is usually set
+            automatically and users do not need to set it manually in config
+            file in most cases.
+        split (str): The split of the dataset. It is usually set automatically
+            and users do not need to set it manually in config file in most
+            cases.
+        nproc (int): Number of processes to process the data. Defaults to 1.
+            It is usually set automatically and users do not need to set it
+            manually in config file in most cases.
+    """
+
+    def __init__(self, data_root: str, split: str, nproc: int = 1) -> None:
+        self.data_root = data_root
+        self.split = split
+        self.nproc = nproc
+
+    @abstractmethod
+    def pack_instance(self, sample: Tuple) -> Dict:
+        """Pack the parsed annotation info to an MMOCR format instance.
+
+        Args:
+            sample (Tuple): A tuple of (img_path, instances).
+               - img_path (str): Path to the image file.
+               - instances (Sequence[Dict]): A list of converted annos.
+
+        Returns:
+            Dict: An MMOCR format instance.
+        """
+
+    @abstractmethod
+    def add_meta(self, sample: List) -> Dict:
+        """Add meta information to the sample.
+
+        Args:
+            sample (List): A list of samples of the dataset.
+
+        Returns:
+            Dict: A dict containing the meta information and samples.
+        """
+
+    def __call__(self, samples: List) -> Dict:
+        samples = track_parallel_progress(
+            self.pack_instance, samples, nproc=self.nproc)
+        samples = self.add_meta(samples)
+        return samples
diff --git a/mmocr/datasets/preparers/packers/textdet_packer.py b/mmocr/datasets/preparers/packers/textdet_packer.py
new file mode 100644
index 00000000..f66364a6
--- /dev/null
+++ b/mmocr/datasets/preparers/packers/textdet_packer.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
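+# A packer converts the (img_path, instances) tuples emitted by a parser
+# into the dict format consumed by OCRDataset; the exact textdet layout is
+# shown in the TextDetPacker docstring below.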
+import os.path as osp
+from typing import Dict, List, Tuple
+
+import mmcv
+
+from mmocr.utils import bbox2poly, poly2bbox
+from ..data_preparer import DATA_PACKERS
+from .base import BasePacker
+
+
+@DATA_PACKERS.register_module()
+class TextDetPacker(BasePacker):
+    """Text detection packer. It is used to pack the parsed annotation info
+    to:
+
+    .. code-block:: python
+
+        {
+            "metainfo":
+                {
+                    "dataset_type": "TextDetDataset",
+                    "task_name": "textdet",
+                    "category": [{"id": 0, "name": "text"}]
+                },
+            "data_list":
+                [
+                    {
+                        "img_path": "test_img.jpg",
+                        "height": 640,
+                        "width": 640,
+                        "instances":
+                            [
+                                {
+                                    "polygon": [0, 0, 0, 10, 10, 20, 20, 0],
+                                    "bbox": [0, 0, 10, 20],
+                                    "bbox_label": 0,
+                                    "ignore": False
+                                },
+                                // ...
+                            ]
+                    }
+                ]
+        }
+    """

+    def pack_instance(self, sample: Tuple, bbox_label: int = 0) -> Dict:
+        """Pack the parsed annotation info to an MMOCR format instance.
+
+        Args:
+            sample (Tuple): A tuple of (img_path, instances).
+               - img_path (str): Path to the image file.
+               - instances (Sequence[Dict]): A list of converted annos. Each
+                 element should be a dict with the following keys:
+
+                 - 'poly' or 'box'
+                 - 'ignore'
+                 - 'bbox_label' (optional)
+            bbox_label (int): The label index of the text. Defaults to 0.
+
+        Returns:
+            Dict: An MMOCR format instance.
+        """
+
+        img_path, instances = sample
+
+        img = mmcv.imread(img_path)
+        h, w = img.shape[:2]
+
+        packed_instances = list()
+        for instance in instances:
+            poly = instance.get('poly', None)
+            box = instance.get('box', None)
+            assert box or poly
+            packed_sample = dict(
+                polygon=poly if poly else list(
+                    bbox2poly(box).astype('float64')),
+                bbox=box if box else list(poly2bbox(poly).astype('float64')),
+                bbox_label=bbox_label,
+                ignore=instance['ignore'])
+            packed_instances.append(packed_sample)
+
+        packed_instances = dict(
+            instances=packed_instances,
+            img_path=osp.relpath(img_path, self.data_root),
+            height=h,
+            width=w)
+
+        return packed_instances
+
+    def add_meta(self, sample: List) -> Dict:
+        """Add meta information to the sample.
+
+        Args:
+            sample (List): A list of samples of the dataset.
+
+        Returns:
+            Dict: A dict containing the meta information and samples.
+        """
+        meta = {
+            'metainfo': {
+                'dataset_type': 'TextDetDataset',
+                'task_name': 'textdet',
+                'category': [{
+                    'id': 0,
+                    'name': 'text'
+                }]
+            },
+            'data_list': sample
+        }
+        return meta
diff --git a/mmocr/datasets/preparers/packers/textrecog_packer.py b/mmocr/datasets/preparers/packers/textrecog_packer.py
new file mode 100644
index 00000000..5f68e4fc
--- /dev/null
+++ b/mmocr/datasets/preparers/packers/textrecog_packer.py
@@ -0,0 +1,178 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Dict, List, Tuple
+
+import mmcv
+from mmengine import mkdir_or_exist
+
+from mmocr.utils import bbox2poly, crop_img, poly2bbox, warp_img
+from ..data_preparer import DATA_PACKERS
+from .base import BasePacker
+
+
+@DATA_PACKERS.register_module()
+class TextRecogPacker(BasePacker):
+    """Text recognition packer. It is used to pack the parsed annotation info
+    to:
+
+    .. code-block:: python
+
+        {
+            "metainfo":
+                {
+                    "dataset_type": "TextRecogDataset",
+                    "task_name": "textrecog",
+                },
+            "data_list":
+                [
+                    {
+                        "img_path": "textrecog_imgs/train/test_img.jpg",
+                        "instances":
+                            [
+                                {
+                                    "text": "GRAND"
+                                }
+                            ]
+                    }
+                ]
+        }
+    """
+
+    def pack_instance(self, sample: Tuple) -> Dict:
+        """Pack the text info to a recognition instance.
+
+        Args:
+            sample (Tuple): A tuple of (img_name, text).
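+                The image is assumed to have been cropped already, so
+                only its basename is kept and re-rooted under
+                ``textrecog_imgs/{split}``.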
+
+        Returns:
+            Dict: The packed instance.
+        """
+
+        img_name, text = sample
+        # TODO: remove hard code
+        packed_instance = dict(
+            instances=[dict(text=text)],
+            img_path=osp.join('textrecog_imgs', self.split,
+                              osp.basename(img_name)))
+
+        return packed_instance
+
+    def add_meta(self, sample: List) -> Dict:
+        """Add meta information to the sample.
+
+        Args:
+            sample (List): A list of samples of the dataset.
+
+        Returns:
+            Dict: A dict containing the meta information and samples.
+        """
+        meta = {
+            'metainfo': {
+                'dataset_type': 'TextRecogDataset',
+                'task_name': 'textrecog'
+            },
+            'data_list': sample
+        }
+        return meta
+
+
+@DATA_PACKERS.register_module()
+class TextRecogCropPacker(TextRecogPacker):
+    """Text recognition packer with image cropper. It is used to pack the
+    parsed annotation info and crop out the word images from the full-size
+    ones.
+
+    Args:
+        crop_with_warp (bool): Whether to crop the text from the original
+            image using opencv warpPerspective. Defaults to False.
+        jitter (bool): (Applicable when crop_with_warp=True)
+            Whether to jitter the box. Defaults to False.
+        jitter_ratio_x (float): (Applicable when crop_with_warp=True)
+            Horizontal jitter ratio relative to the height.
+        jitter_ratio_y (float): (Applicable when crop_with_warp=True)
+            Vertical jitter ratio relative to the height.
+        long_edge_pad_ratio (float): (Applicable when crop_with_warp=False)
+            The ratio of padding the long edge of the cropped image.
+            Defaults to 0.0.
+        short_edge_pad_ratio (float): (Applicable when crop_with_warp=False)
+            The ratio of padding the short edge of the cropped image.
+            Defaults to 0.0.
+    """
+
+    def __init__(self,
+                 crop_with_warp: bool = False,
+                 jitter: bool = False,
+                 jitter_ratio_x: float = 0.0,
+                 jitter_ratio_y: float = 0.0,
+                 long_edge_pad_ratio: float = 0.0,
+                 short_edge_pad_ratio: float = 0.0,
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.crop_with_warp = crop_with_warp
+        self.jitter = jitter
+        self.jrx = jitter_ratio_x
+        self.jry = jitter_ratio_y
+        self.lepr = long_edge_pad_ratio
+        self.sepr = short_edge_pad_ratio
+        # The word patches cropped from the full textdet images are saved
+        # under ``textrecog_imgs``
+        self.cropped_img_dir = 'textrecog_imgs'
+        self.crop_save_path = osp.join(self.data_root, self.cropped_img_dir)
+        mkdir_or_exist(self.crop_save_path)
+        mkdir_or_exist(osp.join(self.crop_save_path, self.split))
+
+    def pack_instance(self, sample: Tuple) -> List:
+        """Crop patches from image.
+
+        Args:
+            sample (Tuple): A tuple of (img_path, instances).
+
+        Returns:
+            List: The list of cropped patches.
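+
+            Each patch becomes an independent sample, e.g.
+            ``{'instances': [{'text': 'GRAND'}],
+            'img_path': 'textrecog_imgs/train/img_1_0.jpg'}``
+            (the file name here is illustrative).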
+        """
+
+        def get_box(instance: Dict) -> List:
+            if 'box' in instance:
+                return bbox2poly(instance['box']).tolist()
+            if 'poly' in instance:
+                return bbox2poly(poly2bbox(instance['poly'])).tolist()
+
+        def get_poly(instance: Dict) -> List:
+            if 'poly' in instance:
+                return instance['poly']
+            if 'box' in instance:
+                return bbox2poly(instance['box']).tolist()
+
+        data_list = []
+        img_path, instances = sample
+        img = mmcv.imread(img_path)
+        for i, instance in enumerate(instances):
+            if instance['ignore']:
+                continue
+            if self.crop_with_warp:
+                poly = get_poly(instance)
+                patch = warp_img(img, poly, self.jitter, self.jrx, self.jry)
+            else:
+                box = get_box(instance)
+                patch = crop_img(img, box, self.lepr, self.sepr)
+            if patch.shape[0] == 0 or patch.shape[1] == 0:
+                continue
+            text = instance['text']
+            patch_name = osp.splitext(
+                osp.basename(img_path))[0] + f'_{i}' + osp.splitext(
+                    osp.basename(img_path))[1]
+            dst_path = osp.join(self.crop_save_path, self.split, patch_name)
+            mmcv.imwrite(patch, dst_path)
+            rec_instance = dict(
+                instances=[dict(text=text)],
+                img_path=osp.join(self.cropped_img_dir, self.split,
+                                  patch_name))
+            data_list.append(rec_instance)
+
+        return data_list
+
+    def add_meta(self, sample: List) -> Dict:
+        # Since TextRecogCropPacker packs all the patches of a single image
+        # into a list, we need to flatten the list.
+        sample = [item for sublist in sample for item in sublist]
+        return super().add_meta(sample)
diff --git a/mmocr/datasets/preparers/packers/textspotting_packer.py b/mmocr/datasets/preparers/packers/textspotting_packer.py
new file mode 100644
index 00000000..b0fc695b
--- /dev/null
+++ b/mmocr/datasets/preparers/packers/textspotting_packer.py
@@ -0,0 +1,113 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Dict, List, Tuple
+
+import mmcv
+
+from mmocr.utils import bbox2poly, poly2bbox
+from ..data_preparer import DATA_PACKERS
+from .base import BasePacker
+
+
+@DATA_PACKERS.register_module()
+class TextSpottingPacker(BasePacker):
+    """Text spotting packer. It is used to pack the parsed annotation info to:
+
+    .. code-block:: python
+
+        {
+            "metainfo":
+                {
+                    "dataset_type": "TextSpottingDataset",
+                    "task_name": "textspotting",
+                    "category": [{"id": 0, "name": "text"}]
+                },
+            "data_list":
+                [
+                    {
+                        "img_path": "test_img.jpg",
+                        "height": 640,
+                        "width": 640,
+                        "instances":
+                            [
+                                {
+                                    "polygon": [0, 0, 0, 10, 10, 20, 20, 0],
+                                    "bbox": [0, 0, 10, 20],
+                                    "bbox_label": 0,
+                                    "ignore": False,
+                                    "text": "mmocr"
+                                },
+                                // ...
+                            ]
+                    }
+                ]
+        }
+    """
+
+    def pack_instance(self, sample: Tuple, bbox_label: int = 0) -> Dict:
+        """Pack the parsed annotation info to an MMOCR format instance.
+
+        Args:
+            sample (Tuple): A tuple of (img_path, instances).
+               - img_path (str): Path to image file.
+               - instances (Sequence[Dict]): A list of converted annos. Each
+                 element should be a dict with the following keys:
+                 - 'poly' or 'box'
+                 - 'text'
+                 - 'ignore'
+                 - 'bbox_label' (optional)
+            bbox_label (int): The label index of the text. Defaults to 0.
+
+        Returns:
+            Dict: An MMOCR format instance.
+        """
+
+        img_path, instances = sample
+
+        img = mmcv.imread(img_path)
+        h, w = img.shape[:2]
+
+        packed_instances = list()
+        for instance in instances:
+            assert 'text' in instance, 'Text is not found in the instance.'
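+            # Either a polygon or a box must be present; whichever one is
+            # missing is derived from the other below.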
+            poly = instance.get('poly', None)
+            box = instance.get('box', None)
+            assert box or poly
+            packed_sample = dict(
+                polygon=poly if poly else list(
+                    bbox2poly(box).astype('float64')),
+                bbox=box if box else list(poly2bbox(poly).astype('float64')),
+                bbox_label=bbox_label,
+                ignore=instance['ignore'],
+                text=instance['text'])
+            packed_instances.append(packed_sample)
+
+        packed_instances = dict(
+            instances=packed_instances,
+            img_path=osp.relpath(img_path, self.data_root),
+            height=h,
+            width=w)
+
+        return packed_instances
+
+    def add_meta(self, sample: List) -> Dict:
+        """Add meta information to the sample.
+
+        Args:
+            sample (List): A list of samples of the dataset.
+
+        Returns:
+            Dict: A dict containing the meta information and samples.
+        """
+        meta = {
+            'metainfo': {
+                'dataset_type': 'TextSpottingDataset',
+                'task_name': 'textspotting',
+                'category': [{
+                    'id': 0,
+                    'name': 'text'
+                }]
+            },
+            'data_list': sample
+        }
+        return meta
diff --git a/mmocr/datasets/preparers/packers/wildreceipt_packer.py b/mmocr/datasets/preparers/packers/wildreceipt_packer.py
new file mode 100644
index 00000000..efdaf872
--- /dev/null
+++ b/mmocr/datasets/preparers/packers/wildreceipt_packer.py
@@ -0,0 +1,112 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+from typing import List
+
+from ..data_preparer import DATA_PACKERS
+from .base import BasePacker
+
+
+@DATA_PACKERS.register_module()
+class WildReceiptPacker(BasePacker):
+    """Pack the wildreceipt annotation to MMOCR format.
+
+    Args:
+        merge_bg_others (bool): If True, give the same label to "background"
+            class and "others" class. Defaults to False.
+        ignore_idx (int): Index for ``ignore`` class. Defaults to 0.
+        others_idx (int): Index for ``others`` class. Defaults to 25.
+    """
+
+    def __init__(self,
+                 merge_bg_others: bool = False,
+                 ignore_idx: int = 0,
+                 others_idx: int = 25,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+
+        self.ignore_idx = ignore_idx
+        self.others_idx = others_idx
+        self.merge_bg_others = merge_bg_others
+
+    def add_meta(self, samples: List) -> List:
+        """No meta info is required for the wildreceipt dataset."""
+        return samples
+
+    def pack_instance(self, sample: str) -> str:
+        """Pack a line-json string in the closeset format into a line-json
+        string in the openset format.
+
+        Args:
+            sample (str): The string to be deserialized to
+                the closeset dictionary object.
+        """
+        # Two labels at the same index of the following two lists
+        # make up a key-value pair. For example, in wildreceipt,
+        # closeset_key_inds[0] maps to "Store_name_key"
+        # and closeset_value_inds[0] maps to "Store_addr_value".
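+        # With the default others_idx=25, the key indices are
+        # 2, 4, ..., 24 and the value indices are 1, 3, ..., 23.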
+ closeset_key_inds = list(range(2, self.others_idx, 2)) + closeset_value_inds = list(range(1, self.others_idx, 2)) + + openset_node_label_mapping = { + 'bg': 0, + 'key': 1, + 'value': 2, + 'others': 3 + } + if self.merge_bg_others: + openset_node_label_mapping['others'] = openset_node_label_mapping[ + 'bg'] + + closeset_obj = json.loads(sample) + openset_obj = { + 'file_name': + closeset_obj['file_name'].replace(self.data_root + '/', ''), + 'height': + closeset_obj['height'], + 'width': + closeset_obj['width'], + 'annotations': [] + } + + edge_idx = 1 + label_to_edge = {} + for anno in closeset_obj['annotations']: + label = anno['label'] + if label == self.ignore_idx: + anno['label'] = openset_node_label_mapping['bg'] + anno['edge'] = edge_idx + edge_idx += 1 + elif label == self.others_idx: + anno['label'] = openset_node_label_mapping['others'] + anno['edge'] = edge_idx + edge_idx += 1 + else: + edge = label_to_edge.get(label, None) + if edge is not None: + anno['edge'] = edge + if label in closeset_key_inds: + anno['label'] = openset_node_label_mapping['key'] + elif label in closeset_value_inds: + anno['label'] = openset_node_label_mapping['value'] + else: + tmp_key = 'key' + if label in closeset_key_inds: + label_with_same_edge = closeset_value_inds[ + closeset_key_inds.index(label)] + elif label in closeset_value_inds: + label_with_same_edge = closeset_key_inds[ + closeset_value_inds.index(label)] + tmp_key = 'value' + edge_counterpart = label_to_edge.get( + label_with_same_edge, None) + if edge_counterpart is not None: + anno['edge'] = edge_counterpart + else: + anno['edge'] = edge_idx + edge_idx += 1 + anno['label'] = openset_node_label_mapping[tmp_key] + label_to_edge[label] = anno['edge'] + + openset_obj['annotations'] = closeset_obj['annotations'] + + return json.dumps(openset_obj, ensure_ascii=False) diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index d5d95cca..9b818863 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseParser from .coco_parser import COCOTextDetAnnParser from .funsd_parser import FUNSDTextDetAnnParser from .icdar_txt_parser import (ICDARTxtTextDetAnnParser, @@ -10,7 +11,7 @@ from .totaltext_parser import TotaltextTextDetAnnParser from .wildreceipt_parser import WildreceiptKIEAnnParser __all__ = [ - 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser', + 'BaseParser', 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser', 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser', 'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser', 'SROIETextDetAnnParser', 'NAFAnnParser' diff --git a/mmocr/datasets/preparers/parsers/base.py b/mmocr/datasets/preparers/parsers/base.py index 58bc35b1..dfe79e15 100644 --- a/mmocr/datasets/preparers/parsers/base.py +++ b/mmocr/datasets/preparers/parsers/base.py @@ -1,67 +1,87 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import abstractmethod -from functools import partial -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Tuple, Union -from mmengine import track_parallel_progress +from mmocr.utils import track_parallel_progress_multi_args class BaseParser: """Base class for parsing annotations. Args: - data_root (str, optional): Path to the data root. Defaults to None. - nproc (int, optional): Number of processes. Defaults to 1. 
+        split (str): The split of the dataset. It is usually set automatically
+            and users do not need to set it manually in config file in most
+            cases.
+        nproc (int): Number of processes to process the data. Defaults to 1.
+            It is usually set automatically and users do not need to set it
+            manually in config file in most cases.
     """

-    def __init__(self,
-                 data_root: Optional[str] = None,
-                 nproc: int = 1) -> None:
-        self.data_root = data_root
+    def __init__(self, split: str, nproc: int = 1) -> None:
         self.nproc = nproc
+        self.split = split

-    def __call__(self, files: List[Tuple], split: str) -> List:
+    def __call__(self, img_paths: Union[List[str], str],
+                 ann_paths: Union[List[str], str]) -> List[Tuple]:
         """Parse annotations.

         Args:
-            files (List[Tuple]): A list of a tuple of
-                (image_path, annotation_path).
-            split (str): The split of the dataset.
+            img_paths (str or list[str]): The list of image paths or the
+                directory of the images.
+            ann_paths (str or list[str]): The list of annotation paths or the
+                path of the annotation file which contains all the
+                annotations.

         Returns:
             List: A list of a tuple of (image_path, instances)
         """
-        samples = self.parse_files(files, split)
+        samples = self.parse_files(img_paths, ann_paths)
         return samples

-    def parse_files(self, files: List[Tuple], split: str) -> List[Tuple]:
+    def parse_files(self, img_paths: Union[List[str], str],
+                    ann_paths: Union[List[str], str]) -> List[Tuple]:
         """Convert annotations to MMOCR format.

         Args:
-            files (Tuple): A list of tuple of path to image and annotation.
+            img_paths (str or list[str]): The list of image paths or the
+                directory of the images.
+            ann_paths (str or list[str]): The list of annotation paths or the
+                path of the annotation file which contains all the
+                annotations.

         Returns:
-            List[Tuple]: A list of a tuple of (image_path, instances)
+            List[Tuple]: A list of a tuple of (image_path, instances).
+
+            - img_path (str): The path of the image file, which can be read
+              directly by opencv.
+            - instances (list[dict]): A list of parsed annotations, each of
+              which should contain the following keys:
+
+              - 'poly' or 'box' (textdet or textspotting)
+              - 'text' (textspotting or textrecog)
+              - 'ignore' (all tasks)
         """
-        func = partial(self.parse_file, split=split)
-        samples = track_parallel_progress(func, files, nproc=self.nproc)
+        samples = track_parallel_progress_multi_args(
+            self.parse_file, (img_paths, ann_paths), nproc=self.nproc)
         return samples

     @abstractmethod
-    def parse_file(self, file: Tuple, split: str) -> Tuple:
+    def parse_file(self, img_path: str, ann_path: str) -> Tuple:
         """Convert annotation for a single image.

         Args:
-            file (Tuple): A tuple of path to image and annotation
-            split (str): Current split.
+            img_path (str): The path of the image.
+            ann_path (str): The path of the annotation.

         Returns:
-            Tuple: A tuple of (img_path, instance). Instance is a list of dict
-                containing parsed annotations, which should contain the
-                following keys:
-                - 'poly' or 'box' (textdet or textspotting)
-                - 'text' (textspotting or textrecog)
-                - 'ignore' (all task)
+            Tuple: A tuple of (img_path, instances).
+
+            - img_path (str): The path of the image file, which can be read
+              directly by opencv.
+            - instances (list[dict]): A list of parsed annotations, each of
+              which should contain the following keys:
+
+              - 'poly' or 'box' (textdet or textspotting)
+              - 'text' (textspotting or textrecog)
+              - 'ignore' (all tasks)

         Examples:
             An example of returned values:
diff --git a/mmocr/datasets/preparers/parsers/coco_parser.py b/mmocr/datasets/preparers/parsers/coco_parser.py
index 0d78940a..b8b26d1f 100644
--- a/mmocr/datasets/preparers/parsers/coco_parser.py
+++ b/mmocr/datasets/preparers/parsers/coco_parser.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os.path as osp
-from typing import Dict, Tuple
+from typing import List

 from mmdet.datasets.api_wrappers import COCO

@@ -21,25 +21,25 @@ class COCOTextDetAnnParser(BaseParser):
     """

     def __init__(self,
-                 data_root: str = None,
+                 split: str,
                  nproc: int = 1,
                  variant: str = 'standard') -> None:
-        super().__init__(nproc=nproc, data_root=data_root)
+        super().__init__(nproc=nproc, split=split)
         assert variant in ['standard', 'cocotext', 'textocr'], \
             f'variant {variant} is not supported'
         self.variant = variant

-    def parse_files(self, files: Tuple, split: str = None) -> Dict:
+    def parse_files(self, img_dir: str, ann_path: str) -> List:
         """Parse single annotation."""
         samples = list()
-        coco = COCO(files)
+        coco = COCO(ann_path)
         if self.variant == 'cocotext' or self.variant == 'textocr':
             # cocotext stores both 'train' and 'val' split in one annotation
             # file, and uses the 'set' field to distinguish them.
             if self.variant == 'cocotext':
                 for img in coco.dataset['imgs']:
-                    if split == coco.dataset['imgs'][img]['set']:
+                    if self.split == coco.dataset['imgs'][img]['set']:
                         coco.imgs[img] = coco.dataset['imgs'][img]
             # textocr stores 'train' and 'val'split separately
             elif self.variant == 'textocr':
@@ -60,8 +60,6 @@ class COCOTextDetAnnParser(BaseParser):
             img_info = coco.load_imgs([img_id])[0]
             img_info['img_id'] = img_id
             img_path = img_info['file_name']
-            if self.data_root is not None:
-                img_path = osp.join(self.data_root, img_path)
             ann_ids = coco.get_ann_ids(img_ids=[img_id])
             if len(ann_ids) == 0:
                 continue
@@ -96,5 +94,6 @@ class COCOTextDetAnnParser(BaseParser):
                 instances.append(
                     dict(
                         poly=ann['points'], text=text, ignore=text == '.'))
-        samples.append((img_path, instances))
+        samples.append((osp.join(img_dir,
+                                 osp.basename(img_path)), instances))
         return samples
diff --git a/mmocr/datasets/preparers/parsers/funsd_parser.py b/mmocr/datasets/preparers/parsers/funsd_parser.py
index 6cb6651d..f8da6aa4 100644
--- a/mmocr/datasets/preparers/parsers/funsd_parser.py
+++ b/mmocr/datasets/preparers/parsers/funsd_parser.py
@@ -17,17 +17,13 @@ class FUNSDTextDetAnnParser(BaseParser):
             to 1.
""" - def __init__(self, nproc: int = 1) -> None: - super().__init__(nproc=nproc) - - def parse_file(self, file: Tuple, split: str) -> Tuple: + def parse_file(self, img_path: str, ann_path: str) -> Tuple: """Parse single annotation.""" - img_file, json_file = file instances = list() - for poly, text, ignore in self.loader(json_file): + for poly, text, ignore in self.loader(ann_path): instances.append(dict(poly=poly, text=text, ignore=ignore)) - return img_file, instances + return img_path, instances def loader(self, file_path: str): with open(file_path, 'r') as f: diff --git a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py index 242b26d1..153d7d46 100644 --- a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py +++ b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp from typing import List, Optional, Tuple from mmocr.utils import bbox2poly @@ -35,22 +36,21 @@ class ICDARTxtTextDetAnnParser(BaseParser): ignore: str = '###', format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans', encoding: str = 'utf-8', - nproc: int = 1, remove_strs: Optional[List[str]] = None, - mode: str = None) -> None: + mode: str = None, + **kwargs) -> None: self.sep = separator self.format = format self.encoding = encoding self.ignore = ignore self.mode = mode self.remove_strs = remove_strs - super().__init__(nproc=nproc) + super().__init__(**kwargs) - def parse_file(self, file: Tuple, split: str) -> Tuple: + def parse_file(self, img_path: str, ann_path: str) -> Tuple: """Parse single annotation.""" - img_file, txt_file = file instances = list() - for anno in self.loader(txt_file, self.sep, self.format, + for anno in self.loader(ann_path, self.sep, self.format, self.encoding): anno = list(anno.values()) if self.remove_strs is not None: @@ -66,7 +66,7 @@ class ICDARTxtTextDetAnnParser(BaseParser): instances.append( dict(poly=poly, text=text, ignore=text == self.ignore)) - return img_file, instances + return img_path, instances @DATA_PARSERS.register_module() @@ -97,21 +97,21 @@ class ICDARTxtTextRecogAnnParser(BaseParser): ignore: str = '#', format: str = 'img,text', encoding: str = 'utf-8', - nproc: int = 1, - remove_strs: Optional[List[str]] = ['"']) -> None: + remove_strs: Optional[List[str]] = ['"'], + **kwargs) -> None: self.sep = separator self.format = format self.encoding = encoding self.ignore = ignore self.remove_strs = remove_strs - super().__init__(nproc=nproc) + super().__init__(**kwargs) - def parse_files(self, files: str, split: str) -> List: + def parse_files(self, img_dir: str, ann_path: str) -> List: """Parse annotations.""" - assert isinstance(files, str) + assert isinstance(ann_path, str) samples = list() for anno in self.loader( - file_path=files, + file_path=ann_path, format=self.format, encoding=self.encoding, separator=self.sep): @@ -122,6 +122,6 @@ class ICDARTxtTextRecogAnnParser(BaseParser): if text == self.ignore: continue img_name = anno['img'] - samples.append((img_name, text)) + samples.append((osp.join(img_dir, osp.basename(img_name)), text)) return samples diff --git a/mmocr/datasets/preparers/parsers/naf_parser.py b/mmocr/datasets/preparers/parsers/naf_parser.py index 1e9a6164..bdcbf59b 100644 --- a/mmocr/datasets/preparers/parsers/naf_parser.py +++ b/mmocr/datasets/preparers/parsers/naf_parser.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import json -from typing import Dict, List, Tuple +from typing import List, Tuple import numpy as np @@ -28,32 +28,28 @@ class NAFAnnParser(BaseParser): "" (empty string) is if the field was blank Args: - data_root (str): Path to the dataset root. ignore (list(str)): The text of the ignored instances. Default: ['#']. det (bool): Whether to parse the detection annotation. Default: True. If False, the parser will consider special case in NAF dataset where the transcription is not available. - nproc (int): Number of processes to load the data. Default: 1. """ def __init__(self, - data_root: str, ignore: List[str] = ['#'], det: bool = True, - nproc: int = 1) -> None: + **kwargs) -> None: self.ignore = ignore self.det = det - super().__init__(data_root=data_root, nproc=nproc) + super().__init__(**kwargs) - def parse_file(self, file: Tuple, split: str) -> Dict: + def parse_file(self, img_path: str, ann_path: str) -> Tuple: """Convert single annotation.""" - img_file, json_file = file instances = list() - for poly, text in self.loader(json_file): + for poly, text in self.loader(ann_path): instances.append( dict(poly=poly, text=text, ignore=text in self.ignore)) - return img_file, instances + return img_path, instances def loader(self, file_path: str) -> str: """Load the annotation of the NAF dataset. diff --git a/mmocr/datasets/preparers/parsers/sroie_parser.py b/mmocr/datasets/preparers/parsers/sroie_parser.py index 9f97ad43..551e72c4 100644 --- a/mmocr/datasets/preparers/parsers/sroie_parser.py +++ b/mmocr/datasets/preparers/parsers/sroie_parser.py @@ -31,6 +31,7 @@ class SROIETextDetAnnParser(BaseParser): """ def __init__(self, + split: str, separator: str = ',', ignore: str = '###', format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans', @@ -44,16 +45,15 @@ class SROIETextDetAnnParser(BaseParser): self.ignore = ignore self.mode = mode self.remove_strs = remove_strs - super().__init__(nproc=nproc) + super().__init__(nproc=nproc, split=split) - def parse_file(self, file: Tuple, split: str) -> Tuple: + def parse_file(self, img_path: str, ann_path: str) -> Tuple: """Parse single annotation.""" - img_file, txt_file = file instances = list() try: # there might be some illegal symbols in the annotation # which cannot be parsed by loader - for anno in self.loader(txt_file, self.sep, self.format, + for anno in self.loader(ann_path, self.sep, self.format, self.encoding): anno = list(anno.values()) if self.remove_strs is not None: @@ -71,4 +71,4 @@ class SROIETextDetAnnParser(BaseParser): except Exception: pass - return img_file, instances + return img_path, instances diff --git a/mmocr/datasets/preparers/parsers/svt_parser.py b/mmocr/datasets/preparers/parsers/svt_parser.py index 2cd28522..1a34d776 100644 --- a/mmocr/datasets/preparers/parsers/svt_parser.py +++ b/mmocr/datasets/preparers/parsers/svt_parser.py @@ -17,15 +17,13 @@ class SVTTextDetAnnParser(BaseParser): to 1. 
""" - def __init__(self, data_root: str = None, nproc: int = 1) -> None: - super().__init__(data_root=data_root, nproc=nproc) - - def parse_files(self, files: str, split: str) -> List: + def parse_files(self, img_dir: str, ann_path: str) -> List: """Parse annotations.""" - assert isinstance(files, str) + assert isinstance(ann_path, str) samples = list() - for img_name, instance in self.loader(files): - samples.append((img_name, instance)) + for img_name, instance in self.loader(ann_path): + samples.append((osp.join(img_dir, + osp.basename(img_name)), instance)) return samples @@ -45,8 +43,7 @@ class SVTTextDetAnnParser(BaseParser): tree = ET.parse(file_path) root = tree.getroot() for image in root.findall('image'): - image_name = osp.join(self.data_root, 'textdet_imgs', - image.find('imageName').text) + image_name = image.find('imageName').text instances = list() for rectangle in image.find('taggedRectangles'): x = int(rectangle.get('x')) diff --git a/mmocr/datasets/preparers/parsers/totaltext_parser.py b/mmocr/datasets/preparers/parsers/totaltext_parser.py index 1a7d65c5..a792e152 100644 --- a/mmocr/datasets/preparers/parsers/totaltext_parser.py +++ b/mmocr/datasets/preparers/parsers/totaltext_parser.py @@ -23,22 +23,18 @@ class TotaltextTextDetAnnParser(BaseParser): nproc (int): Number of processes to load the data. Default: 1. """ - def __init__(self, - data_root: str, - ignore: str = '#', - nproc: int = 1) -> None: + def __init__(self, ignore: str = '#', **kwargs) -> None: self.ignore = ignore - super().__init__(data_root=data_root, nproc=nproc) + super().__init__(**kwargs) - def parse_file(self, file: Tuple, split: str) -> Dict: + def parse_file(self, img_path: str, ann_path: str) -> Dict: """Convert single annotation.""" - img_file, txt_file = file instances = list() - for poly, text in self.loader(txt_file): + for poly, text in self.loader(ann_path): instances.append( dict(poly=poly, text=text, ignore=text == self.ignore)) - return img_file, instances + return img_path, instances def loader(self, file_path: str) -> str: """The annotation of the totaltext dataset may be stored in multiple diff --git a/mmocr/datasets/preparers/parsers/wildreceipt_parser.py b/mmocr/datasets/preparers/parsers/wildreceipt_parser.py index b1a95236..509677cf 100644 --- a/mmocr/datasets/preparers/parsers/wildreceipt_parser.py +++ b/mmocr/datasets/preparers/parsers/wildreceipt_parser.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import json import os.path as osp -from typing import Dict, Tuple +from typing import Dict from mmocr.utils import list_from_file from ..data_preparer import DATA_PARSERS @@ -30,21 +30,18 @@ class WildreceiptTextDetAnnParser(BaseParser): to 1. 
""" - def __init__(self, - data_root: str, - ignore: int = 0, - nproc: int = 1) -> None: + def __init__(self, ignore: int = 0, **kwargs) -> None: self.ignore = ignore - super().__init__(data_root=data_root, nproc=nproc) + super().__init__(**kwargs) - def parse_files(self, files: Tuple, split: str) -> Dict: + def parse_files(self, img_dir: str, ann_path) -> Dict: """Convert single annotation.""" - closeset_lines = list_from_file(files) + closeset_lines = list_from_file(ann_path) samples = list() for line in closeset_lines: instances = list() line = json.loads(line) - img_file = osp.join(self.data_root, line['file_name']) + img_file = osp.join(img_dir, osp.basename(line['file_name'])) for anno in line['annotations']: poly = anno['box'] text = anno['text'] @@ -72,21 +69,23 @@ class WildreceiptKIEAnnParser(BaseParser): ]} Args: - data_root (str): The root path of the dataset. ignore (int): The label to be ignored. Defaults to 0. nproc (int): The number of processes to parse the annotation. Defaults to 1. """ - def __init__(self, - data_root: str, - ignore: int = 0, - nproc: int = 1) -> None: + def __init__(self, ignore: int = 0, **kwargs) -> None: self.ignore = ignore - super().__init__(data_root=data_root, nproc=nproc) + super().__init__(**kwargs) - def parse_files(self, files: Tuple, split: str) -> Dict: + def parse_files(self, img_dir: str, ann_path: str) -> Dict: """Convert single annotation.""" - closeset_lines = list_from_file(files) + closeset_lines = list_from_file(ann_path) + samples = list() + for line in closeset_lines: + json_line = json.loads(line) + img_file = osp.join(img_dir, osp.basename(json_line['file_name'])) + json_line['file_name'] = img_file + samples.append(json.dumps(json_line)) - return closeset_lines + return samples diff --git a/mmocr/utils/__init__.py b/mmocr/utils/__init__.py index 8bbaccd9..3e4fb6fb 100644 --- a/mmocr/utils/__init__.py +++ b/mmocr/utils/__init__.py @@ -19,6 +19,7 @@ from .polygon_utils import (boundary_iou, crop_polygon, is_poly_inside_rect, poly_union, polys2shapely, rescale_polygon, rescale_polygons, shapely2poly, sort_points, sort_vertex, sort_vertex8) +from .processing import track_parallel_progress_multi_args from .setup_env import register_all_modules from .string_utils import StringStripper from .transform_utils import remove_pipeline_elements @@ -48,5 +49,6 @@ __all__ = [ 'OptTensor', 'ColorType', 'OptKIESampleList', 'KIESampleList', 'is_archive', 'check_integrity', 'list_files', 'get_md5', 'InstanceList', 'LabelList', 'OptInstanceList', 'OptLabelList', 'RangeType', - 'remove_pipeline_elements', 'bezier2poly', 'poly2bezier' + 'remove_pipeline_elements', 'bezier2poly', 'poly2bezier', + 'track_parallel_progress_multi_args' ] diff --git a/mmocr/utils/processing.py b/mmocr/utils/processing.py new file mode 100644 index 00000000..2da6ff2c --- /dev/null +++ b/mmocr/utils/processing.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import sys +from collections.abc import Iterable + +from mmengine.utils.progressbar import ProgressBar, init_pool + + +def track_parallel_progress_multi_args(func, + args, + nproc, + initializer=None, + initargs=None, + bar_width=50, + chunksize=1, + skip_first=False, + file=sys.stdout): + """Track the progress of parallel task execution with a progress bar. + + The built-in :mod:`multiprocessing` module is used for process pools and + tasks are done with :func:`Pool.map` or :func:`Pool.imap_unordered`. + + Args: + func (callable): The function to be applied to each task. 
+        args (tuple[Iterable]): A tuple of iterables of equal length. The
+            i-th task is built by taking the i-th element of each iterable.
+        nproc (int): Process (worker) number.
+        initializer (None or callable): Refer to :class:`multiprocessing.Pool`
+            for details.
+        initargs (None or tuple): Refer to :class:`multiprocessing.Pool` for
+            details.
+        chunksize (int): Refer to :class:`multiprocessing.Pool` for details.
+        bar_width (int): Width of progress bar.
+        skip_first (bool): Whether to skip the first sample for each worker
+            when estimating fps, since the initialization step may take
+            longer.
+        file (file object): The file handle the progress bar writes to.
+            Defaults to ``sys.stdout``.
+
+    Returns:
+        list: The task results.
+    """
+    assert isinstance(args, tuple)
+    for arg in args:
+        assert isinstance(arg, Iterable)
+    assert len(set([len(arg)
+                    for arg in args])) == 1, 'args must have same length'
+    task_num = len(args[0])
+    tasks = zip(*args)
+
+    pool = init_pool(nproc, initializer, initargs)
+    start = not skip_first
+    task_num -= nproc * chunksize * int(skip_first)
+    prog_bar = ProgressBar(task_num, bar_width, start, file=file)
+    results = []
+    gen = pool.starmap(func, tasks, chunksize)
+    for result in gen:
+        results.append(result)
+        if skip_first:
+            if len(results) < nproc * chunksize:
+                continue
+            elif len(results) == nproc * chunksize:
+                prog_bar.start()
+                continue
+        prog_bar.update()
+    prog_bar.file.write('\n')
+    pool.close()
+    pool.join()
+    return results
diff --git a/tests/data/preparer/dummy/metafile.yml b/tests/data/preparer/dummy/metafile.yml
deleted file mode 100644
index 7706ef53..00000000
--- a/tests/data/preparer/dummy/metafile.yml
+++ /dev/null
@@ -1,24 +0,0 @@
-Name: Dummy Dataset
-Paper:
-  Title: Dummy Dataset
-  URL: https://github.com/open-mmlab/mmocr
-  Venue: MMOCR
-  Year: 2022
-  BibTeX: ''
-Data:
-  Website: https://github.com/open-mmlab/mmocr
-  Language:
-    - English
-    - Chinese
-  Scene:
-    - Natural Scene
-  Granularity:
-    - Word
-  Tasks:
-    - textdet
-    - textrecog
-    - textspotting
-  License:
-    Type: CC BY 4.0
-    Link: https://creativecommons.org/licenses/by/4.0/
-  Format: .txt
diff --git a/tests/data/preparer/dummy/textdet.py b/tests/data/preparer/dummy/textdet.py
deleted file mode 100644
index 2fa11b20..00000000
--- a/tests/data/preparer/dummy/textdet.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-data_root = 'tests/data/preparer/dummy'
-cache_path = 'tests/data/preparer/dummy'
diff --git a/tests/test_datasets/test_preparers/test_config_generators/test_textdet_config_generator.py b/tests/test_datasets/test_preparers/test_config_generators/test_textdet_config_generator.py
new file mode 100644
index 00000000..988e9a58
--- /dev/null
+++ b/tests/test_datasets/test_preparers/test_config_generators/test_textdet_config_generator.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
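+# TextDetConfigGenerator writes a base dataset config to
+# <config_path>/textdet/_base_/datasets/<dataset_name>.py, prefixing variable
+# names with the dataset name and any non-empty dataset_postfix; duplicated
+# postfixes among the annotations raise a ValueError, as exercised below.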
+import os.path as osp +import tempfile +import unittest + +from mmocr.datasets.preparers import TextDetConfigGenerator + + +class TestTextDetConfigGenerator(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + + def test_textdet_config_generator(self): + config_generator = TextDetConfigGenerator( + data_root=self.root.name, + dataset_name='dummy', + train_anns=[ + dict(ann_file='textdet_train.json', dataset_postfix='') + ], + val_anns=[], + test_anns=[ + dict(ann_file='textdet_test.json', dataset_postfix='fake') + ], + config_path=self.root.name, + ) + cfg_path = osp.join(self.root.name, 'textdet', '_base_', 'datasets', + 'dummy.py') + config_generator() + self.assertTrue(osp.exists(cfg_path)) + f = open(cfg_path, 'r') + lines = ''.join(f.readlines()) + + self.assertEquals( + lines, (f"dummy_textdet_data_root = '{self.root.name}'\n" + '\n' + 'dummy_textdet_train = dict(\n' + " type='OCRDataset',\n" + ' data_root=dummy_textdet_data_root,\n' + " ann_file='textdet_train.json',\n" + ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' + ' pipeline=None)\n' + '\n' + 'dummy_fake_textdet_test = dict(\n' + " type='OCRDataset',\n" + ' data_root=dummy_textdet_data_root,\n' + " ann_file='textdet_test.json',\n" + ' test_mode=True,\n' + ' pipeline=None)\n')) + with self.assertRaises(ValueError): + TextDetConfigGenerator( + data_root=self.root.name, + dataset_name='dummy', + train_anns=[ + dict(ann_file='textdet_train.json', dataset_postfix='1'), + dict(ann_file='textdet_train_1.json', dataset_postfix='1') + ], + config_path=self.root.name, + ) diff --git a/tests/test_datasets/test_preparers/test_config_generators/test_textrecog_config_generator.py b/tests/test_datasets/test_preparers/test_config_generators/test_textrecog_config_generator.py new file mode 100644 index 00000000..7a5cc83d --- /dev/null +++ b/tests/test_datasets/test_preparers/test_config_generators/test_textrecog_config_generator.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
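+# Mirrors the textdet generator test above: configs land under
+# textrecog/_base_/datasets/, and the generated train dataset carries no
+# filter_cfg, unlike its textdet counterpart.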
+import os.path as osp +import tempfile +import unittest + +from mmocr.datasets.preparers import TextRecogConfigGenerator + + +class TestTextRecogConfigGenerator(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + + def test_textrecog_config_generator(self): + config_generator = TextRecogConfigGenerator( + data_root=self.root.name, + dataset_name='dummy', + train_anns=[ + dict(ann_file='textrecog_train.json', dataset_postfix='') + ], + val_anns=[], + test_anns=[ + dict(ann_file='textrecog_test.json', dataset_postfix='fake') + ], + config_path=self.root.name, + ) + cfg_path = osp.join(self.root.name, 'textrecog', '_base_', 'datasets', + 'dummy.py') + config_generator() + self.assertTrue(osp.exists(cfg_path)) + f = open(cfg_path, 'r') + lines = ''.join(f.readlines()) + + self.assertEquals(lines, + (f"dummy_textrecog_data_root = '{self.root.name}'\n" + '\n' + 'dummy_textrecog_train = dict(\n' + " type='OCRDataset',\n" + ' data_root=dummy_textrecog_data_root,\n' + " ann_file='textrecog_train.json',\n" + ' pipeline=None)\n' + '\n' + 'dummy_fake_textrecog_test = dict(\n' + " type='OCRDataset',\n" + ' data_root=dummy_textrecog_data_root,\n' + " ann_file='textrecog_test.json',\n" + ' test_mode=True,\n' + ' pipeline=None)\n')) + with self.assertRaises(ValueError): + TextRecogConfigGenerator( + data_root=self.root.name, + dataset_name='dummy', + train_anns=[ + dict(ann_file='textrecog_train.json', dataset_postfix='1'), + dict( + ann_file='textrecog_train_1.json', dataset_postfix='1') + ], + config_path=self.root.name, + ) diff --git a/tests/test_datasets/test_preparers/test_config_generators/test_textspotting_config_generator.py b/tests/test_datasets/test_preparers/test_config_generators/test_textspotting_config_generator.py new file mode 100644 index 00000000..cab434cf --- /dev/null +++ b/tests/test_datasets/test_preparers/test_config_generators/test_textspotting_config_generator.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
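+# The textspotting generator behaves like the textdet one (including
+# filter_cfg on the train dataset) but writes under the textspotting/ config
+# tree.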
+import os.path as osp +import tempfile +import unittest + +from mmocr.datasets.preparers import TextSpottingConfigGenerator + + +class TestTextSpottingConfigGenerator(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + + def test_textspotting_config_generator(self): + config_generator = TextSpottingConfigGenerator( + data_root=self.root.name, + dataset_name='dummy', + train_anns=[ + dict(ann_file='textspotting_train.json', dataset_postfix='') + ], + val_anns=[], + test_anns=[ + dict( + ann_file='textspotting_test.json', dataset_postfix='fake') + ], + config_path=self.root.name, + ) + cfg_path = osp.join(self.root.name, 'textspotting', '_base_', + 'datasets', 'dummy.py') + config_generator() + self.assertTrue(osp.exists(cfg_path)) + f = open(cfg_path, 'r') + lines = ''.join(f.readlines()) + + self.assertEquals( + lines, (f"dummy_textspotting_data_root = '{self.root.name}'\n" + '\n' + 'dummy_textspotting_train = dict(\n' + " type='OCRDataset',\n" + ' data_root=dummy_textspotting_data_root,\n' + " ann_file='textspotting_train.json',\n" + ' filter_cfg=dict(filter_empty_gt=True, min_size=32),\n' + ' pipeline=None)\n' + '\n' + 'dummy_fake_textspotting_test = dict(\n' + " type='OCRDataset',\n" + ' data_root=dummy_textspotting_data_root,\n' + " ann_file='textspotting_test.json',\n" + ' test_mode=True,\n' + ' pipeline=None)\n')) + with self.assertRaises(ValueError): + TextSpottingConfigGenerator( + data_root=self.root.name, + dataset_name='dummy', + train_anns=[ + dict( + ann_file='textspotting_train.json', + dataset_postfix='1'), + dict( + ann_file='textspotting_train_1.json', + dataset_postfix='1') + ], + config_path=self.root.name, + ) diff --git a/tests/test_datasets/test_preparers/test_data_preparer.py b/tests/test_datasets/test_preparers/test_data_preparer.py index c531db6f..59ee0af1 100644 --- a/tests/test_datasets/test_preparers/test_data_preparer.py +++ b/tests/test_datasets/test_preparers/test_data_preparer.py @@ -1,15 +1,60 @@ # Copyright (c) OpenMMLab. All rights reserved. 
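+# A no-op component is registered in every preparer registry below so that
+# DatasetPreparer.from_file can be driven end-to-end from a config written on
+# the fly, without touching any real data.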
+import os.path as osp import unittest -from mmocr.datasets.preparers.data_preparer import DatasetPreparer +from mmengine import Config + +from mmocr.datasets.preparers import DatasetPreparer +from mmocr.datasets.preparers.data_preparer import (CFG_GENERATORS, + DATA_DUMPERS, + DATA_GATHERERS, + DATA_OBTAINERS, + DATA_PACKERS, DATA_PARSERS) + + +class Fake: + + def __init__(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + return None, None + + +DATA_OBTAINERS.register_module(module=Fake) +DATA_GATHERERS.register_module(module=Fake) +DATA_PARSERS.register_module(module=Fake) +DATA_DUMPERS.register_module(module=Fake) +DATA_PACKERS.register_module(module=Fake) +CFG_GENERATORS.register_module(module=Fake) class TestDataPreparer(unittest.TestCase): - def setUp(self) -> None: - self.cfg_path = 'tests/data/preparer' - self.dataset_name = 'dummy' + def _create_config(self): + cfg_path = 'config.py' + cfg = '' + cfg += "data_root = ''\n" + cfg += 'train_preparer=dict(\n' + cfg += ' obtainer=dict(type="Fake"),\n' + cfg += ' gatherer=dict(type="Fake"),\n' + cfg += ' parser=dict(type="Fake"),\n' + cfg += ' packer=dict(type="Fake"),\n' + cfg += ' dumper=dict(type="Fake"),\n' + cfg += ')\n' + cfg += 'test_preparer=dict(\n' + cfg += ' obtainer=dict(type="Fake"),\n' + cfg += ')\n' + cfg += 'cfg_generator=dict(type="Fake")\n' + cfg += f"delete = ['{cfg_path}']\n" + + with open(cfg_path, 'w') as f: + f.write(cfg) + return cfg_path def test_dataset_preparer(self): - preparer = DatasetPreparer(self.cfg_path, self.dataset_name, 'textdet') - preparer() + cfg_path = self._create_config() + cfg = Config.fromfile(cfg_path) + preparer = DatasetPreparer.from_file(cfg) + preparer.run() + self.assertFalse(osp.exists(cfg_path)) diff --git a/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py b/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py index 57e9a2f4..fe6a6118 100644 --- a/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py +++ b/tests/test_datasets/test_preparers/test_dumpers/test_dumpers.py @@ -21,8 +21,8 @@ class TestDumpers(unittest.TestCase): task_name='textdet', category=[dict(id=0, name='text')])) - dumper = JsonDumper(task) - dumper.dump(fake_data, self.root.name, split) + dumper = JsonDumper(task, split, self.root.name) + dumper.dump(fake_data) with open(osp.join(self.root.name, f'{task}_{split}.json'), 'r') as f: data = json.load(f) self.assertEqual(data, fake_data) @@ -31,8 +31,8 @@ class TestDumpers(unittest.TestCase): task, split = 'kie', 'train' fake_data = ['test1', 'test2'] - dumper = WildreceiptOpensetDumper(task) - dumper.dump(fake_data, self.root.name, split) + dumper = WildreceiptOpensetDumper(task, split, self.root.name) + dumper.dump(fake_data) with open(osp.join(self.root.name, f'openset_{split}.txt'), 'r') as f: data = f.read().splitlines() self.assertEqual(data, fake_data) diff --git a/tests/test_datasets/test_preparers/test_gatherers/test_mono_gatherer.py b/tests/test_datasets/test_preparers/test_gatherers/test_mono_gatherer.py new file mode 100644 index 00000000..84818431 --- /dev/null +++ b/tests/test_datasets/test_preparers/test_gatherers/test_mono_gatherer.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
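+# MonoGatherer covers datasets with a single annotation file per split: it
+# returns the image directory and the full path to that annotation file, both
+# resolved against data_root.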
+import os.path as osp
+import unittest
+
+from mmocr.datasets.preparers.gatherers import MonoGatherer
+
+
+class TestMonoGatherer(unittest.TestCase):
+
+    def test_mono_text_gatherer(self):
+        data_root = 'dummy'
+        img_dir = 'dummy_img'
+        ann_dir = 'dummy_ann'
+        ann_name = 'dummy_ann.json'
+        split = 'train'
+        gatherer = MonoGatherer(
+            data_root=data_root,
+            img_dir=img_dir,
+            ann_dir=ann_dir,
+            ann_name=ann_name,
+            split=split)
+        gather_img_dir, ann_path = gatherer()
+        self.assertEqual(gather_img_dir, osp.join(data_root, img_dir))
+        self.assertEqual(ann_path, osp.join(data_root, ann_dir, ann_name))
diff --git a/tests/test_datasets/test_preparers/test_gatherers/test_pair_gatherer.py b/tests/test_datasets/test_preparers/test_gatherers/test_pair_gatherer.py
new file mode 100644
index 00000000..7a9d448b
--- /dev/null
+++ b/tests/test_datasets/test_preparers/test_gatherers/test_pair_gatherer.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import tempfile
+import unittest
+
+import cv2
+import numpy as np
+
+from mmocr.datasets.preparers.gatherers import PairGatherer
+
+
+class TestPairGatherer(unittest.TestCase):
+
+    def test_pair_text_gatherer(self):
+        root = tempfile.TemporaryDirectory()
+        data_root = root.name
+        img_dir = 'dummy_img'
+        ann_dir = 'dummy_ann'
+        split = 'train'
+        img = np.random.randint(0, 100, size=(100, 100, 3))
+        os.makedirs(osp.join(data_root, img_dir))
+        os.makedirs(osp.join(data_root, ann_dir))
+        for i in range(10):
+            cv2.imwrite(osp.join(data_root, img_dir, f'img_{i}.jpg'), img)
+            f = open(osp.join(data_root, ann_dir, f'img_{i}.txt'), 'w')
+            f.close()
+        f = open(osp.join(data_root, ann_dir, 'img_10.mmocr'), 'w')
+        f.close()
+        gatherer = PairGatherer(
+            data_root=data_root,
+            img_dir=img_dir,
+            ann_dir=ann_dir,
+            split=split,
+            img_suffixes=['.jpg'],
+            rule=[r'img_(\d+)\.([jJ][pP][gG])', r'img_\1.txt'])
+        img_list, ann_list = gatherer()
+        self.assertEqual(len(img_list), 10)
+        self.assertEqual(len(ann_list), 10)
+        self.assertNotIn(
+            osp.join(data_root, ann_dir, 'img_10.mmocr'), ann_list)
+        root.cleanup()
diff --git a/tests/test_datasets/test_preparers/test_packers/test_textdet_packer.py b/tests/test_datasets/test_preparers/test_packers/test_textdet_packer.py
new file mode 100644
index 00000000..5e70d87d
--- /dev/null
+++ b/tests/test_datasets/test_preparers/test_packers/test_textdet_packer.py
@@ -0,0 +1,62 @@
+# Copyright (c) OpenMMLab. All rights reserved.
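+# TextDetPacker converts (img_path, instances) samples into OCRDataset
+# textdet entries: polygons are kept (or derived from 'box'), axis-aligned
+# bboxes are computed from the polygons, and bbox_label is always 0 ('text').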
+import os.path as osp +import tempfile +import unittest + +import cv2 +import numpy as np + +from mmocr.datasets.preparers import TextDetPacker + + +class TestTextDetPacker(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + img = np.random.randint(0, 255, (30, 20, 3), dtype=np.uint8) + cv2.imwrite(osp.join(self.root.name, 'test_img.jpg'), img) + self.instance = [{ + 'poly': [0, 0, 0, 10, 10, 20, 20, 0], + 'ignore': False + }, { + 'box': [0, 0, 10, 20], + 'ignore': False + }] + self.img_path = osp.join(self.root.name, 'test_img.jpg') + self.sample = (self.img_path, self.instance) + + def test_pack_instance(self): + packer = TextDetPacker(data_root=self.root.name, split='test') + instance = packer.pack_instance(self.sample) + self.assertEquals(instance['img_path'], 'test_img.jpg') + self.assertEquals(instance['height'], 30) + self.assertEquals(instance['width'], 20) + self.assertEquals(instance['instances'][0]['polygon'], + [0, 0, 0, 10, 10, 20, 20, 0]) + self.assertEquals(instance['instances'][0]['bbox'], + [float(x) for x in [0, 0, 20, 20]]) + self.assertEquals(instance['instances'][0]['bbox_label'], 0) + self.assertEquals(instance['instances'][0]['ignore'], False) + self.assertEquals(instance['instances'][1]['polygon'], + [0.0, 0.0, 10.0, 0.0, 10.0, 20.0, 0.0, 20.0]) + self.assertEquals(instance['instances'][1]['bbox'], + [float(x) for x in [0, 0, 10, 20]]) + self.assertEquals(instance['instances'][1]['bbox_label'], 0) + self.assertEquals(instance['instances'][1]['ignore'], False) + + def test_add_meta(self): + packer = TextDetPacker(data_root=self.root.name, split='test') + instance = packer.pack_instance(self.sample) + meta = packer.add_meta(instance) + self.assertDictEqual( + meta['metainfo'], { + 'dataset_type': 'TextDetDataset', + 'task_name': 'textdet', + 'category': [{ + 'id': 0, + 'name': 'text' + }] + }) + + def tearDown(self) -> None: + self.root.cleanup() diff --git a/tests/test_datasets/test_preparers/test_packers/test_textrecog_packer.py b/tests/test_datasets/test_preparers/test_packers/test_textrecog_packer.py new file mode 100644 index 00000000..54e68415 --- /dev/null +++ b/tests/test_datasets/test_preparers/test_packers/test_textrecog_packer.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
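+# TextRecogPacker only rewrites image paths to textrecog_imgs/<split>/, while
+# TextRecogCropPacker additionally crops one sub-image per text instance and
+# emits a separate sample for each crop.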
+import os.path as osp +import tempfile +import unittest + +import cv2 +import numpy as np + +from mmocr.datasets.preparers import TextRecogCropPacker, TextRecogPacker + + +class TestTextRecogPacker(unittest.TestCase): + + def test_pack_instance(self): + + packer = TextRecogPacker(data_root='', split='test') + sample = ('test.jpg', 'text') + results = packer.pack_instance(sample) + self.assertDictEqual( + results, + dict( + img_path=osp.join('textrecog_imgs', 'test', 'test.jpg'), + instances=[dict(text='text')])) + + def test_add_meta(self): + packer = TextRecogPacker(data_root='', split='test') + sample = [dict(img_path='test.jpg', instances=[dict(text='text')])] + results = packer.add_meta(sample) + self.assertDictEqual( + results, + dict( + metainfo=dict( + dataset_type='TextRecogDataset', task_name='textrecog'), + data_list=sample)) + + +class TestTextRecogCropPacker(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + img = np.random.randint(0, 255, (30, 40, 3), dtype=np.uint8) + cv2.imwrite(osp.join(self.root.name, 'test_img.jpg'), img) + self.instance = [{ + 'poly': [0, 0, 0, 10, 10, 20, 20, 0], + 'ignore': False, + 'text': 'text1' + }, { + 'box': [0, 0, 10, 20], + 'ignore': False, + 'text': 'text2' + }] + self.img_path = osp.join(self.root.name, 'test_img.jpg') + self.sample = (self.img_path, self.instance) + + def test_pack_instance(self): + packer = TextRecogCropPacker(data_root=self.root.name, split='test') + instance = packer.pack_instance(self.sample) + self.assertListEqual(instance, [ + dict( + img_path=osp.join('textrecog_imgs', 'test', 'test_img_0.jpg'), + instances=[dict(text='text1')]), + dict( + img_path=osp.join('textrecog_imgs', 'test', 'test_img_1.jpg'), + instances=[dict(text='text2')]) + ]) + + def test_add_meta(self): + packer = TextRecogCropPacker(data_root=self.root.name, split='test') + instance = packer.pack_instance(self.sample) + results = packer.add_meta([instance]) + self.assertDictEqual( + results, + dict( + metainfo=dict( + dataset_type='TextRecogDataset', task_name='textrecog'), + data_list=instance)) + + def tearDown(self) -> None: + self.root.cleanup() diff --git a/tests/test_datasets/test_preparers/test_packers/test_textspotting_packer.py b/tests/test_datasets/test_preparers/test_packers/test_textspotting_packer.py new file mode 100644 index 00000000..e3d4a85d --- /dev/null +++ b/tests/test_datasets/test_preparers/test_packers/test_textspotting_packer.py @@ -0,0 +1,69 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
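+# TextSpottingPacker packs detection-style entries that additionally carry
+# the ground-truth transcription of every polygon/bbox.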
+import os.path as osp +import tempfile +import unittest + +import cv2 +import numpy as np + +from mmocr.datasets.preparers import TextSpottingPacker + + +class TestTextSpottingPacker(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + img = np.random.randint(0, 255, (30, 20, 3), dtype=np.uint8) + cv2.imwrite(osp.join(self.root.name, 'test_img.jpg'), img) + self.instance = [{ + 'poly': [0, 0, 0, 10, 10, 20, 20, 0], + 'ignore': False, + 'text': 'text1' + }, { + 'box': [0, 0, 10, 20], + 'ignore': False, + 'text': 'text2' + }] + self.img_path = osp.join(self.root.name, 'test_img.jpg') + self.sample = (self.img_path, self.instance) + + def test_pack_instance(self): + packer = TextSpottingPacker(data_root=self.root.name, split='test') + instance = packer.pack_instance(self.sample) + self.assertEquals(instance['img_path'], 'test_img.jpg') + self.assertEquals(instance['height'], 30) + self.assertEquals(instance['width'], 20) + self.assertEquals(instance['instances'][0]['polygon'], + [0, 0, 0, 10, 10, 20, 20, 0]) + self.assertEquals(instance['instances'][0]['bbox'], + [float(x) for x in [0, 0, 20, 20]]) + self.assertEquals(instance['instances'][0]['bbox_label'], 0) + self.assertEquals(instance['instances'][0]['ignore'], False) + self.assertEquals(instance['instances'][0]['text'], 'text1') + self.assertEquals(instance['instances'][1]['polygon'], + [0.0, 0.0, 10.0, 0.0, 10.0, 20.0, 0.0, 20.0]) + self.assertEquals(instance['instances'][1]['bbox'], + [float(x) for x in [0, 0, 10, 20]]) + self.assertEquals(instance['instances'][1]['bbox_label'], 0) + self.assertEquals(instance['instances'][1]['ignore'], False) + self.assertEquals(instance['instances'][1]['text'], 'text2') + + def test_add_meta(self): + packer = TextSpottingPacker(data_root=self.root.name, split='test') + instance = packer.pack_instance(self.sample) + meta = packer.add_meta(instance) + self.assertDictEqual( + meta, { + 'metainfo': { + 'dataset_type': 'TextSpottingDataset', + 'task_name': 'textspotting', + 'category': [{ + 'id': 0, + 'name': 'text' + }] + }, + 'data_list': instance + }) + + def tearDown(self) -> None: + self.root.cleanup() diff --git a/tests/test_datasets/test_preparers/test_parsers/test_funsd_parser.py b/tests/test_datasets/test_preparers/test_parsers/test_funsd_parser.py new file mode 100644 index 00000000..078a0e84 --- /dev/null +++ b/tests/test_datasets/test_preparers/test_parsers/test_funsd_parser.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
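+# FUNSD stores word-level boxes under the 'form' key; the parser turns each
+# word box into a quadrilateral polygon and marks words with empty text as
+# ignored.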
+import json +import os.path as osp +import tempfile +import unittest + +from mmocr.datasets.preparers import FUNSDTextDetAnnParser + + +class TestFUNSDTextDetAnnParser(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + + def _create_fake_sample(self): + fake_sample = { + 'form': [{ + 'box': [91, 279, 123, 294], + 'text': 'Date:', + 'label': 'question', + 'words': [{ + 'box': [91, 279, 123, 294], + 'text': 'Date:' + }], + 'linking': [[0, 16]], + 'id': 0 + }, { + 'box': [92, 310, 130, 324], + 'text': 'From:', + 'label': 'question', + 'words': [{ + 'box': [92, 310, 130, 324], + 'text': '' + }], + 'linking': [[1, 22]], + 'id': 1 + }] + } + ann_path = osp.join(self.root.name, 'funsd.json') + with open(ann_path, 'w') as f: + json.dump(fake_sample, f) + return ann_path + + def test_textdet_parsers(self): + ann_path = self._create_fake_sample() + parser = FUNSDTextDetAnnParser(split='train') + _, instances = parser.parse_file('fake.jpg', ann_path) + self.assertEqual(len(instances), 2) + self.assertEqual(instances[0]['text'], 'Date:') + self.assertEqual(instances[0]['ignore'], False) + self.assertEqual(instances[1]['ignore'], True) + self.assertListEqual(instances[0]['poly'], + [91, 279, 123, 279, 123, 294, 91, 294]) + + def tearDown(self) -> None: + self.root.cleanup() diff --git a/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py index 02a9848d..edcdfba2 100644 --- a/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py +++ b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py @@ -35,9 +35,9 @@ class TestIC15Parsers(unittest.TestCase): def test_textdet_parsers(self): file = self._create_dummy_ic15_det() - parser = ICDARTxtTextDetAnnParser() + parser = ICDARTxtTextDetAnnParser(split='train') - img, instances = parser.parse_file(file, 'train') + img, instances = parser.parse_file(*file) self.assertEqual(img, file[0]) self.assertEqual(len(instances), 4) self.assertIn('poly', instances[0]) @@ -48,12 +48,15 @@ class TestIC15Parsers(unittest.TestCase): self.assertEqual(instances[3]['text'], '100,000') def test_textrecog_parsers(self): - parser = ICDARTxtTextRecogAnnParser() + parser = ICDARTxtTextRecogAnnParser(split='train') file = self._create_dummy_ic15_recog() - samples = parser.parse_files(file, 'train') + samples = parser.parse_files(self.root.name, file) self.assertEqual(len(samples), 4) img, text = samples[0] - self.assertEqual(img, 'word_1.png') + self.assertEqual(img, osp.join(self.root.name, 'word_1.png')) self.assertEqual(text, 'Genaxis Theatre') img, text = samples[3] self.assertEqual(text, '62-,03') + + def tearDown(self) -> None: + self.root.cleanup() diff --git a/tests/test_datasets/test_preparers/test_parsers/test_naf_parser.py b/tests/test_datasets/test_preparers/test_parsers/test_naf_parser.py new file mode 100644 index 00000000..2d70ff20 --- /dev/null +++ b/tests/test_datasets/test_preparers/test_parsers/test_naf_parser.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import json +import os.path as osp +import tempfile +import unittest + +from mmocr.datasets.preparers import NAFAnnParser + + +class TestNAFAnnParser(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + + def _create_fake_sample(self): + fake_sample = { + 'fieldBBs': [{ + 'poly_points': [[1357, 322], [1636, 324], [1636, 402], + [1357, 400]], + 'type': + 'field', + 'id': + 'f0', + 'isBlank': + 1 + }, { + 'poly_points': [[1831, 352], [1908, 353], [1908, 427], + [1830, 427]], + 'type': + 'blank', + 'id': + 'f1', + 'isBlank': + 1 + }], + 'textBBs': [{ + 'poly_points': [[1388, 80], [2003, 82], [2003, 133], + [1388, 132]], + 'type': + 'text', + 'id': + 't0' + }, { + 'poly_points': [[1065, 366], [1320, 366], [1320, 413], + [1065, 412]], + 'type': + 'text', + 'id': + 't1' + }], + 'imageFilename': + '004173988_00005.jpg', + 'transcriptions': { + 'f0': '7/24', + 'f1': '9', + 't0': 'REGISTRY RETURN RECEIPT.', + 't1': 'Date of delivery', + } + } + ann_path = osp.join(self.root.name, 'naf.json') + with open(ann_path, 'w') as f: + json.dump(fake_sample, f) + return ann_path + + def test_parsers(self): + ann_path = self._create_fake_sample() + parser = NAFAnnParser(split='train') + _, instances = parser.parse_file('fake.jpg', ann_path) + self.assertEqual(len(instances), 3) + self.assertEqual(instances[0]['ignore'], False) + self.assertEqual(instances[1]['ignore'], False) + self.assertListEqual(instances[2]['poly'], + [1357, 322, 1636, 324, 1636, 402, 1357, 400]) + + parser = NAFAnnParser(split='train', det=False) + _, instances = parser.parse_file('fake.jpg', ann_path) + self.assertEqual(len(instances), 2) + self.assertEqual(instances[0]['text'], '7/24') + + def tearDown(self) -> None: + self.root.cleanup() diff --git a/tests/test_datasets/test_preparers/test_parsers/test_sroie_parser.py b/tests/test_datasets/test_preparers/test_parsers/test_sroie_parser.py new file mode 100644 index 00000000..93c0bc36 --- /dev/null +++ b/tests/test_datasets/test_preparers/test_parsers/test_sroie_parser.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
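+# SROIE annotations are ICDAR-style comma-separated lines: eight polygon
+# coordinates followed by the transcription, with '###' denoting ignored
+# instances.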
+import os.path as osp +import tempfile +import unittest + +from mmocr.datasets.preparers import SROIETextDetAnnParser +from mmocr.utils import list_to_file + + +class TestSROIETextDetAnnParser(unittest.TestCase): + + def setUp(self) -> None: + self.root = tempfile.TemporaryDirectory() + + def _create_dummy_sroie_det(self): + fake_anno = [ + '114,54,326,54,326,92,114,92,TAN CHAY YEE', + '60,119,300,119,300,136,60,136,###', + '100,139,267,139,267,162,100,162,ROC NO: 538358-H', + '83,163,277,163,277,183,83,183,NO 2 & 4, JALAN BAYU 4,', + ] + ann_file = osp.join(self.root.name, 'sroie_det.txt') + list_to_file(ann_file, fake_anno) + return (osp.join(self.root.name, 'sroie_det.jpg'), ann_file) + + def test_textdet_parsers(self): + file = self._create_dummy_sroie_det() + parser = SROIETextDetAnnParser(split='train') + + img, instances = parser.parse_file(*file) + self.assertEqual(img, file[0]) + self.assertEqual(len(instances), 4) + self.assertIn('poly', instances[0]) + self.assertIn('text', instances[0]) + self.assertIn('ignore', instances[0]) + self.assertEqual(instances[0]['text'], 'TAN CHAY YEE') + self.assertEqual(instances[1]['ignore'], True) + self.assertEqual(instances[3]['text'], 'NO 2 & 4, JALAN BAYU 4,') + self.assertListEqual(instances[2]['poly'], + [100, 139, 267, 139, 267, 162, 100, 162]) + + def tearDown(self) -> None: + self.root.cleanup() diff --git a/tests/test_datasets/test_preparers/test_parsers/test_svt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_svt_parsers.py index 03a238a5..1f7ddb65 100644 --- a/tests/test_datasets/test_preparers/test_parsers/test_svt_parsers.py +++ b/tests/test_datasets/test_preparers/test_parsers/test_svt_parsers.py @@ -38,11 +38,11 @@ class TestSVTParsers(unittest.TestCase): return ann_file def test_textdet_parsers(self): - parser = SVTTextDetAnnParser(self.root.name) + parser = SVTTextDetAnnParser(split='train') file = self._create_dummy_svt_det() - samples = parser.parse_files(file, 'train') + samples = parser.parse_files(self.root.name, file) self.assertEqual(len(samples), 1) - self.assertEqual(osp.basename(samples[0][0]), 'test.jpg') + self.assertEqual(samples[0][0], osp.join(self.root.name, 'test.jpg')) self.assertEqual(len(samples[0][1]), 3) self.assertEqual(samples[0][1][0]['text'], 'living') self.assertEqual(samples[0][1][1]['text'], 'room') @@ -50,3 +50,6 @@ class TestSVTParsers(unittest.TestCase): self.assertEqual(samples[0][1][0]['poly'], [375, 253, 611, 253, 611, 328, 375, 328]) self.assertEqual(samples[0][1][0]['ignore'], False) + + def tearDown(self) -> None: + self.root.cleanup() diff --git a/tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py index 713d7fc7..5f933e04 100644 --- a/tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py +++ b/tests/test_datasets/test_preparers/test_parsers/test_tt_parsers.py @@ -24,9 +24,9 @@ class TestTTParsers(unittest.TestCase): return (osp.join(self.root.name, 'tt_det.jpg'), ann_file) def test_textdet_parsers(self): - parser = TotaltextTextDetAnnParser(self.root.name) + parser = TotaltextTextDetAnnParser(split='train') file = self._create_dummy_tt_det() - img, instances = parser.parse_file(file, 'train') + img, instances = parser.parse_file(*file) self.assertEqual(img, file[0]) self.assertEqual(len(instances), 3) self.assertIn('poly', instances[0]) @@ -34,3 +34,6 @@ class TestTTParsers(unittest.TestCase): self.assertIn('ignore', instances[0]) self.assertEqual(instances[0]['text'], 
                         'PERUNDING')
         self.assertEqual(instances[2]['ignore'], True)
+
+    def tearDown(self) -> None:
+        self.root.cleanup()
diff --git a/tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py
index f4e5510d..045333d4 100644
--- a/tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py
+++ b/tests/test_datasets/test_preparers/test_parsers/test_wildreceipt_parsers.py
@@ -39,8 +39,8 @@ class TestWildReceiptParsers(unittest.TestCase):
         list_to_file(self.anno, fake_sample)
 
     def test_textdet_parsers(self):
-        parser = WildreceiptTextDetAnnParser(self.root.name)
-        samples = parser.parse_files(self.anno, 'train')
+        parser = WildreceiptTextDetAnnParser(split='train')
+        samples = parser.parse_files(self.root.name, self.anno)
         self.assertEqual(len(samples), 1)
         self.assertEqual(osp.basename(samples[0][0]), 'test.jpg')
         instances = samples[0][1]
@@ -52,6 +52,9 @@ class TestWildReceiptParsers(unittest.TestCase):
         self.assertEqual(instances[1]['ignore'], True)
 
     def test_kie_parsers(self):
-        parser = WildreceiptKIEAnnParser(self.root.name)
-        samples = parser.parse_files(self.anno, 'train')
+        parser = WildreceiptKIEAnnParser(split='train')
+        samples = parser.parse_files(self.root.name, self.anno)
         self.assertEqual(len(samples), 1)
+
+    def tearDown(self) -> None:
+        self.root.cleanup()
diff --git a/tests/test_utils/test_processing.py b/tests/test_utils/test_processing.py
new file mode 100644
index 00000000..44953f7e
--- /dev/null
+++ b/tests/test_utils/test_processing.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+from mmocr.utils import track_parallel_progress_multi_args
+
+
+def func(a, b):
+    return a + b
+
+
+class TestProcessing(unittest.TestCase):
+
+    def test_track_parallel_progress_multi_args(self):
+
+        args = ([1, 2, 3], [4, 5, 6])
+        results = track_parallel_progress_multi_args(func, args, nproc=1)
+        self.assertEqual(results, [5, 7, 9])
+
+        results = track_parallel_progress_multi_args(func, args, nproc=2)
+        self.assertEqual(results, [5, 7, 9])
+
+        with self.assertRaises(AssertionError):
+            track_parallel_progress_multi_args(func, 1, nproc=1)
+
+        with self.assertRaises(AssertionError):
+            track_parallel_progress_multi_args(func, ([1, 2], 1), nproc=1)
+
+        with self.assertRaises(AssertionError):
+            track_parallel_progress_multi_args(
+                func, ([1, 2], [1, 2, 3]), nproc=1)
diff --git a/tools/dataset_converters/prepare_dataset.py b/tools/dataset_converters/prepare_dataset.py
index 632532ef..a075804c 100644
--- a/tools/dataset_converters/prepare_dataset.py
+++ b/tools/dataset_converters/prepare_dataset.py
@@ -1,8 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import argparse
 import os.path as osp
+import time
 import warnings
 
+from mmengine import Config
+
 from mmocr.datasets.preparers import DatasetPreparer
 
 
@@ -21,6 +24,11 @@ def parse_args():
         choices=['textdet', 'textrecog', 'textspotting', 'kie'],
         help='Task type. Options are "textdet", "textrecog", "textspotting"'
         ' and "kie".')
+    parser.add_argument(
+        '--splits',
+        default=['train', 'test', 'val'],
+        help='A list of the splits to prepare.',
+        nargs='+')
     parser.add_argument(
         '--overwrite-cfg',
         action='store_true',
@@ -36,6 +44,35 @@ def parse_args():
     return args
 
 
+def parse_meta(task: str, meta_path: str) -> None:
+    """Parse meta file.
+
+    Args:
+        task (str): Task name.
+        meta_path (str): Path to the meta file.
+    """
+    try:
+        meta = Config.fromfile(meta_path)
+    except FileNotFoundError:
+        return
+    assert task in meta['Data']['Tasks'], \
+        f'Task {task} not supported!'
+    # License related
+    if meta['Data']['License']['Type']:
+        print(f"\033[1;33;40mDataset Name: {meta['Name']}")
+        print(f"License Type: {meta['Data']['License']['Type']}")
+        print(f"License Link: {meta['Data']['License']['Link']}")
+        print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m")
+        print('\033[1;31;43mMMOCR does not own the dataset. By using this '
+              'dataset, you must accept the license provided by the owners '
+              'and cite the corresponding papers appropriately.')
+        print('If you do not agree with the above license, please cancel '
+              'the process immediately by pressing Ctrl+C. Otherwise, '
+              'you are deemed to have accepted the terms and conditions.'
+              '\033[0m')
+        for i in range(5):
+            print(f'{5-i}...')
+            time.sleep(1)
+
+
 def main():
     args = parse_args()
     for dataset in args.datasets:
@@ -43,13 +80,18 @@ def main():
             warnings.warn(f'{dataset} is not supported yet. Please check '
                           'dataset zoo for supported datasets.')
             continue
-        preparer = DatasetPreparer(
-            cfg_path=args.dataset_zoo_path,
-            dataset_name=dataset,
-            task=args.task,
-            nproc=args.nproc,
-            overwrite_cfg=args.overwrite_cfg)
-        preparer()
+        meta_path = osp.join(args.dataset_zoo_path, dataset, 'metafile.yml')
+        parse_meta(args.task, meta_path)
+        cfg_path = osp.join(args.dataset_zoo_path, dataset, args.task + '.py')
+        cfg = Config.fromfile(cfg_path)
+        if args.overwrite_cfg and cfg.get('config_generator',
+                                          None) is not None:
+            cfg.config_generator.overwrite = args.overwrite_cfg
+        cfg.nproc = args.nproc
+        cfg.task = args.task
+        cfg.dataset_name = dataset
+        preparer = DatasetPreparer.from_file(cfg)
+        preparer.run(args.splits)
 
 
 if __name__ == '__main__':
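
For a quick orientation, here is a minimal usage sketch of the new helper (the worker function and data are illustrative, not part of the patch; only the signature and starmap-based behavior of track_parallel_progress_multi_args shown above are assumed):

# Hypothetical worker; it must be a picklable, top-level function.
from mmocr.utils import track_parallel_progress_multi_args


def scale_point(point, factor):
    """Scale a 2D point by a per-task factor."""
    x, y = point
    return (x * factor, y * factor)


points = [(1, 2), (3, 4), (5, 6)]
factors = [2, 2, 10]
# `args` is a tuple of equal-length iterables; task i receives the i-th
# element of each iterable. Pool.starmap preserves task order, so results
# line up with the inputs.
results = track_parallel_progress_multi_args(
    scale_point, (points, factors), nproc=2)
assert results == [(2, 4), (6, 8), (50, 60)]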
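
Since prepare_dataset.py is now a thin wrapper around DatasetPreparer.from_file, the same flow can also be driven programmatically. A sketch mirroring main() above, assuming the usual dataset_zoo layout is in place (the dataset name, config path and nproc value are illustrative):

from mmengine import Config

from mmocr.datasets.preparers import DatasetPreparer

# Roughly equivalent to:
#   python tools/dataset_converters/prepare_dataset.py wildreceipt \
#       --task kie --splits train test
cfg = Config.fromfile('dataset_zoo/wildreceipt/kie.py')  # illustrative path
cfg.nproc = 4
cfg.task = 'kie'
cfg.dataset_name = 'wildreceipt'
preparer = DatasetPreparer.from_file(cfg)
preparer.run(['train', 'test'])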