mirror of https://github.com/open-mmlab/mmocr.git
154 lines
5.5 KiB
Python
154 lines
5.5 KiB
Python
# Copyright (c) OpenMMLab. All rights reserved.
|
|
import argparse
|
|
import os.path as osp
|
|
import time
|
|
import warnings
|
|
|
|
from mmengine import Config
|
|
|
|
from mmocr.datasets.preparers import DatasetPreparer
|
|
|
|
|
|
def parse_args():
    """Build the command line interface and parse ``sys.argv``.

    Returns:
        argparse.Namespace: Parsed options with the attributes ``datasets``,
        ``nproc``, ``task``, ``splits``, ``lmdb``, ``overwrite_cfg`` and
        ``dataset_zoo_path``.
    """
    cli = argparse.ArgumentParser(
        description='Preparing datasets used in MMOCR.')

    # (flags, keyword options) pairs, registered in the order in which they
    # should show up in the ``--help`` output.
    argument_table = [
        (('datasets', ),
         dict(
             nargs='+',
             help='A list of the dataset names that would like to prepare.')),
        (('--nproc', ),
         dict(type=int, default=4, help='Number of processes to run')),
        (('--task', ),
         dict(
             choices=['textdet', 'textrecog', 'textspotting', 'kie'],
             default='textdet',
             help='Task type. Options are "textdet", "textrecog", '
             '"textspotting" and "kie".')),
        (('--splits', ),
         dict(
             nargs='+',
             default=['train', 'test', 'val'],
             help='A list of the split that would like to prepare.')),
        (('--lmdb', ),
         dict(
             action='store_true',
             default=False,
             help='Whether to dump the textrecog dataset to LMDB format, '
             'It\'s a shortcut to force the dataset to be dumped in lmdb '
             'format. Applicable when --task=textrecog')),
        (('--overwrite-cfg', ),
         dict(
             action='store_true',
             default=False,
             help='Whether to overwrite the dataset config file if it '
             'already exists. If not specified, Dataset Preparer will not '
             'generate new config for datasets whose configs are already in '
             'base.')),
        (('--dataset-zoo-path', ),
         dict(
             default='./dataset_zoo',
             help='Path to dataset zoo config files.')),
    ]
    for flags, options in argument_table:
        cli.add_argument(*flags, **options)

    return cli.parse_args()
|
|
|
|
|
|
def parse_meta(task: str, meta_path: str) -> None:
    """Check task support and print the license notice from a meta file.

    When the meta file does not exist the function returns without doing
    anything. Otherwise the license details (if a license type is recorded)
    and a usage disclaimer are printed, followed by a five second countdown
    during which the user may abort with ctrl+c.

    Args:
        task (str): Task the dataset is prepared for. It must be listed in
            ``meta['Data']['Tasks']``.
        meta_path (str): Path to meta file.
    """
    try:
        meta = Config.fromfile(meta_path)
    except FileNotFoundError:
        # No meta file: nothing to validate and nothing to display.
        return

    supported_tasks = meta['Data']['Tasks']
    assert task in supported_tasks, f'Task {task} not supported!'

    # License related
    has_license = bool(meta['Data']['License']['Type'])
    if has_license:
        # The first line switches the terminal colour, the last one resets it.
        print(f"\033[1;33;40mDataset Name: {meta['Name']}")
        print(f"License Type: {meta['Data']['License']['Type']}")
        print(f"License Link: {meta['Data']['License']['Link']}")
        print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m")

    ownership_notice = (
        '\033[1;31;43mMMOCR does not own the dataset. Using this '
        'dataset you must accept the license provided by the owners, '
        'and cite the corresponding papers appropriately.')
    consent_notice = (
        'If you do not agree with the above license, please cancel '
        'the progress immediately by pressing ctrl+c. Otherwise, '
        'you are deemed to accept the terms and conditions.\033[0m')
    print(ownership_notice)
    print(consent_notice)

    # Give the user a few seconds to interrupt before any work starts.
    for i in range(5):
        print(f'{5-i}...')
        time.sleep(1)
|
|
|
|
|
|
def force_lmdb(cfg):
    """Rewrite a preparer config in place so that it produces LMDB output.

    Every configured split preparer gets its dumper type replaced by
    ``TextRecogLMDBDumper``, the generated dataset name receives an ``_lmdb``
    suffix, and the annotation files referenced by the config generator are
    switched to the ``.lmdb`` extension.

    Args:
        cfg (Config): Config object.

    Returns:
        Config: The same config object that was passed in, after
        modification.

    Raises:
        ValueError: If a configured split preparer has no dumper.
    """
    for split in ('train', 'val', 'test'):
        preparer_cfg = cfg.get(f'{split}_preparer')
        if not preparer_cfg:
            continue
        if preparer_cfg.get('dumper') is None:
            raise ValueError(
                f'{split} split does not come with a dumper, '
                'so most likely the annotations are MMOCR-ready and do '
                'not need any adaptation, and it '
                'cannot be dumped in LMDB format.')
        preparer_cfg.dumper['type'] = 'TextRecogLMDBDumper'

    generator_cfg = cfg.config_generator
    generator_cfg['dataset_name'] = f'{cfg.dataset_name}_lmdb'

    # Annotation file used when the config generator does not mention a
    # split at all; splits missing from this table fall back to an empty
    # list.
    fallback_files = {
        'train_anns': 'textrecog_train.lmdb',
        'test_anns': 'textrecog_test.lmdb',
    }
    for key in ('train_anns', 'val_anns', 'test_anns'):
        if key not in generator_cfg:
            fallback = fallback_files.get(key)
            generator_cfg[key] = (
                [dict(ann_file=fallback)] if fallback is not None else [])
            continue

        entries = generator_cfg[key]
        # It can be None when users want to clear out the default value
        if not entries:
            continue
        for entry in entries:
            stem = osp.splitext(entry['ann_file'])[0]
            entry['ann_file'] = stem + '.lmdb'
        generator_cfg[key] = entries

    return cfg
|
|
|
|
|
|
def main():
    """Prepare every dataset requested on the command line.

    For each dataset name the matching directory in the dataset zoo is
    looked up; names without a directory are skipped with a warning. The
    license notice is shown, the task specific config is loaded, updated
    with the command line options and handed to ``DatasetPreparer``.

    Raises:
        ValueError: If ``--lmdb`` is combined with a task other than
            ``textrecog``.
    """
    args = parse_args()
    lmdb_allowed = args.task == 'textrecog'
    if args.lmdb and not lmdb_allowed:
        raise ValueError('--lmdb only works with --task=textrecog')

    for dataset in args.datasets:
        dataset_dir = osp.join(args.dataset_zoo_path, dataset)
        if not osp.isdir(dataset_dir):
            warnings.warn(f'{dataset} is not supported yet. Please check '
                          'dataset zoo for supported datasets.')
            continue

        parse_meta(args.task, osp.join(dataset_dir, 'metafile.yml'))

        cfg = Config.fromfile(osp.join(dataset_dir, args.task + '.py'))
        if args.overwrite_cfg:
            # Only configs that define a config generator can be told to
            # overwrite an existing dataset config.
            if cfg.get('config_generator', None) is not None:
                cfg.config_generator.overwrite_cfg = args.overwrite_cfg

        cfg.nproc = args.nproc
        cfg.task = args.task
        cfg.dataset_name = dataset
        if args.lmdb:
            cfg = force_lmdb(cfg)

        DatasetPreparer.from_file(cfg).run(args.splits)
|
|
|
|
|
|
# Run the dataset preparation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|