mmocr/tools/data/textrecog/ic13_converter.py

# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
import os.path as osp

from mmocr.utils.fileio import list_to_file


def convert_annotations(root_path, split, format):
    """Convert original annotations to mmocr format
    The annotation format is as the following:
        word_1.png, "flying"
        word_2.png, "today"
        word_3.png, "means"
    After this module, the annotation has been changed to the format below:
        txt:
        word_1.png flying
        word_2.png today
        word_3.png means

        jsonl:
        {'filename': 'word_1.png', 'text': 'flying'}
        {'filename': 'word_2.png', 'text': 'today'}
        {'filename': 'word_3.png', 'text': 'means'}

    Args:
        root_path (str): The root path of the dataset
        split (str): The split of dataset. Namely: training or test
        format (str): Annotation format, should be either 'txt' or 'jsonl'

    """
    assert isinstance(root_path, str)
    assert isinstance(split, str)

    lines = []
    with open(
            osp.join(root_path, 'annotations',
                     f'Challenge2_{split}_Task3_GT.txt'),
            'r',
            encoding='"utf-8-sig') as f:
        annos = f.readlines()
    dst_image_root = osp.join(root_path, split)
    for anno in annos:
        # text may contain comma ','
        dst_img_name, word = anno.split(', "')
        word = word.replace('"\n', '')

        if format == 'txt':
            lines.append(f'{osp.basename(dst_image_root)}/{dst_img_name} '
                         f'{word}')
        elif format == 'jsonl':
            lines.append(
                json.dumps({
                    'filename':
                    f'{osp.basename(dst_image_root)}/{dst_img_name}',
                    'text': word
                }))
        else:
            raise NotImplementedError

    list_to_file(osp.join(root_path, f'{split}_label.{format}'), lines)


def parse_args():
    parser = argparse.ArgumentParser(
        description='Generate training and test set of IC13')
    parser.add_argument('root_path', help='Root dir path of IC13')
    parser.add_argument(
        '--format',
        default='jsonl',
        help='Use jsonl or string to format annotations',
        choices=['jsonl', 'txt'])
    args = parser.parse_args()
    return args


def main():
    args = parse_args()
    root_path = args.root_path

    for split in ['Train', 'Test']:
        convert_annotations(root_path, split, args.format)
        print(f'{split} split converted.')


if __name__ == '__main__':
    main()
[Feature] Add IC13 (Focused Scene Text) Data Converter (#861) * add ic13 data converter * fix extension * add docs * fix doc * fix doc * update docs * move directory tree * fix indentation * revert docs Co-authored-by: gaotongxiao <gaotongxiao@gmail.com> 2022-03-30 15:13:29 +08:00			`# Copyright (c) OpenMMLab. All rights reserved.`
			`import argparse`
			`import json`
			`import os.path as osp`

			`from mmocr.utils.fileio import list_to_file`


			`def convert_annotations(root_path, split, format):`
			`"""Convert original annotations to mmocr format`
			`The annotation format is as the following:`
			`word_1.png, "flying"`
			`word_2.png, "today"`
			`word_3.png, "means"`
			`After this module, the annotation has been changed to the format below:`
			`txt:`
			`word_1.png flying`
			`word_2.png today`
			`word_3.png means`

			`jsonl:`
			`{'filename': 'word_1.png', 'text': 'flying'}`
			`{'filename': 'word_2.png', 'text': 'today'}`
			`{'filename': 'word_3.png', 'text': 'means'}`

			`Args:`
			`root_path (str): The root path of the dataset`
			`split (str): The split of dataset. Namely: training or test`
			`format (str): Annotation format, should be either 'txt' or 'jsonl'`

			`"""`
			`assert isinstance(root_path, str)`
			`assert isinstance(split, str)`

			`lines = []`
			`with open(`
			`osp.join(root_path, 'annotations',`
			`f'Challenge2_{split}_Task3_GT.txt'),`
			`'r',`
			`encoding='"utf-8-sig') as f:`
			`annos = f.readlines()`
			`dst_image_root = osp.join(root_path, split)`
			`for anno in annos:`
			`# text may contain comma ','`
			`dst_img_name, word = anno.split(', "')`
			`word = word.replace('"\n', '')`

			`if format == 'txt':`
			`lines.append(f'{osp.basename(dst_image_root)}/{dst_img_name} '`
			`f'{word}')`
			`elif format == 'jsonl':`
			`lines.append(`
			`json.dumps({`
			`'filename':`
			`f'{osp.basename(dst_image_root)}/{dst_img_name}',`
			`'text': word`
			`}))`
			`else:`
			`raise NotImplementedError`

			`list_to_file(osp.join(root_path, f'{split}_label.{format}'), lines)`


			`def parse_args():`
			`parser = argparse.ArgumentParser(`
			`description='Generate training and test set of IC13')`
			`parser.add_argument('root_path', help='Root dir path of IC13')`
			`parser.add_argument(`
			`'--format',`
			`default='jsonl',`
			`help='Use jsonl or string to format annotations',`
			`choices=['jsonl', 'txt'])`
			`args = parser.parse_args()`
			`return args`


			`def main():`
			`args = parse_args()`
			`root_path = args.root_path`

			`for split in ['Train', 'Test']:`
			`convert_annotations(root_path, split, args.format)`
			`print(f'{split} split converted.')`


			`if __name__ == '__main__':`
			`main()`