def recog_anno_to_imginfo(
    file_paths: Sequence[str],
    labels: Sequence[str],
) -> Sequence[Dict]:
    """Convert a list of file_paths and labels for recognition tasks into the
    format of image_infos acceptable by :func:`dump_ocr_data()`. It's meant to
    maintain compatibility with the legacy annotation format in MMOCR 0.x.

    In MMOCR 0.x, data converters for recognition usually convert the
    annotations into a list of file paths and a list of labels, which look
    like the following:

    .. code-block:: python

        file_paths = ['1.jpg', '2.jpg', ...]
        labels = ['aaa', 'bbb', ...]

    This utility merges them into a list of dictionaries parsable by
    :func:`dump_ocr_data()`:

    .. code-block:: python

        [   # A list of dicts. Each dict stands for a single image.
            {
                "file_name": "1.jpg",
                "anno_info": [
                    {
                        "text": "aaa"
                    }
                ]
            },
            {
                "file_name": "2.jpg",
                "anno_info": [
                    {
                        "text": "bbb"
                    }
                ]
            },
            ...
        ]

    Args:
        file_paths (list[str]): A list of file paths to images.
        labels (list[str]): A list of text labels.

    Returns:
        list[dict]: Annotations parsable by :func:`dump_ocr_data()`.

    Raises:
        AssertionError: If either argument fails the ``is_type_list(..., str)``
            check, or if the two lists differ in length.
    """
    # Raise AssertionError explicitly instead of using ``assert`` statements:
    # callers (and the unit tests) rely on AssertionError, but plain asserts
    # are stripped under ``python -O``, which would let ``zip`` below silently
    # truncate mismatched inputs.
    if not is_type_list(file_paths, str):
        raise AssertionError('file_paths must be a list of str')
    if not is_type_list(labels, str):
        raise AssertionError('labels must be a list of str')
    if len(file_paths) != len(labels):
        raise AssertionError(
            f'file_paths and labels must have the same length, but got '
            f'{len(file_paths)} and {len(labels)}')

    # One dict per image, each carrying a single text annotation.
    return [
        dict(file_name=file_path, anno_info=[dict(text=label)])
        for file_path, label in zip(file_paths, labels)
    ]
def test_recog_anno_to_imginfo(self):
    """Check length validation and the merged output structure."""
    # Two file paths but only one label: the converter must refuse.
    self.assertRaises(AssertionError, recog_anno_to_imginfo,
                      ['a.jpg', 'b.jpg'], ['aaa'])

    # Matching lengths: every (path, label) pair becomes one image dict
    # holding a single text annotation.
    image_names = ['a.jpg', 'b.jpg']
    texts = ['aaa', 'bbb']
    expected = [
        dict(file_name='a.jpg', anno_info=[dict(text='aaa')]),
        dict(file_name='b.jpg', anno_info=[dict(text='bbb')]),
    ]
    converted = recog_anno_to_imginfo(image_names, texts)
    self.assertListEqual(expected, converted)
def parse_legacy_data(in_path: str,
                      format: str) -> Tuple[List[str], List[str]]:
    """Load legacy data and return a list of file paths and labels.

    Args:
        in_path (str): Path to annotation file.
        format (str): Annotation format. Choices are 'txt', 'jsonl' and
            'lmdb'.

    Returns:
        tuple(list[str], list[str]): File paths and labels.

    Raises:
        ValueError: If ``format`` is not a supported choice, or if a
            non-blank line of a 'txt' file does not consist of exactly two
            whitespace-separated fields.
    """
    # Reject unknown formats up front rather than silently returning two
    # empty lists.
    if format not in ('txt', 'jsonl', 'lmdb'):
        raise ValueError(
            f"Unsupported format '{format}'; "
            "choices are 'txt', 'jsonl' and 'lmdb'.")

    file_paths = []
    labels = []

    if format == 'lmdb':
        # TODO: Backend might be deprecated
        loader = AnnFileLoader(
            in_path,
            parser=dict(type='LineJsonParser', keys=['filename', 'text']),
            file_format='lmdb')
        # Index access is kept on purpose: only ``len()`` and ``[]`` are
        # known to be supported by the loader.
        for i in range(len(loader)):
            line_json = loader[i]
            file_paths.append(line_json['filename'])
            labels.append(line_json['text'])
        return file_paths, labels

    # Read as UTF-8 explicitly so non-ASCII labels do not depend on the
    # platform's default encoding.
    with open(in_path, encoding='utf-8') as f:
        if format == 'txt':
            strip_cls = StringStrip()
            for line_no, line in enumerate(f, start=1):
                line = strip_cls(line)
                if not line.strip():
                    # Blank (e.g. trailing) lines carry no annotation.
                    continue
                fields = line.split()  # Assume no extra spaces
                if len(fields) != 2:
                    raise ValueError(
                        f'{in_path}:{line_no}: expected '
                        f"'<file_path> <label>', got {line!r}")
                file_path, label = fields
                file_paths.append(file_path)
                labels.append(label)
        else:  # 'jsonl': one JSON object per line
            for line in f:
                if not line.strip():
                    continue
                datum = json.loads(line)
                file_paths.append(datum['filename'])
                labels.append(datum['text'])

    return file_paths, labels


def parse_args() -> argparse.Namespace:
    """Parse input arguments.

    Returns:
        argparse.Namespace: Parsed arguments with attributes ``in_path``,
        ``out_path`` and ``format``.
    """
    parser = argparse.ArgumentParser(
        description='Convert annotations for '
        'text recognition tasks in MMOCR 0.x into the latest openmmlab format.'
    )
    parser.add_argument(
        'in_path', help='The path to legacy recognition data file')
    parser.add_argument(
        'out_path', help='The output json path in openmmlab format')
    parser.add_argument(
        '--format',
        choices=['txt', 'jsonl', 'lmdb'],
        type=str,
        default='txt',
        help='Legacy data format')
    args = parser.parse_args()
    return args


def main():
    """Convert a legacy recognition annotation file into the new format."""
    args = parse_args()
    file_paths, labels = parse_legacy_data(args.in_path, args.format)
    img_infos = recog_anno_to_imginfo(file_paths, labels)
    dump_ocr_data(img_infos, args.out_path, 'textrecog')
    print('finish')


if __name__ == '__main__':
    main()