Add recognition data migrator

2022-05-18 09:31:06 +00:00 · 2022-05-18 09:31:06 +00:00 · df2f7b69db
parent 9acc3680cb
commit df2f7b69db
4 changed files with 176 additions and 4 deletions
--- a/mmocr/utils/__init__.py
+++ b/mmocr/utils/__init__.py
@ -6,7 +6,8 @@ from .box_util import (bezier_to_polygon, is_on_same_line, sort_points,
 from .check_argument import (equal_len, is_2dlist, is_3dlist, is_none_or_type,
                             is_type_list, valid_boundary)
 from .collect_env import collect_env
-from .data_convert_util import convert_annotations, dump_ocr_data
+from .data_convert_util import (convert_annotations, dump_ocr_data,
+                                recog_anno_to_imginfo)
 from .fileio import list_from_file, list_to_file
 from .img_util import drop_orientation, is_not_png
 from .lmdb_util import recog2lmdb
@ -23,6 +24,6 @@ __all__ = [
    'convert_annotations', 'is_not_png', 'list_to_file', 'list_from_file',
    'is_on_same_line', 'stitch_boxes_into_lines', 'StringStrip',
    'revert_sync_batchnorm', 'bezier_to_polygon', 'sort_points',
-    'setup_multi_processes', 'recog2lmdb', 'dump_ocr_data', 'rescale_polygons',
-    'rescale_polygon'
+    'setup_multi_processes', 'recog2lmdb', 'dump_ocr_data',
+    'recog_anno_to_imginfo', 'rescale_polygons', 'rescale_polygon'
 ]
--- a/mmocr/utils/data_convert_util.py
+++ b/mmocr/utils/data_convert_util.py
@ -4,6 +4,8 @@ from typing import Dict, Sequence

 import mmcv

+from mmocr.utils import is_type_list
+

 # TODO: Remove it when all converters no longer need it
 def convert_annotations(image_infos, out_json_name):
@ -164,3 +166,65 @@ def dump_ocr_data(image_infos: Sequence[Dict], out_json_name: str,
    mmcv.dump(out_json, out_json_name)

    return out_json
+
+
+def recog_anno_to_imginfo(
+    file_paths: Sequence[str],
+    labels: Sequence[str],
+) -> Sequence[Dict]:
+    """Pair up file paths and text labels as ``image_infos`` entries.
+
+    MMOCR 0.x recognition converters typically yield two parallel lists,
+    one holding the image paths and one holding the transcriptions:
+
+    .. code-block:: python
+
+        file_paths = ['1.jpg', '2.jpg', ...]
+        labels = ['aaa', 'bbb', ...]
+
+    This helper zips them into the list-of-dicts layout that
+    :func:`dump_ocr_data()` consumes, one dict per image:
+
+    .. code-block:: python
+
+        [
+            {
+                "file_name": "1.jpg",
+                "anno_info": [
+                    {
+                        "text": "aaa"
+                    }
+                ]
+            },
+            {
+                "file_name": "2.jpg",
+                "anno_info": [
+                    {
+                        "text": "bbb"
+                    }
+                ]
+            },
+            ...
+        ]
+
+    Args:
+        file_paths (list[str]): Paths to the images.
+        labels (list[str]): Text labels, aligned with ``file_paths`` by
+            index.
+
+    Returns:
+        list[dict]: Annotations parsable by :func:`dump_ocr_data()`.
+
+    Raises:
+        AssertionError: If either argument fails the ``is_type_list`` check
+            against ``str``, or if the two lists differ in length.
+    """
+    assert is_type_list(file_paths, str)
+    assert is_type_list(labels, str)
+    assert len(file_paths) == len(labels)
+
+    # Lengths match (asserted above), so zip() pairs every entry.
+    return [
+        dict(file_name=path, anno_info=[dict(text=label)])
+        for path, label in zip(file_paths, labels)
+    ]
--- a/tests/test_utils/test_data_convert_util.py
+++ b/tests/test_utils/test_data_convert_util.py
@ -5,7 +5,7 @@ from unittest import TestCase

 import mmcv

-from mmocr.utils.data_convert_util import dump_ocr_data
+from mmocr.utils.data_convert_util import dump_ocr_data, recog_anno_to_imginfo


 class TestDataConvertUtil(TestCase):
@ -136,3 +136,27 @@ class TestDataConvertUtil(TestCase):
            dump_ocr_data(input_data, output_path, 'textrecog')
            result = mmcv.load(output_path)
            self.assertDictEqual(result, recog_target)
+
+    def test_recog_anno_to_imginfo(self):
+        # Exercises both the validation and the conversion path.
+        image_names = ['a.jpg', 'b.jpg']
+
+        # Lists of different lengths must be rejected.
+        with self.assertRaises(AssertionError):
+            recog_anno_to_imginfo(image_names, ['aaa'])
+
+        # Equal-length lists yield one entry per image, in input order.
+        texts = ['aaa', 'bbb']
+        expected = [
+            dict(
+                file_name='a.jpg',
+                anno_info=[dict(text='aaa')],
+            ),
+            dict(
+                file_name='b.jpg',
+                anno_info=[dict(text='bbb')],
+            ),
+        ]
+
+        actual = recog_anno_to_imginfo(image_names, texts)
+        self.assertListEqual(expected, actual)
--- a/tools/data/textrecog/data_migrator.py
+++ b/tools/data/textrecog/data_migrator.py
@ -0,0 +1,83 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+from typing import List, Tuple
+
+from mmocr.datasets.utils.loader import AnnFileLoader
+from mmocr.utils import StringStrip, dump_ocr_data, recog_anno_to_imginfo
+
+
+def parse_legacy_data(in_path: str,
+                      format: str) -> Tuple[List[str], List[str]]:
+    """Load legacy data and return a list of file paths and labels.
+
+    Args:
+        in_path (str): Path to annotation file.
+        format (str): Annotation format. Choices are 'txt', 'jsonl' and 'lmdb'.
+
+    Returns:
+        tuple(list[str], list[str]): File paths and labels.
+
+    Raises:
+        ValueError: If ``format`` is not one of the supported choices.
+    """
+    file_paths, labels = [], []
+    if format == 'lmdb':
+        # TODO: Backend might be deprecated
+        loader = AnnFileLoader(
+            in_path,
+            parser=dict(type='LineJsonParser', keys=['filename', 'text']),
+            file_format='lmdb')
+        for i in range(len(loader)):
+            line_json = loader[i]
+            file_paths.append(line_json['filename'])
+            labels.append(line_json['text'])
+    elif format == 'txt':
+        strip_cls = StringStrip()
+        with open(in_path, encoding='utf-8') as f:  # not locale-dependent
+            for line in f:
+                path, text = strip_cls(line).split()  # Assume no extra spaces
+                file_paths.append(path)
+                labels.append(text)
+    elif format == 'jsonl':
+        with open(in_path, encoding='utf-8') as f:  # one JSON object per line
+            for line in f:
+                datum = json.loads(line)
+                file_paths.append(datum['filename'])
+                labels.append(datum['text'])
+    else:
+        raise ValueError(f'Unsupported annotation format: {format!r}')
+
+    return file_paths, labels
+
+
+def parse_args():
+    """Parse command line arguments into an ``argparse.Namespace``."""
+    parser = argparse.ArgumentParser(
+        # Keep the trailing space: adjacent literals are concatenated as-is.
+        description='Convert annotations for '
+        'text recognition tasks in MMOCR 0.x into the latest openmmlab format.'
+    )
+    parser.add_argument(
+        'in_path', help='The path to legacy recognition data file')
+    parser.add_argument(
+        'out_path', help='The output json path in openmmlab format')
+    parser.add_argument(
+        '--format',
+        choices=['txt', 'jsonl', 'lmdb'],
+        type=str,
+        default='txt',
+        help='Legacy data format')
+    return parser.parse_args()
+
+
+def main():
+    """Convert a legacy recognition annotation file and dump the result."""
+    cli = parse_args()
+    infos = recog_anno_to_imginfo(*parse_legacy_data(cli.in_path, cli.format))
+    dump_ocr_data(infos, cli.out_path, 'textrecog')
+    print('finish')
+
+
+if __name__ == '__main__':
+    main()