Add recognition data migrator

pull/1178/head
gaotongxiao 2022-05-18 09:31:06 +00:00
parent 9acc3680cb
commit df2f7b69db
4 changed files with 176 additions and 4 deletions

View File

@ -6,7 +6,8 @@ from .box_util import (bezier_to_polygon, is_on_same_line, sort_points,
from .check_argument import (equal_len, is_2dlist, is_3dlist, is_none_or_type,
is_type_list, valid_boundary)
from .collect_env import collect_env
from .data_convert_util import convert_annotations, dump_ocr_data
from .data_convert_util import (convert_annotations, dump_ocr_data,
recog_anno_to_imginfo)
from .fileio import list_from_file, list_to_file
from .img_util import drop_orientation, is_not_png
from .lmdb_util import recog2lmdb
@ -23,6 +24,6 @@ __all__ = [
'convert_annotations', 'is_not_png', 'list_to_file', 'list_from_file',
'is_on_same_line', 'stitch_boxes_into_lines', 'StringStrip',
'revert_sync_batchnorm', 'bezier_to_polygon', 'sort_points',
'setup_multi_processes', 'recog2lmdb', 'dump_ocr_data', 'rescale_polygons',
'rescale_polygon'
'setup_multi_processes', 'recog2lmdb', 'dump_ocr_data',
'recog_anno_to_imginfo', 'rescale_polygons', 'rescale_polygon'
]

View File

@ -4,6 +4,8 @@ from typing import Dict, Sequence
import mmcv
from mmocr.utils import is_type_list
# TODO: Remove it when all converters no longer need it
def convert_annotations(image_infos, out_json_name):
@ -164,3 +166,65 @@ def dump_ocr_data(image_infos: Sequence[Dict], out_json_name: str,
mmcv.dump(out_json, out_json_name)
return out_json
def recog_anno_to_imginfo(
    file_paths: Sequence[str],
    labels: Sequence[str],
) -> Sequence[Dict]:
    """Pair up recognition file paths and labels as ``image_infos``.

    Data converters for recognition in MMOCR 0.x usually produce two
    parallel lists, one with file paths and one with labels:

    .. code-block:: python

        file_paths = ['1.jpg', '2.jpg', ...]
        labels = ['aaa', 'bbb', ...]

    This helper keeps that legacy format usable by zipping the two lists
    into the list-of-dicts layout that :func:`dump_ocr_data()` parses:

    .. code-block:: python

        [  # One dict per image.
            {
                "file_name": "1.jpg",
                "anno_info": [
                    {
                        "text": "aaa"
                    }
                ]
            },
            {
                "file_name": "2.jpg",
                "anno_info": [
                    {
                        "text": "bbb"
                    }
                ]
            },
            ...
        ]

    Args:
        file_paths (list[str]): A list of file paths to images.
        labels (list[str]): A list of text labels.

    Returns:
        list[dict]: Annotations parsable by :func:`dump_ocr_data()`.

    Raises:
        AssertionError: If either argument fails the
            ``is_type_list(..., str)`` check, or if the two lists differ
            in length.
    """
    assert is_type_list(file_paths, str)
    assert is_type_list(labels, str)
    assert len(file_paths) == len(labels)

    # Each image carries exactly one annotation holding its text label.
    return [{
        'file_name': path,
        'anno_info': [{
            'text': text
        }]
    } for path, text in zip(file_paths, labels)]

View File

@ -5,7 +5,7 @@ from unittest import TestCase
import mmcv
from mmocr.utils.data_convert_util import dump_ocr_data
from mmocr.utils.data_convert_util import dump_ocr_data, recog_anno_to_imginfo
class TestDataConvertUtil(TestCase):
@ -136,3 +136,27 @@ class TestDataConvertUtil(TestCase):
dump_ocr_data(input_data, output_path, 'textrecog')
result = mmcv.load(output_path)
self.assertDictEqual(result, recog_target)
def test_recog_anno_to_imginfo(self):
    """Check length validation and the output layout of the converter."""
    # Two paths but only one label: the length assertion must fire.
    with self.assertRaises(AssertionError):
        recog_anno_to_imginfo(['a.jpg', 'b.jpg'], ['aaa'])

    # Matching inputs produce one dict per image, in input order.
    names = ['a.jpg', 'b.jpg']
    texts = ['aaa', 'bbb']
    expected = [
        dict(file_name='a.jpg', anno_info=[dict(text='aaa')]),
        dict(file_name='b.jpg', anno_info=[dict(text='bbb')]),
    ]
    self.assertListEqual(expected, recog_anno_to_imginfo(names, texts))

View File

@ -0,0 +1,83 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
from typing import List, Tuple
from mmocr.datasets.utils.loader import AnnFileLoader
from mmocr.utils import StringStrip, dump_ocr_data, recog_anno_to_imginfo
def parse_legacy_data(in_path: str,
                      format: str) -> Tuple[List[str], List[str]]:
    """Load legacy data and return a list of file paths and labels.

    Args:
        in_path (str): Path to annotation file.
        format (str): Annotation format. Choices are 'txt', 'jsonl' and
            'lmdb'.

    Returns:
        tuple(list[str], list[str]): File paths and labels.

    Raises:
        ValueError: If ``format`` is not one of the supported choices, or
            if a non-empty 'txt' line does not split into exactly two
            whitespace-separated fields.
    """
    # Fail loudly instead of silently returning two empty lists when the
    # caller passes a format none of the branches below handles.
    if format not in ('txt', 'jsonl', 'lmdb'):
        raise ValueError(
            f"Unsupported format '{format}'. "
            "Choices are 'txt', 'jsonl' and 'lmdb'.")

    file_paths = []
    labels = []

    if format == 'lmdb':
        # TODO: Backend might be deprecated
        loader = AnnFileLoader(
            in_path,
            parser=dict(type='LineJsonParser', keys=['filename', 'text']),
            file_format='lmdb')
        # Only len() and item access on the loader are relied upon here.
        for i in range(len(loader)):
            line_json = loader[i]
            file_paths.append(line_json['filename'])
            labels.append(line_json['text'])
        return file_paths, labels

    # Explicit encoding so decoding does not depend on the platform's
    # default locale encoding.
    with open(in_path, encoding='utf-8') as f:
        if format == 'txt':
            strip_cls = StringStrip()
            for line in f:
                line = strip_cls(line)
                if not line:
                    # Nothing left after stripping (e.g. a trailing blank
                    # line); there is no record to unpack.
                    continue
                file_path, label = line.split()  # Assume no extra spaces
                file_paths.append(file_path)
                labels.append(label)
        else:  # 'jsonl': one JSON object per line
            for line in f:
                if not line.strip():
                    # json.loads() would raise on an empty line.
                    continue
                datum = json.loads(line)
                file_paths.append(datum['filename'])
                labels.append(datum['text'])
    return file_paths, labels
def parse_args():
    """Parse input arguments.

    Returns:
        argparse.Namespace: Parsed arguments with the attributes
        ``in_path``, ``out_path`` and ``format``.
    """
    parser = argparse.ArgumentParser(
        # Adjacent string literals are concatenated verbatim, so the
        # separating space has to be part of one of the pieces.
        description='Convert annotations for '
        'text recognition tasks in MMOCR 0.x into the latest openmmlab format.'
    )
    parser.add_argument(
        'in_path', help='The path to legacy recognition data file')
    parser.add_argument(
        'out_path', help='The output json path in openmmlab format')
    parser.add_argument(
        '--format',
        choices=['txt', 'jsonl', 'lmdb'],
        type=str,
        default='txt',
        help='Legacy data format')
    args = parser.parse_args()
    return args
def main():
    """Convert the legacy recognition annotation file named on the CLI.

    Reads the file at ``in_path`` in the requested ``format``, turns the
    resulting paths and labels into image infos, and dumps them to
    ``out_path`` as a 'textrecog' task file.
    """
    cli = parse_args()
    paths, texts = parse_legacy_data(cli.in_path, cli.format)
    dump_ocr_data(
        recog_anno_to_imginfo(paths, texts), cli.out_path, 'textrecog')
    print('finish')


if __name__ == '__main__':
    main()