[TODO] update recog data_migrator

This commit is contained in:
jiangqing.vendor 2022-07-12 11:40:13 +00:00 committed by gaotongxiao
parent 23e1f2432a
commit ee1212a5cd

View File

@ -3,7 +3,7 @@ import argparse
import json import json
from typing import List, Tuple from typing import List, Tuple
from mmocr.datasets.utils.loader import AnnFileLoader from mmocr.datasets import RecogLMDBDataset
from mmocr.utils import StringStrip, dump_ocr_data, recog_anno_to_imginfo from mmocr.utils import StringStrip, dump_ocr_data, recog_anno_to_imginfo
@ -14,7 +14,8 @@ def parse_legacy_data(in_path: str,
Args: Args:
in_path (str): Path to annotation file. in_path (str): Path to annotation file.
format (str): Annotation format. Choices are 'txt', 'json' and 'lmdb'. format (str): Annotation format. Choices are 'txt', 'json' and 'lmdb'.
For 'lmdb' format, the lmdb file should only contain labels. For
lmdb files with labels and images, the conversion is unnecessary.
Returns: Returns:
tuple(list[str], list[str]): File paths and labels. tuple(list[str], list[str]): File paths and labels.
""" """
@ -22,15 +23,12 @@ def parse_legacy_data(in_path: str,
labels = [] labels = []
strip_cls = StringStrip() strip_cls = StringStrip()
if format == 'lmdb': if format == 'lmdb':
# TODO: Backend might be deprecated dataset = RecogLMDBDataset(
loader = AnnFileLoader(
in_path, in_path,
parser=dict(type='LineJsonParser', keys=['filename', 'text']), parser_cfg=dict(type='LineJsonParser', keys=['filename', 'text']))
file_format='lmdb') for data_info in dataset:
num = len(loader) file_path = data_info['img_path']
for i in range(num): label = data_info['instances'][0]['text']
line_json = loader[i]
file_path, label = line_json['filename'], line_json['text']
file_path = strip_cls(file_path) file_path = strip_cls(file_path)
label = strip_cls(label) label = strip_cls(label)
# MJ's file_path starts with './' # MJ's file_path starts with './'
@ -83,6 +81,8 @@ def parse_args():
default='txt', default='txt',
help='Legacy data format') help='Legacy data format')
args = parser.parse_args() args = parser.parse_args()
if args.out_path.split('.')[-1] != 'json':
raise ValueError('The output path must be a json file.')
return args return args