[TODO] update recog data_migrator

pull/1178/head
jiangqing.vendor 2022-07-12 11:40:13 +00:00 committed by gaotongxiao
parent 23e1f2432a
commit ee1212a5cd
1 changed files with 10 additions and 10 deletions
tools/data/textrecog

View File

@ -3,7 +3,7 @@ import argparse
import json
from typing import List, Tuple
from mmocr.datasets.utils.loader import AnnFileLoader
from mmocr.datasets import RecogLMDBDataset
from mmocr.utils import StringStrip, dump_ocr_data, recog_anno_to_imginfo
@ -14,7 +14,8 @@ def parse_legacy_data(in_path: str,
Args:
in_path (str): Path to annotation file.
format (str): Annotation format. Choices are 'txt', 'json' and 'lmdb'.
For 'lmdb' format, the lmdb file should only contain labels. For
lmdb files with both labels and images, the conversion is unnecessary.
Returns:
tuple(list[str], list[str]): File paths and labels.
"""
@ -22,15 +23,12 @@ def parse_legacy_data(in_path: str,
labels = []
strip_cls = StringStrip()
if format == 'lmdb':
# TODO: Backend might be deprecated
loader = AnnFileLoader(
dataset = RecogLMDBDataset(
in_path,
parser=dict(type='LineJsonParser', keys=['filename', 'text']),
file_format='lmdb')
num = len(loader)
for i in range(num):
line_json = loader[i]
file_path, label = line_json['filename'], line_json['text']
parser_cfg=dict(type='LineJsonParser', keys=['filename', 'text']))
for data_info in dataset:
file_path = data_info['img_path']
label = data_info['instances'][0]['text']
file_path = strip_cls(file_path)
label = strip_cls(label)
# MJ's file_path starts with './'
@ -83,6 +81,8 @@ def parse_args():
default='txt',
help='Legacy data format')
args = parser.parse_args()
if args.out_path.split('.')[-1] != 'json':
raise ValueError('The output path must be a json file.')
return args