mirror of https://github.com/open-mmlab/mmocr.git
[Fix] MJSynth & SynthText Dataset Preparer config (#1805)
* [Fix] MJSynth * update * fix * fix
pull/1820/head
parent
bb591d2b1b
commit
4b887676a3
|
@ -5,3 +5,9 @@ mjsynth_textrecog_train = dict(
|
|||
data_root=mjsynth_textrecog_data_root,
|
||||
ann_file='textrecog_train.json',
|
||||
pipeline=None)
|
||||
|
||||
mjsynth_sub_textrecog_train = dict(
|
||||
type='OCRDataset',
|
||||
data_root=mjsynth_textrecog_data_root,
|
||||
ann_file='subset_textrecog_train.json',
|
||||
pipeline=None)
|
||||
|
|
|
@ -23,12 +23,19 @@ train_preparer = dict(
|
|||
'annotations/annotation.txt'
|
||||
]
|
||||
]),
|
||||
dict(
|
||||
url='https://download.openmmlab.com/mmocr/data/1.x/recog/'
|
||||
'Syn90k/subset_textrecog_train.json',
|
||||
save_name='subset_textrecog_train.json',
|
||||
md5='ba958d87bb170980f39e194180c15b9e',
|
||||
split=['train'],
|
||||
content=['annotation'])
|
||||
]),
|
||||
gatherer=dict(type='MonoGatherer', ann_name='annotation.txt'),
|
||||
parser=dict(
|
||||
type='ICDARTxtTextRecogAnnParser',
|
||||
type='MJSynthAnnParser',
|
||||
separator=' ',
|
||||
format='img text',
|
||||
format='img num',
|
||||
remove_strs=None),
|
||||
packer=dict(type='TextRecogPacker'),
|
||||
dumper=dict(type='JsonDumper'),
|
||||
|
@ -37,4 +44,10 @@ train_preparer = dict(
|
|||
delete = ['mjsynth', 'annotations']
|
||||
|
||||
config_generator = dict(
|
||||
type='TextRecogConfigGenerator', data_root=data_root, test_anns=None)
|
||||
type='TextRecogConfigGenerator',
|
||||
data_root=data_root,
|
||||
train_anns=[
|
||||
dict(ann_file='textrecog_train.json', dataset_postfix=''),
|
||||
dict(ann_file='subset_textrecog_train.json', dataset_postfix='sub'),
|
||||
],
|
||||
test_anns=None)
|
||||
|
|
|
@ -27,4 +27,5 @@ config_generator = dict(
|
|||
dict(
|
||||
ann_file='alphanumeric_textrecog_train.json',
|
||||
dataset_postfix='an'),
|
||||
])
|
||||
],
|
||||
test_anns=None)
|
||||
|
|
|
@ -5,6 +5,7 @@ from .ctw1500_parser import CTW1500AnnParser
|
|||
from .funsd_parser import FUNSDTextDetAnnParser
|
||||
from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
|
||||
ICDARTxtTextRecogAnnParser)
|
||||
from .mjsynth_parser import MJSynthAnnParser
|
||||
from .naf_parser import NAFAnnParser
|
||||
from .sroie_parser import SROIETextDetAnnParser
|
||||
from .svt_parser import SVTTextDetAnnParser
|
||||
|
@ -17,5 +18,5 @@ __all__ = [
|
|||
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
|
||||
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
|
||||
'SROIETextDetAnnParser', 'NAFAnnParser', 'CTW1500AnnParser',
|
||||
'SynthTextAnnParser'
|
||||
'SynthTextAnnParser', 'MJSynthAnnParser'
|
||||
]
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import os.path as osp
|
||||
from typing import List
|
||||
|
||||
from mmocr.registry import DATA_PARSERS
|
||||
from .icdar_txt_parser import ICDARTxtTextRecogAnnParser
|
||||
|
||||
|
||||
@DATA_PARSERS.register_module()
class MJSynthAnnParser(ICDARTxtTextRecogAnnParser):
    """MJSynth Text Recognition Annotation Parser.

    The annotation is stored in a txt file. Every line is split by
    ``separator`` into the fields named in ``format``; this parser only uses
    the ``img`` field. The transcription is NOT read from the annotation
    line: it is taken from the image file name, as the second
    ``_``-separated token of its basename. For example, an ``img`` value of
    ``some/dir/182_slinking_71711.jpg`` yields the text ``slinking``.

    The constructor is inherited unchanged from
    :class:`ICDARTxtTextRecogAnnParser`.

    Args:
        separator (str): The separator between each element in a line. Defaults
            to ','.
        ignore (str): The text to be ignored. Samples whose extracted
            transcription equals this value are dropped. Defaults to '#'.
        format (str): The format of the annotation. It must contain an
            ``img`` field. Defaults to 'img, text'.
        encoding (str): The encoding of the annotation file. Defaults to
            'utf-8-sig'.
        nproc (int): The number of processes to parse the annotation. Defaults
            to 1.
        base_name (bool): Whether to use the basename of the image path as the
            image name. Defaults to False.
        remove_strs (List[str], Optional): Used to remove redundant strings in
            the transcription. Defaults to ['"'].
    """

    def parse_files(self, img_dir: str, ann_path: str) -> List:
        """Parse the annotation file into ``(img_path, text)`` samples.

        Args:
            img_dir (str): Directory that is joined in front of every image
                path found in the annotation file.
            ann_path (str): Path to the annotation file.

        Returns:
            List: One ``(image path, transcription)`` tuple for every
            annotation line that is not ignored.

        Raises:
            IndexError: If the basename of an image path contains no ``_``,
                so no transcription can be extracted from it.
        """
        assert isinstance(ann_path, str)
        # ``getattr`` with a default keeps the previous behaviour (full
        # relative path) whenever the attribute is unset or False.
        use_base_name = getattr(self, 'base_name', False)
        samples = []
        for anno in self.loader(
                file_path=ann_path,
                format=self.format,
                encoding=self.encoding,
                separator=self.sep):
            img_name = anno['img']
            file_name = osp.basename(img_name)
            # The word is the second ``_``-separated token of the file name.
            name_parts = file_name.split('_')
            if len(name_parts) < 2:
                # Same exception type as the bare index lookup raised before,
                # but now pointing at the offending entry.
                raise IndexError(
                    'Cannot extract the transcription from image path '
                    '"{}" listed in "{}": the file name is expected to '
                    'contain at least one "_".'.format(img_name, ann_path))
            text = name_parts[1]
            if self.remove_strs is not None:
                for strs in self.remove_strs:
                    text = text.replace(strs, '')
            if text == self.ignore:
                continue
            if use_base_name:
                img_name = file_name
            samples.append((osp.join(img_dir, img_name), text))

        return samples
|
Loading…
Reference in New Issue