[Enhancement] Simplify mono_gather

* [Enhancement] Simplify mono_gather

* remove mono gather split

Co-authored-by: liukuikun <641417025@qq.com>
pull/1567/head
Tong Gao 2022-12-06 16:03:12 +08:00 committed by GitHub
parent 3a0aa05d9c
commit d9ea92191e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 41 additions and 23 deletions
dataset_zoo
mmocr/datasets/preparers

View File

@ -41,7 +41,7 @@ data_obtainer = dict(
# type='TextRecogDataConverter',
# splits=['test'],
# data_root=data_root,
# gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
# gatherer=dict(type='mono_gather', test_ann='test.txt'),
# parser=dict(
# type='ICDARTxtTextRecogAnnParser',
# separator=' ',

View File

@ -79,9 +79,11 @@ data_obtainer = dict(
# type='TextRecogDataConverter',
# splits=['train', 'test'],
# data_root=data_root,
# gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
# gatherer=dict(
# type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
# parser=dict(
# type='ICDARTxtTextRecogAnnParser', separator=', ', format='img, text'), # noqa
# type='ICDARTxtTextRecogAnnParser', separator=', ',
# format='img, text'), # noqa
# dumper=dict(type='JsonDumper'))
config_generator = dict(

View File

@ -51,7 +51,8 @@ data_converter = dict(
type='TextRecogDataConverter',
splits=['train', 'test'],
data_root=data_root,
gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
gatherer=dict(
type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
parser=dict(type='ICDARTxtTextRecogAnnParser', encoding='utf-8-sig'),
dumper=dict(type='JsonDumper'))

View File

@ -37,7 +37,8 @@ data_converter = dict(
type='TextRecogDataConverter',
splits=['train', 'test'],
data_root=data_root,
gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
gatherer=dict(
type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
parser=dict(
type='ICDARTxtTextRecogAnnParser',
encoding='utf-8',

View File

@ -21,7 +21,8 @@ data_converter = dict(
type='TextDetDataConverter',
splits=['train', 'test'],
data_root=data_root,
gatherer=dict(type='mono_gather', mapping="f'{split}.xml'"),
gatherer=dict(
type='mono_gather', train_ann='train.xml', test_ann='test.xml'),
parser=dict(type='SVTTextDetAnnParser', data_root=data_root),
dumper=dict(type='JsonDumper'),
delete=['annotations', 'svt'])

View File

@ -20,7 +20,7 @@ data_converter = dict(
type='TextRecogDataConverter',
splits=['test'],
data_root=data_root,
gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
gatherer=dict(type='mono_gather', test_ann='test.txt'),
parser=dict(
type='ICDARTxtTextRecogAnnParser', separator=' ', format='img text'),
dumper=dict(type='JsonDumper'),

View File

@ -26,7 +26,10 @@ data_converter = dict(
splits=['train', 'test'],
data_root=data_root,
gatherer=dict(
type='mono_gather', mapping="f'{split}.txt'", ann_path=data_root),
type='mono_gather',
train_ann='train.txt',
test_ann='test.txt',
ann_path=data_root),
parser=dict(type='WildreceiptKIEAnnParser', data_root=data_root),
dumper=dict(type='WildreceiptOpensetDumper'),
delete=['wildreceipt'])

View File

@ -72,23 +72,23 @@ class BaseDataConverter:
files.
"""
# Convert and dump annotations to MMOCR format
for split in self.splits:
print(f'Parsing {split} split...')
for self.current_split in self.splits:
print(f'Parsing {self.current_split} split...')
# Gather the info such as file names required by parser
img_path = osp.join(self.data_root, self.img_dir, split)
img_path = osp.join(self.data_root, self.img_dir,
self.current_split)
ann_path = osp.join(self.data_root, 'annotations')
gatherer_args = dict(
img_path=img_path, ann_path=ann_path, split=split)
gatherer_args = dict(img_path=img_path, ann_path=ann_path)
gatherer_args.update(self.gatherer_args)
files = self.gatherer(**gatherer_args)
# Convert dataset annotations to MMOCR format
samples = self.parser.parse_files(files, split)
print(f'Packing {split} annotations...')
func = partial(self.pack_instance, split=split)
samples = self.parser.parse_files(files, self.current_split)
print(f'Packing {self.current_split} annotations...')
func = partial(self.pack_instance, split=self.current_split)
samples = track_parallel_progress(func, samples, nproc=self.nproc)
samples = self.add_meta(samples)
# Dump annotation files
self.dumper.dump(samples, self.data_root, split)
self.dumper.dump(samples, self.data_root, self.current_split)
self.clean()
@abstractmethod
@ -116,7 +116,11 @@ class BaseDataConverter:
Dict: A dict contains the meta information and samples.
"""
def mono_gather(self, ann_path: str, mapping: str, split: str,
def mono_gather(self,
ann_path: str,
train_ann: Optional[str] = None,
val_ann: Optional[str] = None,
test_ann: Optional[str] = None,
**kwargs) -> str:
"""Gather the dataset file. Specifically for the case that only one
annotation file is needed. For example,
@ -127,16 +131,22 @@ class BaseDataConverter:
Args:
ann_path (str): Path to the annotations.
mapping (str): Mapping rule of the annotation names. For example,
"f'{split}.json'" will return 'train.json' when the split is
'train'.
split (str): The current split.
train_ann (str, optional): The annotation file name of the train
split in the original dataset. Defaults to None.
val_ann (str, optional): The annotation file name of the val split
in the original dataset. Defaults to None.
test_ann (str, optional): The annotation file name of the test
split in the original dataset. Defaults to None.
Returns:
str: Path to the annotation file.
"""
return osp.join(ann_path, eval(mapping))
ann_file = eval(f'{self.current_split}_ann')
if ann_file is None:
raise ValueError(
f'{self.current_split}_ann must be specified in gatherer!')
return osp.join(ann_path, ann_file)
def pair_gather(self, img_path: str, suffixes: List, rule: Sequence,
**kwargs) -> List[Tuple]: