mirror of https://github.com/open-mmlab/mmocr.git
[Enhancement] Simplify mono_gather (#1588)
* [Enhancement] Simplify mono_gather
* remove mono gather split

Co-authored-by: liukuikun <641417025@qq.com>

pull/1567/head
parent
3a0aa05d9c
commit
d9ea92191e
dataset_zoo
cute80
icdar2013
icdar2015
iiit5k
svt
svtp
wildreceipt
mmocr/datasets/preparers
|
@ -41,7 +41,7 @@ data_obtainer = dict(
|
|||
# type='TextRecogDataConverter',
|
||||
# splits=['test'],
|
||||
# data_root=data_root,
|
||||
# gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
|
||||
# gatherer=dict(type='mono_gather', test_ann='test.txt'),
|
||||
# parser=dict(
|
||||
# type='ICDARTxtTextRecogAnnParser',
|
||||
# separator=' ',
|
||||
|
|
|
@ -79,9 +79,11 @@ data_obtainer = dict(
|
|||
# type='TextRecogDataConverter',
|
||||
# splits=['train', 'test'],
|
||||
# data_root=data_root,
|
||||
# gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
|
||||
# gatherer=dict(
|
||||
# type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
|
||||
# parser=dict(
|
||||
# type='ICDARTxtTextRecogAnnParser', separator=', ', format='img, text'), # noqa
|
||||
# type='ICDARTxtTextRecogAnnParser', separator=', ',
|
||||
# format='img, text'), # noqa
|
||||
# dumper=dict(type='JsonDumper'))
|
||||
|
||||
config_generator = dict(
|
||||
|
|
|
@ -51,7 +51,8 @@ data_converter = dict(
|
|||
type='TextRecogDataConverter',
|
||||
splits=['train', 'test'],
|
||||
data_root=data_root,
|
||||
gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
|
||||
gatherer=dict(
|
||||
type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
|
||||
parser=dict(type='ICDARTxtTextRecogAnnParser', encoding='utf-8-sig'),
|
||||
dumper=dict(type='JsonDumper'))
|
||||
|
||||
|
|
|
@ -37,7 +37,8 @@ data_converter = dict(
|
|||
type='TextRecogDataConverter',
|
||||
splits=['train', 'test'],
|
||||
data_root=data_root,
|
||||
gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
|
||||
gatherer=dict(
|
||||
type='mono_gather', train_ann='train.txt', test_ann='test.txt'),
|
||||
parser=dict(
|
||||
type='ICDARTxtTextRecogAnnParser',
|
||||
encoding='utf-8',
|
||||
|
|
|
@ -21,7 +21,8 @@ data_converter = dict(
|
|||
type='TextDetDataConverter',
|
||||
splits=['train', 'test'],
|
||||
data_root=data_root,
|
||||
gatherer=dict(type='mono_gather', mapping="f'{split}.xml'"),
|
||||
gatherer=dict(
|
||||
type='mono_gather', train_ann='train.xml', test_ann='test.xml'),
|
||||
parser=dict(type='SVTTextDetAnnParser', data_root=data_root),
|
||||
dumper=dict(type='JsonDumper'),
|
||||
delete=['annotations', 'svt'])
|
||||
|
|
|
@ -20,7 +20,7 @@ data_converter = dict(
|
|||
type='TextRecogDataConverter',
|
||||
splits=['test'],
|
||||
data_root=data_root,
|
||||
gatherer=dict(type='mono_gather', mapping="f'{split}.txt'"),
|
||||
gatherer=dict(type='mono_gather', test_ann='test.txt'),
|
||||
parser=dict(
|
||||
type='ICDARTxtTextRecogAnnParser', separator=' ', format='img text'),
|
||||
dumper=dict(type='JsonDumper'),
|
||||
|
|
|
@ -26,7 +26,10 @@ data_converter = dict(
|
|||
splits=['train', 'test'],
|
||||
data_root=data_root,
|
||||
gatherer=dict(
|
||||
type='mono_gather', mapping="f'{split}.txt'", ann_path=data_root),
|
||||
type='mono_gather',
|
||||
train_ann='train.txt',
|
||||
test_ann='test.txt',
|
||||
ann_path=data_root),
|
||||
parser=dict(type='WildreceiptKIEAnnParser', data_root=data_root),
|
||||
dumper=dict(type='WildreceiptOpensetDumper'),
|
||||
delete=['wildreceipt'])
|
||||
|
|
|
@ -72,23 +72,23 @@ class BaseDataConverter:
|
|||
files.
|
||||
"""
|
||||
# Convert and dump annotations to MMOCR format
|
||||
for split in self.splits:
|
||||
print(f'Parsing {split} split...')
|
||||
for self.current_split in self.splits:
|
||||
print(f'Parsing {self.current_split} split...')
|
||||
# Gather the info such as file names required by parser
|
||||
img_path = osp.join(self.data_root, self.img_dir, split)
|
||||
img_path = osp.join(self.data_root, self.img_dir,
|
||||
self.current_split)
|
||||
ann_path = osp.join(self.data_root, 'annotations')
|
||||
gatherer_args = dict(
|
||||
img_path=img_path, ann_path=ann_path, split=split)
|
||||
gatherer_args = dict(img_path=img_path, ann_path=ann_path)
|
||||
gatherer_args.update(self.gatherer_args)
|
||||
files = self.gatherer(**gatherer_args)
|
||||
# Convert dataset annotations to MMOCR format
|
||||
samples = self.parser.parse_files(files, split)
|
||||
print(f'Packing {split} annotations...')
|
||||
func = partial(self.pack_instance, split=split)
|
||||
samples = self.parser.parse_files(files, self.current_split)
|
||||
print(f'Packing {self.current_split} annotations...')
|
||||
func = partial(self.pack_instance, split=self.current_split)
|
||||
samples = track_parallel_progress(func, samples, nproc=self.nproc)
|
||||
samples = self.add_meta(samples)
|
||||
# Dump annotation files
|
||||
self.dumper.dump(samples, self.data_root, split)
|
||||
self.dumper.dump(samples, self.data_root, self.current_split)
|
||||
self.clean()
|
||||
|
||||
@abstractmethod
|
||||
|
@ -116,7 +116,11 @@ class BaseDataConverter:
|
|||
Dict: A dict contains the meta information and samples.
|
||||
"""
|
||||
|
||||
def mono_gather(self, ann_path: str, mapping: str, split: str,
|
||||
def mono_gather(self,
|
||||
ann_path: str,
|
||||
train_ann: Optional[str] = None,
|
||||
val_ann: Optional[str] = None,
|
||||
test_ann: Optional[str] = None,
|
||||
**kwargs) -> str:
|
||||
"""Gather the dataset file. Specifically for the case that only one
|
||||
annotation file is needed. For example,
|
||||
|
@ -127,16 +131,22 @@ class BaseDataConverter:
|
|||
|
||||
Args:
|
||||
anno_path (str): Path to the annotations.
|
||||
mapping (str): Mapping rule of the annotation names. For example,
|
||||
"f'{split}.json'" will return 'train.json' when the split is
|
||||
'train'.
|
||||
split (str): The current split.
|
||||
train_ann (str, optional): The annotation file name of the train
|
||||
split in the original dataset. Defaults to None.
|
||||
val_ann (str, optional): The annotation file name of the val split
|
||||
in the original dataset. Defaults to None.
|
||||
test_ann (str, optional): The annotation file name of the test
|
||||
split in the original dataset. Defaults to None.
|
||||
|
||||
Returns:
|
||||
str: Path to the annotation file.
|
||||
"""
|
||||
|
||||
return osp.join(ann_path, eval(mapping))
|
||||
ann_file = eval(f'{self.current_split}_ann')
|
||||
if ann_file is None:
|
||||
raise ValueError(
|
||||
f'{self.current_split}_ann must be specified in gatherer!')
|
||||
return osp.join(ann_path, ann_file)
|
||||
|
||||
def pair_gather(self, img_path: str, suffixes: List, rule: Sequence,
|
||||
**kwargs) -> List[Tuple]:
|
||||
|
|
Loading…
Reference in New Issue