mirror of https://github.com/open-mmlab/mmocr.git
[Fix] fix icdar data parse for text containing separator (#1587)
* [Fix] fix icdar data parse for text containing separator * Update mmocr/datasets/preparers/parsers/base.py Co-authored-by: Tong Gao <gaotongxiao@gmail.com> pull/1240/head
parent
d9356252af
commit
b8c445b04f
|
@ -79,7 +79,8 @@ class BaseParser:
|
|||
separator: str = ',',
|
||||
format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
|
||||
encoding='utf-8') -> Union[Dict, str]:
|
||||
"""A basic loader designed for .txt format annotation.
|
||||
"""A basic loader designed for .txt format annotation. It greedily
|
||||
extracts information separated by separators.
|
||||
|
||||
Args:
|
||||
file_path (str): Path to the txt file.
|
||||
|
@ -96,5 +97,8 @@ class BaseParser:
|
|||
with open(file_path, 'r', encoding=encoding) as f:
|
||||
for line in f.readlines():
|
||||
line = line.strip()
|
||||
values = line.split(separator)
|
||||
values = values[:len(keys) -
|
||||
1] + [separator.join(values[len(keys) - 1:])]
|
||||
if line:
|
||||
yield dict(zip(keys, line.split(separator)))
|
||||
yield dict(zip(keys, values))
|
||||
|
|
|
@ -18,6 +18,7 @@ class TestIC15Parsers(unittest.TestCase):
|
|||
'377,117,463,117,465,130,378,130,Genaxis Theatre',
|
||||
'493,115,519,115,519,131,493,131,[06]',
|
||||
'374,155,409,155,409,170,374,170,###',
|
||||
'374,155,409,155,409,170,374,170,100,000', ' '
|
||||
]
|
||||
ann_file = osp.join(self.root.name, 'ic15_det.txt')
|
||||
list_to_file(ann_file, fake_anno)
|
||||
|
@ -25,31 +26,34 @@ class TestIC15Parsers(unittest.TestCase):
|
|||
|
||||
def _create_dummy_ic15_recog(self):
|
||||
fake_anno = [
|
||||
'word_1.png, "Genaxis Theatre"',
|
||||
'word_2.png, "[06]"',
|
||||
'word_3.png, "62-03"',
|
||||
'word_1.png, "Genaxis Theatre"', 'word_2.png, "[06]"',
|
||||
'word_3.png, "62-03"', 'word_4.png, "62-,03"', ''
|
||||
]
|
||||
ann_file = osp.join(self.root.name, 'ic15_recog.txt')
|
||||
list_to_file(ann_file, fake_anno)
|
||||
return ann_file
|
||||
|
||||
def test_textdet_parsers(self):
|
||||
parser = ICDARTxtTextDetAnnParser()
|
||||
file = self._create_dummy_ic15_det()
|
||||
parser = ICDARTxtTextDetAnnParser()
|
||||
|
||||
img, instances = parser.parse_file(file, 'train')
|
||||
self.assertEqual(img, file[0])
|
||||
self.assertEqual(len(instances), 3)
|
||||
self.assertEqual(len(instances), 4)
|
||||
self.assertIn('poly', instances[0])
|
||||
self.assertIn('text', instances[0])
|
||||
self.assertIn('ignore', instances[0])
|
||||
self.assertEqual(instances[0]['text'], 'Genaxis Theatre')
|
||||
self.assertEqual(instances[2]['ignore'], True)
|
||||
self.assertEqual(instances[3]['text'], '100,000')
|
||||
|
||||
def test_textrecog_parsers(self):
|
||||
parser = ICDARTxtTextRecogAnnParser()
|
||||
file = self._create_dummy_ic15_recog()
|
||||
samples = parser.parse_files(file, 'train')
|
||||
self.assertEqual(len(samples), 3)
|
||||
self.assertEqual(len(samples), 4)
|
||||
img, text = samples[0]
|
||||
self.assertEqual(img, 'word_1.png')
|
||||
self.assertEqual(text, 'Genaxis Theatre')
|
||||
img, text = samples[3]
|
||||
self.assertEqual(text, '62-,03')
|
||||
|
|
Loading…
Reference in New Issue