[Fix] fix icdar data parse for text containing separator (#1587)

* [Fix] fix icdar data parse for text containing separator

* Update mmocr/datasets/preparers/parsers/base.py

Co-authored-by: Tong Gao <gaotongxiao@gmail.com>
pull/1240/head
liukuikun 2022-12-01 18:43:09 +08:00 committed by GitHub
parent d9356252af
commit b8c445b04f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 8 deletions

View File

@ -79,7 +79,8 @@ class BaseParser:
separator: str = ',',
format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
encoding='utf-8') -> Union[Dict, str]:
"""A basic loader designed for .txt format annotation.
"""A basic loader designed for .txt format annotation. It greedily
extracts information separated by separators.
Args:
file_path (str): Path to the txt file.
@ -96,5 +97,8 @@ class BaseParser:
with open(file_path, 'r', encoding=encoding) as f:
for line in f.readlines():
line = line.strip()
values = line.split(separator)
values = values[:len(keys) -
1] + [separator.join(values[len(keys) - 1:])]
if line:
yield dict(zip(keys, line.split(separator)))
yield dict(zip(keys, values))

View File

@ -18,6 +18,7 @@ class TestIC15Parsers(unittest.TestCase):
'377,117,463,117,465,130,378,130,Genaxis Theatre',
'493,115,519,115,519,131,493,131,[06]',
'374,155,409,155,409,170,374,170,###',
'374,155,409,155,409,170,374,170,100,000', ' '
]
ann_file = osp.join(self.root.name, 'ic15_det.txt')
list_to_file(ann_file, fake_anno)
@ -25,31 +26,34 @@ class TestIC15Parsers(unittest.TestCase):
def _create_dummy_ic15_recog(self):
fake_anno = [
'word_1.png, "Genaxis Theatre"',
'word_2.png, "[06]"',
'word_3.png, "62-03"',
'word_1.png, "Genaxis Theatre"', 'word_2.png, "[06]"',
'word_3.png, "62-03"', 'word_4.png, "62-,03"', ''
]
ann_file = osp.join(self.root.name, 'ic15_recog.txt')
list_to_file(ann_file, fake_anno)
return ann_file
def test_textdet_parsers(self):
parser = ICDARTxtTextDetAnnParser()
file = self._create_dummy_ic15_det()
parser = ICDARTxtTextDetAnnParser()
img, instances = parser.parse_file(file, 'train')
self.assertEqual(img, file[0])
self.assertEqual(len(instances), 3)
self.assertEqual(len(instances), 4)
self.assertIn('poly', instances[0])
self.assertIn('text', instances[0])
self.assertIn('ignore', instances[0])
self.assertEqual(instances[0]['text'], 'Genaxis Theatre')
self.assertEqual(instances[2]['ignore'], True)
self.assertEqual(instances[3]['text'], '100,000')
def test_textrecog_parsers(self):
parser = ICDARTxtTextRecogAnnParser()
file = self._create_dummy_ic15_recog()
samples = parser.parse_files(file, 'train')
self.assertEqual(len(samples), 3)
self.assertEqual(len(samples), 4)
img, text = samples[0]
self.assertEqual(img, 'word_1.png')
self.assertEqual(text, 'Genaxis Theatre')
img, text = samples[3]
self.assertEqual(text, '62-,03')