From b8c445b04fb3db37a8ffb9a1f408a99518913426 Mon Sep 17 00:00:00 2001 From: liukuikun <24622904+Harold-lkk@users.noreply.github.com> Date: Thu, 1 Dec 2022 18:43:09 +0800 Subject: [PATCH] [Fix] fix icdar data parse for text containing separator (#1587) * [Fix] fix icdar data parse for text containing separator * Update mmocr/datasets/preparers/parsers/base.py Co-authored-by: Tong Gao --- mmocr/datasets/preparers/parsers/base.py | 8 ++++++-- .../test_parsers/test_icdar_txt_parsers.py | 16 ++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/mmocr/datasets/preparers/parsers/base.py b/mmocr/datasets/preparers/parsers/base.py index c910e225..58bc35b1 100644 --- a/mmocr/datasets/preparers/parsers/base.py +++ b/mmocr/datasets/preparers/parsers/base.py @@ -79,7 +79,8 @@ class BaseParser: separator: str = ',', format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans', encoding='utf-8') -> Union[Dict, str]: - """A basic loader designed for .txt format annotation. + """A basic loader designed for .txt format annotation. It greedily + extracts information separated by separators. Args: file_path (str): Path to the txt file. 
@@ -96,5 +97,8 @@ class BaseParser: with open(file_path, 'r', encoding=encoding) as f: for line in f.readlines(): line = line.strip() + values = line.split(separator) + values = values[:len(keys) - + 1] + [separator.join(values[len(keys) - 1:])] if line: - yield dict(zip(keys, line.split(separator))) + yield dict(zip(keys, values)) diff --git a/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py index e12820a6..02a9848d 100644 --- a/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py +++ b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py @@ -18,6 +18,7 @@ class TestIC15Parsers(unittest.TestCase): '377,117,463,117,465,130,378,130,Genaxis Theatre', '493,115,519,115,519,131,493,131,[06]', '374,155,409,155,409,170,374,170,###', + '374,155,409,155,409,170,374,170,100,000', ' ' ] ann_file = osp.join(self.root.name, 'ic15_det.txt') list_to_file(ann_file, fake_anno) @@ -25,31 +26,34 @@ class TestIC15Parsers(unittest.TestCase): def _create_dummy_ic15_recog(self): fake_anno = [ - 'word_1.png, "Genaxis Theatre"', - 'word_2.png, "[06]"', - 'word_3.png, "62-03"', + 'word_1.png, "Genaxis Theatre"', 'word_2.png, "[06]"', + 'word_3.png, "62-03"', 'word_4.png, "62-,03"', '' ] ann_file = osp.join(self.root.name, 'ic15_recog.txt') list_to_file(ann_file, fake_anno) return ann_file def test_textdet_parsers(self): - parser = ICDARTxtTextDetAnnParser() file = self._create_dummy_ic15_det() + parser = ICDARTxtTextDetAnnParser() + img, instances = parser.parse_file(file, 'train') self.assertEqual(img, file[0]) - self.assertEqual(len(instances), 3) + self.assertEqual(len(instances), 4) self.assertIn('poly', instances[0]) self.assertIn('text', instances[0]) self.assertIn('ignore', instances[0]) self.assertEqual(instances[0]['text'], 'Genaxis Theatre') self.assertEqual(instances[2]['ignore'], True) + self.assertEqual(instances[3]['text'], '100,000') 
def test_textrecog_parsers(self): parser = ICDARTxtTextRecogAnnParser() file = self._create_dummy_ic15_recog() samples = parser.parse_files(file, 'train') - self.assertEqual(len(samples), 3) + self.assertEqual(len(samples), 4) img, text = samples[0] self.assertEqual(img, 'word_1.png') self.assertEqual(text, 'Genaxis Theatre') + img, text = samples[3] + self.assertEqual(text, '62-,03')