[Fix] fix icdar data parse for text containing separator (#1587)

* [Fix] fix icdar data parse for text containing separator * Update mmocr/datasets/preparers/parsers/base.py Co-authored-by: Tong Gao <gaotongxiao@gmail.com>
2022-12-01 18:43:09 +08:00 · 2022-12-01 18:43:09 +08:00 · b8c445b04f
parent d9356252af
commit b8c445b04f
2 changed files with 16 additions and 8 deletions
--- a/mmocr/datasets/preparers/parsers/base.py
+++ b/mmocr/datasets/preparers/parsers/base.py
@@ -79,7 +79,8 @@ class BaseParser:
               separator: str = ',',
               format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
               encoding='utf-8') -> Union[Dict, str]:
-        """A basic loader designed for .txt format annotation.
+        """A basic loader designed for .txt format annotation. It greedily
+        extracts information separated by separators.

        Args:
            file_path (str): Path to the txt file.
@@ -96,5 +97,8 @@ class BaseParser:
        with open(file_path, 'r', encoding=encoding) as f:
            for line in f.readlines():
                line = line.strip()
+                values = line.split(separator)
+                values = values[:len(keys) -
+                                1] + [separator.join(values[len(keys) - 1:])]
                if line:
-                    yield dict(zip(keys, line.split(separator)))
+                    yield dict(zip(keys, values))
--- a/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py
+++ b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py
@@ -18,6 +18,7 @@ class TestIC15Parsers(unittest.TestCase):
            '377,117,463,117,465,130,378,130,Genaxis Theatre',
            '493,115,519,115,519,131,493,131,[06]',
            '374,155,409,155,409,170,374,170,###',
+            '374,155,409,155,409,170,374,170,100,000', ' '
        ]
        ann_file = osp.join(self.root.name, 'ic15_det.txt')
        list_to_file(ann_file, fake_anno)
@@ -25,31 +26,34 @@ class TestIC15Parsers(unittest.TestCase):

    def _create_dummy_ic15_recog(self):
        fake_anno = [
-            'word_1.png, "Genaxis Theatre"',
-            'word_2.png, "[06]"',
-            'word_3.png, "62-03"',
+            'word_1.png, "Genaxis Theatre"', 'word_2.png, "[06]"',
+            'word_3.png, "62-03"', 'word_4.png, "62-,03"', ''
        ]
        ann_file = osp.join(self.root.name, 'ic15_recog.txt')
        list_to_file(ann_file, fake_anno)
        return ann_file

    def test_textdet_parsers(self):
-        parser = ICDARTxtTextDetAnnParser()
        file = self._create_dummy_ic15_det()
+        parser = ICDARTxtTextDetAnnParser()
+
        img, instances = parser.parse_file(file, 'train')
        self.assertEqual(img, file[0])
-        self.assertEqual(len(instances), 3)
+        self.assertEqual(len(instances), 4)
        self.assertIn('poly', instances[0])
        self.assertIn('text', instances[0])
        self.assertIn('ignore', instances[0])
        self.assertEqual(instances[0]['text'], 'Genaxis Theatre')
        self.assertEqual(instances[2]['ignore'], True)
+        self.assertEqual(instances[3]['text'], '100,000')

    def test_textrecog_parsers(self):
        parser = ICDARTxtTextRecogAnnParser()
        file = self._create_dummy_ic15_recog()
        samples = parser.parse_files(file, 'train')
-        self.assertEqual(len(samples), 3)
+        self.assertEqual(len(samples), 4)
        img, text = samples[0]
        self.assertEqual(img, 'word_1.png')
        self.assertEqual(text, 'Genaxis Theatre')
+        img, text = samples[3]
+        self.assertEqual(text, '62-,03')