From b8c445b04fb3db37a8ffb9a1f408a99518913426 Mon Sep 17 00:00:00 2001
From: liukuikun <24622904+Harold-lkk@users.noreply.github.com>
Date: Thu, 1 Dec 2022 18:43:09 +0800
Subject: [PATCH] [Fix] fix icdar data parse for text containing separator
 (#1587)

* [Fix] fix icdar data parse for text containing separator

* Update mmocr/datasets/preparers/parsers/base.py

Co-authored-by: Tong Gao <gaotongxiao@gmail.com>
---
 mmocr/datasets/preparers/parsers/base.py         |  8 ++++++--
 .../test_parsers/test_icdar_txt_parsers.py       | 16 ++++++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/mmocr/datasets/preparers/parsers/base.py b/mmocr/datasets/preparers/parsers/base.py
index c910e225..58bc35b1 100644
--- a/mmocr/datasets/preparers/parsers/base.py
+++ b/mmocr/datasets/preparers/parsers/base.py
@@ -79,7 +79,8 @@ class BaseParser:
                separator: str = ',',
                format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
                encoding='utf-8') -> Union[Dict, str]:
-        """A basic loader designed for .txt format annotation.
+        """A basic loader designed for .txt format annotation. It greedily
+        extracts information separated by separators.
 
         Args:
             file_path (str): Path to the txt file.
@@ -96,5 +97,8 @@ class BaseParser:
         with open(file_path, 'r', encoding=encoding) as f:
             for line in f.readlines():
                 line = line.strip()
+                values = line.split(separator)
+                values = values[:len(keys) -
+                                1] + [separator.join(values[len(keys) - 1:])]
                 if line:
-                    yield dict(zip(keys, line.split(separator)))
+                    yield dict(zip(keys, values))
diff --git a/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py
index e12820a6..02a9848d 100644
--- a/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py
+++ b/tests/test_datasets/test_preparers/test_parsers/test_icdar_txt_parsers.py
@@ -18,6 +18,7 @@ class TestIC15Parsers(unittest.TestCase):
             '377,117,463,117,465,130,378,130,Genaxis Theatre',
             '493,115,519,115,519,131,493,131,[06]',
             '374,155,409,155,409,170,374,170,###',
+            '374,155,409,155,409,170,374,170,100,000', ' '
         ]
         ann_file = osp.join(self.root.name, 'ic15_det.txt')
         list_to_file(ann_file, fake_anno)
@@ -25,31 +26,34 @@ class TestIC15Parsers(unittest.TestCase):
 
     def _create_dummy_ic15_recog(self):
         fake_anno = [
-            'word_1.png, "Genaxis Theatre"',
-            'word_2.png, "[06]"',
-            'word_3.png, "62-03"',
+            'word_1.png, "Genaxis Theatre"', 'word_2.png, "[06]"',
+            'word_3.png, "62-03"', 'word_4.png, "62-,03"', ''
         ]
         ann_file = osp.join(self.root.name, 'ic15_recog.txt')
         list_to_file(ann_file, fake_anno)
         return ann_file
 
     def test_textdet_parsers(self):
-        parser = ICDARTxtTextDetAnnParser()
         file = self._create_dummy_ic15_det()
+        parser = ICDARTxtTextDetAnnParser()
+
         img, instances = parser.parse_file(file, 'train')
         self.assertEqual(img, file[0])
-        self.assertEqual(len(instances), 3)
+        self.assertEqual(len(instances), 4)
         self.assertIn('poly', instances[0])
         self.assertIn('text', instances[0])
         self.assertIn('ignore', instances[0])
         self.assertEqual(instances[0]['text'], 'Genaxis Theatre')
         self.assertEqual(instances[2]['ignore'], True)
+        self.assertEqual(instances[3]['text'], '100,000')
 
     def test_textrecog_parsers(self):
         parser = ICDARTxtTextRecogAnnParser()
         file = self._create_dummy_ic15_recog()
         samples = parser.parse_files(file, 'train')
-        self.assertEqual(len(samples), 3)
+        self.assertEqual(len(samples), 4)
         img, text = samples[0]
         self.assertEqual(img, 'word_1.png')
         self.assertEqual(text, 'Genaxis Theatre')
+        img, text = samples[3]
+        self.assertEqual(text, '62-,03')