From 1413b5043a4627df395688152d2b22489c08c9d4 Mon Sep 17 00:00:00 2001
From: Ferry Huang <71176040+FerryHuang@users.noreply.github.com>
Date: Thu, 29 Dec 2022 16:52:51 +0800
Subject: [PATCH] [Feature] CodeCamp #116 Add SROIE to dataset preparer (#1639)

* added sroie/metafile.yml

* add sample_anno.md and textdet.py

* modify and add all

* fix lint

* fix lint

* fix lint

* Update mmocr/datasets/preparers/data_converter.py

Co-authored-by: Tong Gao <gaotongxiao@gmail.com>

* fix the reviewed

* add comment of try to sroie_parser.py

* modify data_obtainer.py

* fix lint errors

* fix download link

Co-authored-by: Tong Gao <gaotongxiao@gmail.com>
---
 dataset_zoo/sroie/metafile.yml                | 31 ++++++++
 dataset_zoo/sroie/sample_anno.md              |  9 +++
 dataset_zoo/sroie/textdet.py                  | 55 ++++++++++++++
 dataset_zoo/sroie/textrecog.py                |  5 ++
 dataset_zoo/sroie/textspotting.py             |  5 ++
 mmocr/datasets/preparers/data_converter.py    |  2 +
 mmocr/datasets/preparers/parsers/__init__.py  |  3 +-
 .../preparers/parsers/sroie_parser.py         | 74 +++++++++++++++++++
 8 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 dataset_zoo/sroie/metafile.yml
 create mode 100644 dataset_zoo/sroie/sample_anno.md
 create mode 100644 dataset_zoo/sroie/textdet.py
 create mode 100644 dataset_zoo/sroie/textrecog.py
 create mode 100644 dataset_zoo/sroie/textspotting.py
 create mode 100644 mmocr/datasets/preparers/parsers/sroie_parser.py

diff --git a/dataset_zoo/sroie/metafile.yml b/dataset_zoo/sroie/metafile.yml
new file mode 100644
index 00000000..804530eb
--- /dev/null
+++ b/dataset_zoo/sroie/metafile.yml
@@ -0,0 +1,31 @@
+Name: 'Scanned Receipts OCR and Information Extraction'
+Paper:
+  Title: ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction
+  URL: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8977955
+  Venue: ICDAR
+  Year: '2019'
+  BibTeX: '@INPROCEEDINGS{8977955,
+  author={Huang, Zheng and Chen, Kai and He, Jianhua and Bai, Xiang and Karatzas, Dimosthenis and Lu, Shijian and Jawahar, C. V.},
+  booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
+  title={ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction},
+  year={2019},
+  volume={},
+  number={},
+  pages={1516-1520},
+  doi={10.1109/ICDAR.2019.00244}}'
+Data:
+  Website: https://rrc.cvc.uab.es/?ch=13
+  Language:
+    - English
+  Scene:
+    - Document
+  Granularity:
+    - Word
+  Tasks:
+    - textdet
+    - textrecog
+    - textspotting
+  License:
+    Type: CC BY 4.0
+    Link: https://creativecommons.org/licenses/by/4.0/
+  Format: .txt
diff --git a/dataset_zoo/sroie/sample_anno.md b/dataset_zoo/sroie/sample_anno.md
new file mode 100644
index 00000000..86efab56
--- /dev/null
+++ b/dataset_zoo/sroie/sample_anno.md
@@ -0,0 +1,9 @@
+**Text Detection, Text Recognition and Text Spotting**
+
+```text
+# x1,y1,x2,y2,x3,y3,x4,y4,trans
+
+72,25,326,25,326,64,72,64,TAN WOON YANN
+50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND
+205,121,285,121,285,139,205,139,789417-W
+```
diff --git a/dataset_zoo/sroie/textdet.py b/dataset_zoo/sroie/textdet.py
new file mode 100644
index 00000000..78bb399a
--- /dev/null
+++ b/dataset_zoo/sroie/textdet.py
@@ -0,0 +1,55 @@
+data_root = 'data/sroie'  # shared by obtainer, converter and generator
+cache_path = 'data/cache'
+
+data_obtainer = dict(
+    type='NaiveDataObtainer',
+    cache_path=cache_path,
+    data_root=data_root,
+    files=[
+        dict(
+            url='https://download.openmmlab.com/mmocr/data/'
+            'sroie/0325updated.task1train(626p).zip',
+            save_name='0325updated.task1train(626p).zip',
+            md5='16137490f6865caac75772b9111d348c',
+            split=['train'],
+            content=['image', 'annotation'],
+            mapping=[[
+                '0325updated/0325updated.task1train(626p)/*.jpg',
+                'textdet_imgs/train'
+            ],
+                     [
+                         '0325updated/0325updated.task1train(626p)/*.txt',
+                         'annotations/train'
+                     ]]),
+        dict(
+            url='https://download.openmmlab.com/mmocr/data/'
+            'sroie/task1&2_test(361p).zip',
+            save_name='task1&2_test(361p).zip',
+            md5='1bde54705db0995c57a6e34cce437fea',
+            split=['test'],
+            content=['image'],  # test annotations come from text.zip
+            mapping=[[
+                'task1&2_test(361p)/fulltext_test(361p)', 'textdet_imgs/test'
+            ]]),
+        dict(
+            url='https://download.openmmlab.com/mmocr/data/sroie/text.zip',
+            save_name='text.zip',
+            md5='8c534653f252ff4d3943fa27a956a74b',
+            split=['test'],
+            content=['annotation'],
+            mapping=[['text', 'annotations/test']]),
+    ])
+
+data_converter = dict(
+    type='TextDetDataConverter',
+    splits=['train', 'test'],
+    data_root=data_root,
+    gatherer=dict(
+        type='pair_gather',  # pairs X<n>.jpg (any case) with X<n>.txt
+        suffixes=['.jpg'],
+        rule=[r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']),
+    parser=dict(type='SROIETextDetAnnParser', encoding='utf-8-sig'),
+    dumper=dict(type='JsonDumper'),
+    delete=['text', 'task1&2_test(361p)', '0325updated', 'annotations'])
+
+config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
diff --git a/dataset_zoo/sroie/textrecog.py b/dataset_zoo/sroie/textrecog.py
new file mode 100644
index 00000000..212c7e7d
--- /dev/null
+++ b/dataset_zoo/sroie/textrecog.py
@@ -0,0 +1,5 @@
+_base_ = ['textdet.py']  # base config; presumably inherits textdet settings
+
+data_converter = dict(type='TextRecogCropConverter')
+
+config_generator = dict(type='TextRecogConfigGenerator')
diff --git a/dataset_zoo/sroie/textspotting.py b/dataset_zoo/sroie/textspotting.py
new file mode 100644
index 00000000..88486337
--- /dev/null
+++ b/dataset_zoo/sroie/textspotting.py
@@ -0,0 +1,5 @@
+_base_ = ['textdet.py']  # base config; presumably inherits textdet settings
+
+data_converter = dict(type='TextSpottingDataConverter')
+
+config_generator = dict(type='TextSpottingConfigGenerator')
diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py
index a1d0b1f6..fc7177e6 100644
--- a/mmocr/datasets/preparers/data_converter.py
+++ b/mmocr/datasets/preparers/data_converter.py
@@ -177,6 +177,8 @@ class BaseDataConverter:
         """
         files = list()
         for file in list_files(img_path, suffixes):
+            if not re.match(rule[0], osp.basename(file)):
+                continue
             file2 = re.sub(rule[0], rule[1], osp.basename(file))
             file2 = file.replace(osp.basename(file), file2)
             file2 = file2.replace(self.img_dir, 'annotations')
diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py
index cdd08de1..d5d95cca 100644
--- a/mmocr/datasets/preparers/parsers/__init__.py
+++ b/mmocr/datasets/preparers/parsers/__init__.py
@@ -4,6 +4,7 @@ from .funsd_parser import FUNSDTextDetAnnParser
 from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                                ICDARTxtTextRecogAnnParser)
 from .naf_parser import NAFAnnParser
+from .sroie_parser import SROIETextDetAnnParser
 from .svt_parser import SVTTextDetAnnParser
 from .totaltext_parser import TotaltextTextDetAnnParser
 from .wildreceipt_parser import WildreceiptKIEAnnParser
@@ -12,5 +13,5 @@ __all__ = [
     'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
     'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
     'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
-    'NAFAnnParser'
+    'SROIETextDetAnnParser', 'NAFAnnParser'
 ]
diff --git a/mmocr/datasets/preparers/parsers/sroie_parser.py b/mmocr/datasets/preparers/parsers/sroie_parser.py
new file mode 100644
index 00000000..9f97ad43
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/sroie_parser.py
@@ -0,0 +1,74 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+from mmocr.utils import bbox2poly
+from ..data_preparer import DATA_PARSERS
+from .base import BaseParser
+
+
+@DATA_PARSERS.register_module()
+class SROIETextDetAnnParser(BaseParser):
+    """SROIE Txt Format Text Detection Annotation Parser.
+
+    The original annotation format of this dataset is stored in txt files,
+    which is formed as the following format:
+        x1, y1, x2, y2, x3, y3, x4, y4, transcription
+
+    Args:
+        separator (str): The separator between each element in a line. Defaults
+            to ','.
+        ignore (str): The text to be ignored. Defaults to '###'.
+        format (str): The format of the annotation. Defaults to
+            'x1,y1,x2,y2,x3,y3,x4,y4,trans'.
+        encoding (str): The encoding of the annotation file. Defaults to
+            'utf-8-sig'.
+        nproc (int): The number of processes to parse the annotation. Defaults
+            to 1.
+        remove_strs (List[str], Optional): Used to remove redundant strings in
+            the transcription. Defaults to None.
+        mode (str, optional): The mode of the box converter. Supported modes
+            are 'xywh' and 'xyxy'. Defaults to None.
+    """
+
+    def __init__(self,
+                 separator: str = ',',
+                 ignore: str = '###',
+                 format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
+                 encoding: str = 'utf-8-sig',
+                 nproc: int = 1,
+                 remove_strs: Optional[List[str]] = None,
+                 mode: Optional[str] = None) -> None:
+        self.sep = separator
+        self.format = format
+        self.encoding = encoding
+        self.ignore = ignore
+        self.mode = mode
+        self.remove_strs = remove_strs
+        super().__init__(nproc=nproc)
+
+    def parse_file(self, file: Tuple, split: str) -> Tuple:
+        """Parse single annotation into ``(img_file, instances)``.
+
+        Best-effort: annotations may contain illegal symbols the loader
+        cannot parse; a warning is issued and earlier instances are kept.
+        """
+        import warnings
+        img_file, txt_file = file
+        instances = list()
+        try:
+            for anno in self.loader(txt_file, self.sep, self.format,
+                                    self.encoding):
+                anno = list(anno.values())
+                for strs in self.remove_strs or []:
+                    anno = [item.replace(strs, '') for item in anno]
+                poly = list(map(float, anno[0:-1]))
+                if self.mode is not None:
+                    poly = bbox2poly(poly, self.mode)
+                    poly = poly.tolist()
+                text = anno[-1]
+                instances.append(
+                    dict(poly=poly, text=text, ignore=text == self.ignore))
+        except Exception as e:
+            warnings.warn(f'Failed to fully parse {txt_file}: {e!r}')
+
+        return img_file, instances