mirror of https://github.com/open-mmlab/mmocr.git
* added sroie/metafile.yml
* add sample_anno.md and textdet.py
* modify and add all
* fix lint
* Update mmocr/datasets/preparers/data_converter.py
* address review comments
* add a comment explaining the try block in sroie_parser.py
* modify data_obtainer.py
* fix lint errors
* fix download link

Co-authored-by: Tong Gao <gaotongxiao@gmail.com>
parent b79382cd6b
commit 1413b5043a
dataset_zoo/sroie/metafile.yml (new file, 31 lines)
@@ -0,0 +1,31 @@
Name: 'Scanned Receipts OCR and Information Extraction'
Paper:
  Title: ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction
  URL: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8977955
  Venue: ICDAR
  Year: '2019'
  BibTeX: '@INPROCEEDINGS{8977955,
    author={Huang, Zheng and Chen, Kai and He, Jianhua and Bai, Xiang and Karatzas, Dimosthenis and Lu, Shijian and Jawahar, C. V.},
    booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
    title={ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction},
    year={2019},
    volume={},
    number={},
    pages={1516-1520},
    doi={10.1109/ICDAR.2019.00244}}'
Data:
  Website: https://rrc.cvc.uab.es/?ch=13
  Language:
    - English
  Scene:
    - Document
  Granularity:
    - Word
  Tasks:
    - textdet
    - textrecog
    - textspotting
  License:
    Type: CC BY 4.0
    Link: https://creativecommons.org/licenses/by/4.0/
  Format: .txt
dataset_zoo/sroie/sample_anno.md (new file, 9 lines)
@@ -0,0 +1,9 @@
**Text Detection, Text Recognition and Text Spotting**

```text
# x1,y1,x2,y2,x3,y3,x4,y4,trans

72,25,326,25,326,64,72,64,TAN WOON YANN
50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND
205,121,285,121,285,139,205,139,789417-W
```
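Each line therefore carries four corner points followed by the transcription, which may itself contain commas. A minimal, hypothetical sketch of reading one such line (this is not the project's loader, just an illustration of the column layout):

```python
line = '72,25,326,25,326,64,72,64,TAN WOON YANN'

# Split off exactly 8 coordinate fields; the remainder is the transcription,
# even if it contains commas of its own.
parts = line.split(',', 8)
poly = list(map(float, parts[:8]))  # x1, y1, x2, y2, x3, y3, x4, y4
text = parts[8]

print(poly)  # [72.0, 25.0, 326.0, 25.0, 326.0, 64.0, 72.0, 64.0]
print(text)  # TAN WOON YANN
```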
dataset_zoo/sroie/textdet.py (new file, 55 lines)
@@ -0,0 +1,55 @@
data_root = 'data/sroie'
cache_path = 'data/cache'

data_obtainer = dict(
    type='NaiveDataObtainer',
    cache_path=cache_path,
    data_root=data_root,
    files=[
        dict(
            url='https://download.openmmlab.com/mmocr/data/'
            'sroie/0325updated.task1train(626p).zip',
            save_name='0325updated.task1train(626p).zip',
            md5='16137490f6865caac75772b9111d348c',
            split=['train'],
            content=['image', 'annotation'],
            mapping=[[
                '0325updated/0325updated.task1train(626p)/*.jpg',
                'textdet_imgs/train'
            ],
                     [
                         '0325updated/0325updated.task1train(626p)/*.txt',
                         'annotations/train'
                     ]]),
        dict(
            url='https://download.openmmlab.com/mmocr/data/'
            'sroie/task1&2_test(361p).zip',
            save_name='task1&2_test(361p).zip',
            md5='1bde54705db0995c57a6e34cce437fea',
            split=['test'],
            content=['image'],
            mapping=[[
                'task1&2_test(361p)/fulltext_test(361p)', 'textdet_imgs/test'
            ]]),
        dict(
            url='https://download.openmmlab.com/mmocr/data/sroie/text.zip',
            save_name='text.zip',
            md5='8c534653f252ff4d3943fa27a956a74b',
            split=['test'],
            content=['annotation'],
            mapping=[['text', 'annotations/test']]),
    ])

data_converter = dict(
    type='TextDetDataConverter',
    splits=['train', 'test'],
    data_root=data_root,
    gatherer=dict(
        type='pair_gather',
        suffixes=['.jpg'],
        rule=[r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']),
    parser=dict(type='SROIETextDetAnnParser', encoding='utf-8-sig'),
    dumper=dict(type='JsonDumper'),
    delete=['text', 'task1&2_test(361p)', '0325updated', 'annotations'])

config_generator = dict(type='TextDetConfigGenerator', data_root=data_root)
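These files are consumed by MMOCR's Dataset Preparer; at the time of this commit the entry point should be something like `python tools/dataset_converters/prepare_dataset.py sroie --task textdet`, though the exact flags may differ. The `pair_gather` rule above pairs each image with its annotation file by rewriting the basename. A standalone regex sketch of that mapping (the file name is an SROIE-style example made up for illustration, not taken from the dataset):

```python
import re

# pair_gather rule copied from the config above
rule = [r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']

img_name = 'X00016469612.jpg'  # hypothetical SROIE-style image name
ann_name = re.sub(rule[0], rule[1], img_name)
print(ann_name)  # X00016469612.txt
```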
dataset_zoo/sroie/textrecog.py (new file, 5 lines)
@@ -0,0 +1,5 @@
_base_ = ['textdet.py']

data_converter = dict(type='TextRecogCropConverter')

config_generator = dict(type='TextRecogConfigGenerator')
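The `_base_` line reuses everything from `textdet.py`; MMEngine merges config dicts key by key, so only the converter and generator types change for the recognition task. A hedged sketch of inspecting the merged result (assumes `mmengine` is installed and the working directory is the repository root):

```python
from mmengine.config import Config

cfg = Config.fromfile('dataset_zoo/sroie/textrecog.py')

# Inherited from textdet.py
print(cfg.data_root)                 # data/sroie
print(cfg.data_converter['splits'])  # ['train', 'test']
# Overridden here
print(cfg.data_converter['type'])    # TextRecogCropConverter
```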
dataset_zoo/sroie/textspotting.py (new file, 5 lines)
@@ -0,0 +1,5 @@
_base_ = ['textdet.py']

data_converter = dict(type='TextSpottingDataConverter')

config_generator = dict(type='TextSpottingConfigGenerator')
mmocr/datasets/preparers/data_converter.py
@@ -177,6 +177,8 @@ class BaseDataConverter:
         """
         files = list()
         for file in list_files(img_path, suffixes):
+            if not re.match(rule[0], osp.basename(file)):
+                continue
             file2 = re.sub(rule[0], rule[1], osp.basename(file))
             file2 = file.replace(osp.basename(file), file2)
             file2 = file2.replace(self.img_dir, 'annotations')
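The new `re.match` guard makes `pair_gather` skip any image whose basename the rule cannot pair, instead of deriving a bogus annotation path for it. A small illustration using the SROIE rule from `textdet.py` (both file names are made up):

```python
import re

rule = [r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']

for name in ['X00016469612.jpg', 'X00016469612(1).jpg']:
    if not re.match(rule[0], name):
        print(f'{name}: skipped by the new guard')
        continue
    print(f'{name}: paired with {re.sub(rule[0], rule[1], name)}')

# X00016469612.jpg: paired with X00016469612.txt
# X00016469612(1).jpg: skipped by the new guard
```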
mmocr/datasets/preparers/parsers/__init__.py
@@ -4,6 +4,7 @@ from .funsd_parser import FUNSDTextDetAnnParser
 from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                                ICDARTxtTextRecogAnnParser)
 from .naf_parser import NAFAnnParser
+from .sroie_parser import SROIETextDetAnnParser
 from .svt_parser import SVTTextDetAnnParser
 from .totaltext_parser import TotaltextTextDetAnnParser
 from .wildreceipt_parser import WildreceiptKIEAnnParser
@@ -12,5 +13,5 @@ __all__ = [
     'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
     'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
     'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
-    'NAFAnnParser'
+    'SROIETextDetAnnParser', 'NAFAnnParser'
 ]
mmocr/datasets/preparers/parsers/sroie_parser.py (new file, 74 lines)
@@ -0,0 +1,74 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple

from mmocr.utils import bbox2poly
from ..data_preparer import DATA_PARSERS
from .base import BaseParser


@DATA_PARSERS.register_module()
class SROIETextDetAnnParser(BaseParser):
    """SROIE Txt Format Text Detection Annotation Parser.

    The original annotations of this dataset are stored in txt files in the
    following format:
    x1, y1, x2, y2, x3, y3, x4, y4, transcription

    Args:
        separator (str): The separator between each element in a line.
            Defaults to ','.
        ignore (str): The text to be ignored. Defaults to '###'.
        format (str): The format of the annotation. Defaults to
            'x1,y1,x2,y2,x3,y3,x4,y4,trans'.
        encoding (str): The encoding of the annotation file. Defaults to
            'utf-8-sig'.
        nproc (int): The number of processes to parse the annotation. Defaults
            to 1.
        remove_strs (List[str], Optional): Used to remove redundant strings in
            the transcription. Defaults to None.
        mode (str, optional): The mode of the box converter. Supported modes
            are 'xywh' and 'xyxy'. Defaults to None.
    """

    def __init__(self,
                 separator: str = ',',
                 ignore: str = '###',
                 format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
                 encoding: str = 'utf-8-sig',
                 nproc: int = 1,
                 remove_strs: Optional[List[str]] = None,
                 mode: str = None) -> None:
        self.sep = separator
        self.format = format
        self.encoding = encoding
        self.ignore = ignore
        self.mode = mode
        self.remove_strs = remove_strs
        super().__init__(nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Tuple:
        """Parse a single annotation file."""
        img_file, txt_file = file
        instances = list()
        try:
            # There might be illegal symbols in the annotation which the
            # loader cannot parse, so the whole file is parsed inside a try
            # block and silently skipped on failure.
            for anno in self.loader(txt_file, self.sep, self.format,
                                    self.encoding):
                anno = list(anno.values())
                if self.remove_strs is not None:
                    for strs in self.remove_strs:
                        for i in range(len(anno)):
                            if strs in anno[i]:
                                anno[i] = anno[i].replace(strs, '')
                poly = list(map(float, anno[0:-1]))
                if self.mode is not None:
                    poly = bbox2poly(poly, self.mode)
                    poly = poly.tolist()
                text = anno[-1]
                instances.append(
                    dict(poly=poly, text=text, ignore=text == self.ignore))
        except Exception:
            pass

        return img_file, instances
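Finally, a hedged usage sketch of the new parser (assumes this branch of mmocr is installed; the image name is made up and never opened, since `parse_file` only records it):

```python
import tempfile

from mmocr.datasets.preparers.parsers.sroie_parser import SROIETextDetAnnParser

# Write one SROIE-style annotation line to a temporary file.
with tempfile.NamedTemporaryFile(
        'w', suffix='.txt', delete=False, encoding='utf-8-sig') as f:
    f.write('72,25,326,25,326,64,72,64,TAN WOON YANN\n')
    txt_file = f.name

parser = SROIETextDetAnnParser()  # defaults: sep=',', utf-8-sig, ignore='###'
img_file, instances = parser.parse_file(('X00016469612.jpg', txt_file), 'train')
print(instances)
# Expected, roughly:
# [{'poly': [72.0, 25.0, 326.0, 25.0, 326.0, 64.0, 72.0, 64.0],
#   'text': 'TAN WOON YANN', 'ignore': False}]
```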