# Dataset preparer config for FUNSD text detection.
# Shared paths referenced by the components below.
data_root = 'data/funsd'
cache_path = 'data/cache'

# Download step: a single archive plus a list of [source, destination] path
# pairs.
# NOTE(review): the pairs look like [path inside the extracted archive,
# destination relative to data_root] -- confirm against NaiveDataObtainer.
data_obtainer = {
    'type': 'NaiveDataObtainer',
    'cache_path': cache_path,
    'data_root': data_root,
    'files': [
        {
            'url': 'https://guillaumejaume.github.io/FUNSD/dataset.zip',
            'save_name': 'funsd.zip',
            'md5': 'e05de47de238aa343bf55d8807d659a9',
            'split': ['train', 'test'],
            'content': ['image', 'annotation'],
            'mapping': [
                ['funsd/dataset/training_data/images', 'textdet_imgs/train'],
                ['funsd/dataset/testing_data/images', 'textdet_imgs/test'],
                [
                    'funsd/dataset/training_data/annotations',
                    'annotations/train',
                ],
                ['funsd/dataset/testing_data/annotations', 'annotations/test'],
            ],
        },
    ],
}

# Conversion step. The gatherer rule is a (pattern, replacement) regex pair
# that rewrites an image name ``<stem>.png`` into ``<stem>.json``.
data_converter = {
    'type': 'TextDetDataConverter',
    'splits': ['train', 'test'],
    'data_root': data_root,
    'gatherer': {
        'type': 'pair_gather',
        'suffixes': ['.png'],
        'rule': [r'(\w+)\.png', r'\1.json'],
    },
    'parser': {'type': 'FUNSDTextDetAnnParser'},
    'dumper': {'type': 'JsonDumper'},
    'delete': ['annotations', 'funsd'],
}

config_generator = {'type': 'TextDetConfigGenerator', 'data_root': data_root}
@DATA_PARSERS.register_module()
class FUNSDTextDetAnnParser(BaseParser):
    """FUNSD Text Detection Annotation Parser.

    See dataset_zoo/funsd/sample_anno.md for an annotation example.

    Args:
        nproc (int): The number of processes to parse the annotation. Defaults
            to 1.
    """

    def __init__(self, nproc: int = 1) -> None:
        super().__init__(nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Tuple:
        """Parse a single image/annotation pair.

        Args:
            file (Tuple): A ``(img_file, json_file)`` pair.
            split (str): The dataset split. Not used by this parser.

        Returns:
            Tuple: ``(img_file, instances)``, where ``instances`` is a list of
            dicts with the keys ``poly``, ``text`` and ``ignore``, one per
            word yielded by :meth:`loader`.
        """
        img_file, json_file = file
        instances = [
            dict(poly=poly, text=text, ignore=ignore)
            for poly, text, ignore in self.loader(json_file)
        ]

        return img_file, instances

    def loader(self, file_path: str):
        """Yield every word-level annotation stored in a FUNSD JSON file.

        Args:
            file_path (str): Path to the annotation file. It must contain a
                top-level ``form`` list whose items each hold a ``words``
                list of ``{'box': ..., 'text': ...}`` entries.

        Yields:
            tuple: ``(poly, text, ignore)``. ``poly`` is the word's ``box``
            passed through ``bbox2poly`` and converted to a plain list,
            ``text`` is the transcription and ``ignore`` is True when the
            transcription is empty.
        """
        # JSON text is UTF-8; decoding it explicitly keeps the parser from
        # depending on the platform's locale encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for form in data['form']:
            for word in form['words']:
                poly = bbox2poly(word['box']).tolist()
                text = word['text']
                # Words without a transcription are kept but flagged.
                ignore = not text
                yield poly, text, ignore