mirror of https://github.com/open-mmlab/mmocr.git
[Feature] Add Funsd to dataset preparer (#1550)
* add funsd * done * done Co-authored-by: gaotongxiao <gaotongxiao@gmail.com>pull/1636/head
parent
4396e8f5d8
commit
fb78c942d6
|
@ -0,0 +1,27 @@
|
|||
Name: 'FUNSD'
|
||||
Paper:
|
||||
Title: 'FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents'
|
||||
URL: https://arxiv.org/pdf/1905.13538.pdf
|
||||
Venue: ICDAR
|
||||
Year: '2019'
|
||||
BibTeX: '@inproceedings{jaume2019,
|
||||
title = {FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents},
|
||||
author = {Guillaume Jaume and Hazim Kemal Ekenel and Jean-Philippe Thiran},
|
||||
booktitle = {Accepted to ICDAR-OST},
|
||||
year = {2019}}'
|
||||
Data:
|
||||
Website: https://guillaumejaume.github.io/FUNSD/
|
||||
Language:
|
||||
- English
|
||||
Scene:
|
||||
- Document
|
||||
Granularity:
|
||||
- Word
|
||||
Tasks:
|
||||
- textdet
|
||||
- textrecog
|
||||
- textspotting
|
||||
License:
|
||||
Type: FUNSD License
|
||||
Link: https://guillaumejaume.github.io/FUNSD/work/
|
||||
Format: .json
|
|
@ -0,0 +1,73 @@
|
|||
**Text Detection/Recognition/Spotting**
|
||||
|
||||
```json
|
||||
{
|
||||
"form": [
|
||||
{
|
||||
"id": 0,
|
||||
"text": "Registration No.",
|
||||
"box": [
|
||||
94,
|
||||
169,
|
||||
191,
|
||||
186
|
||||
],
|
||||
"linking": [
|
||||
[
|
||||
0,
|
||||
1
|
||||
]
|
||||
],
|
||||
"label": "question",
|
||||
"words": [
|
||||
{
|
||||
"text": "Registration",
|
||||
"box": [
|
||||
94,
|
||||
169,
|
||||
168,
|
||||
186
|
||||
]
|
||||
},
|
||||
{
|
||||
"text": "No.",
|
||||
"box": [
|
||||
170,
|
||||
169,
|
||||
191,
|
||||
183
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "533",
|
||||
"box": [
|
||||
209,
|
||||
169,
|
||||
236,
|
||||
182
|
||||
],
|
||||
"label": "answer",
|
||||
"words": [
|
||||
{
|
||||
"box": [
|
||||
209,
|
||||
169,
|
||||
236,
|
||||
182
|
||||
],
|
||||
"text": "533"
|
||||
}
|
||||
],
|
||||
"linking": [
|
||||
[
|
||||
0,
|
||||
1
|
||||
]
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
|
@ -0,0 +1,38 @@
|
|||
# Dataset-preparer settings for the FUNSD text detection task.
data_root = 'data/funsd'
cache_path = 'data/cache'

# Download step: a single archive that is declared to carry both the images
# and the annotations of the train and test splits.
data_obtainer = {
    'type': 'NaiveDataObtainer',
    'cache_path': cache_path,
    'data_root': data_root,
    'files': [
        {
            'url': 'https://guillaumejaume.github.io/FUNSD/dataset.zip',
            'save_name': 'funsd.zip',
            'md5': 'e05de47de238aa343bf55d8807d659a9',
            'split': ['train', 'test'],
            'content': ['image', 'annotation'],
            # Each entry is a [source, destination] pair.
            # NOTE(review): presumably source is relative to the extracted
            # archive and destination to data_root - confirm against
            # NaiveDataObtainer.
            'mapping': [
                ['funsd/dataset/training_data/images', 'textdet_imgs/train'],
                ['funsd/dataset/testing_data/images', 'textdet_imgs/test'],
                [
                    'funsd/dataset/training_data/annotations',
                    'annotations/train',
                ],
                ['funsd/dataset/testing_data/annotations', 'annotations/test'],
            ],
        },
    ],
}

# Conversion step: pair every image with its annotation, parse it and dump
# the result.
data_converter = {
    'type': 'TextDetDataConverter',
    'splits': ['train', 'test'],
    'data_root': data_root,
    'gatherer': {
        'type': 'pair_gather',
        'suffixes': ['.png'],
        # Rewrites "<name>.png" into "<name>.json" to locate the annotation.
        'rule': [r'(\w+)\.png', r'\1.json'],
    },
    'parser': {'type': 'FUNSDTextDetAnnParser'},
    'dumper': {'type': 'JsonDumper'},
    # NOTE(review): presumably paths removed once conversion is done -
    # confirm against the converter.
    'delete': ['annotations', 'funsd'],
}

config_generator = {'type': 'TextDetConfigGenerator', 'data_root': data_root}
|
|
@ -0,0 +1,5 @@
|
|||
# FUNSD text recognition: names textdet.py as its base config and sets only
# the converter and the config generator below.
# NOTE(review): presumably the config loader merges these keys over the base
# entries of the same name - confirm against the loader.
_base_ = ['textdet.py']

data_converter = {'type': 'TextRecogCropConverter'}

config_generator = {'type': 'TextRecogConfigGenerator'}
|
|
@ -0,0 +1,5 @@
|
|||
# FUNSD text spotting: names textdet.py as its base config and sets only the
# converter and the config generator below.
# NOTE(review): presumably the config loader merges these keys over the base
# entries of the same name - confirm against the loader.
_base_ = ['textdet.py']

data_converter = {'type': 'TextSpottingDataConverter'}

config_generator = {'type': 'TextSpottingConfigGenerator'}
|
|
@ -1,5 +1,6 @@
|
|||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
from .coco_parser import COCOTextDetAnnParser
|
||||
from .funsd_parser import FUNSDTextDetAnnParser
|
||||
from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
|
||||
ICDARTxtTextRecogAnnParser)
|
||||
from .svt_parser import SVTTextDetAnnParser
|
||||
|
@ -9,5 +10,5 @@ from .wildreceipt_parser import WildreceiptKIEAnnParser
|
|||
__all__ = [
|
||||
'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
|
||||
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
|
||||
'COCOTextDetAnnParser', 'SVTTextDetAnnParser'
|
||||
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser'
|
||||
]
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import json
|
||||
from typing import Tuple
|
||||
|
||||
from mmocr.utils import bbox2poly
|
||||
from ..data_preparer import DATA_PARSERS
|
||||
from .base import BaseParser
|
||||
|
||||
|
||||
@DATA_PARSERS.register_module()
class FUNSDTextDetAnnParser(BaseParser):
    """FUNSD Text Detection Annotation Parser.

    See dataset_zoo/funsd/sample_anno.md for annotation example.

    Args:
        nproc (int): The number of processes to parse the annotation. Defaults
            to 1.
    """

    def __init__(self, nproc: int = 1) -> None:
        super().__init__(nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Tuple:
        """Parse single annotation.

        Args:
            file (Tuple): A pair of ``(img_file, json_file)`` paths.
            split (str): Name of the split being parsed. It is not used by
                this parser.

        Returns:
            Tuple: ``(img_file, instances)`` where ``instances`` is a list of
            dicts with the keys ``poly``, ``text`` and ``ignore``, one per
            word yielded by :meth:`loader`.
        """
        img_file, json_file = file
        instances = [
            dict(poly=poly, text=text, ignore=ignore)
            for poly, text, ignore in self.loader(json_file)
        ]

        return img_file, instances

    def loader(self, file_path: str):
        """Yield the word-level annotations stored in a FUNSD JSON file.

        Args:
            file_path (str): Path to the JSON annotation file.

        Yields:
            tuple: ``(poly, text, ignore)`` for every entry of every form
            item's ``words`` list. ``poly`` is ``bbox2poly(box).tolist()``,
            ``text`` is the word's transcription and ``ignore`` is True when
            the transcription is an empty string.
        """
        # JSON is UTF-8 encoded by specification; relying on the platform
        # default encoding breaks on systems whose locale is not UTF-8.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for form in data['form']:
            for word in form['words']:
                poly = bbox2poly(word['box']).tolist()
                text = word['text']
                ignore = len(text) == 0
                yield poly, text, ignore
|
Loading…
Reference in New Issue