[Feature] CodeCamp #115 Add NAF to dataset preparer (#1609)

* add naf converter

* fix test

* update

* use fuzzy search instead

* update

* update
pull/1639/head^2
Qing Jiang 2022-12-29 15:19:49 +08:00 committed by GitHub
parent e3fd570687
commit b79382cd6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 290 additions and 3 deletions

1
.gitignore vendored
View File

@ -142,3 +142,4 @@ mmocr/.mim
workdirs/
.history/
.dev/
data/

View File

@ -0,0 +1,31 @@
Name: 'NAF'
Paper:
Title: Deep Visual Template-Free Form Parsing
URL: https://ieeexplore.ieee.org/abstract/document/8977962
Venue: ICDAR
Year: '2019'
BibTeX: '@inproceedings{davis2019deep,
title={Deep visual template-free form parsing},
author={Davis, Brian and Morse, Bryan and Cohen, Scott and Price, Brian and Tensmeyer, Chris},
booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
pages={134--141},
year={2019},
organization={IEEE}}'
Data:
Website: https://github.com/herobd/NAF_dataset
Language:
- English
Scene:
- Document
- Handwritten
Granularity:
- Word
- Line
Tasks:
- textrecog
- textdet
- textspotting
License:
Type: CDLA
Link: https://github.com/herobd/NAF_dataset/blob/master/LICENSE
Format: .json

View File

@ -0,0 +1,6 @@
**Text Detection/Recognition/Spotting**
```json
{"fieldBBs": [{"poly_points": [[435, 1406], [466, 1406], [466, 1439], [435, 1439]], "type": "fieldCheckBox", "id": "f0", "isBlank": 1}, {"poly_points": [[435, 1444], [469, 1444], [469, 1478], [435, 1478]], "type": "fieldCheckBox", "id": "f1", "isBlank": 1}],
"textBBs": [{"poly_points": [[1183, 1337], [2028, 1345], [2032, 1395], [1186, 1398]], "type": "text", "id": "t0"}, {"poly_points": [[492, 1336], [809, 1338], [809, 1379], [492, 1378]], "type": "text", "id": "t1"}, {"poly_points": [[512, 1375], [798, 1376], [798, 1405], [512, 1404]], "type": "textInst", "id": "t2"}], "imageFilename": "007182398_00026.jpg", "transcriptions": {"f0": "\u00bf\u00bf\u00bf \u00bf\u00bf\u00bf 18/1/49 \u00bf\u00bf\u00bf\u00bf\u00bf", "f1": "U.S. Navy 53rd. Naval Const. Batt.", "t0": "APPLICATION FOR HEADSTONE OR MARKER", "t1": "ORIGINAL"}}
```

View File

@ -0,0 +1,49 @@
# Dataset preparer config for the NAF dataset, text detection task.
# Source: https://github.com/herobd/NAF_dataset
data_root = 'data/naf'
cache_path = 'data/cache'
# Step 1: download the raw images and annotations and move them into place.
data_obtainer = dict(
    type='NaiveDataObtainer',
    cache_path=cache_path,
    data_root=data_root,
    files=[
        dict(
            url='https://github.com/herobd/NAF_dataset/releases/'
            'download/v1.0/labeled_images.tar.gz',
            save_name='naf_image.tar.gz',
            md5='6521cdc25c313a1f2928a16a77ad8f29',
            split=['train', 'test', 'val'],
            content=['image'],
            # All splits share one image pool; the gatherer moves images
            # into per-split folders according to data_split.json later.
            mapping=[['naf_image/labeled_images', 'temp_images/']]),
        dict(
            url='https://github.com/herobd/NAF_dataset/archive/'
            'refs/heads/master.zip',
            save_name='naf_anno.zip',
            md5='abf5af6266cc527d772231751bc884b3',
            split=['train', 'test', 'val'],
            content=['annotation'],
            mapping=[
                [
                    # One JSON annotation file per image, nested in
                    # per-group subdirectories (hence the glob).
                    'naf_anno/NAF_dataset-master/groups/**/*.json',
                    'annotations/'
                ],
                [
                    # Official train/valid/test split definition.
                    'naf_anno/NAF_dataset-master/train_valid_test_split.json',
                    'data_split.json'
                ]
            ]),
    ])
# Step 2: convert the original annotations into MMOCR's detection format.
data_converter = dict(
    type='TextDetDataConverter',
    splits=['train', 'test', 'val'],
    data_root=data_root,
    gatherer=dict(type='naf_gather'),
    parser=dict(type='NAFAnnParser', data_root=data_root, det=True),
    # Intermediate download/unpack artifacts removed after conversion.
    delete=['temp_images', 'data_split.json', 'annotations', 'naf_anno'],
    dumper=dict(type='JsonDumper'),
    nproc=1)
# Step 3: generate the dataset config consumed by model configs.
config_generator = dict(
    type='TextDetConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')])

View File

@ -0,0 +1,19 @@
# The transcriptions of the NAF dataset were produced by Tesseract OCR and
# are not accurate. The test/valid sets were hand corrected, but the train
# set was only partially corrected, so the labels are unreliable. It is
# better not to use them for recognition and text spotting.
_base_ = ['textdet.py']
data_root = 'data/naf'
# Override the detection converter: crop each text instance out of the
# full-page image for recognition training.
data_converter = dict(
    type='TextRecogCropConverter',
    parser=dict(
        # '¿' marks an unreadable character and '§' an illegible line in
        # the NAF transcriptions; such instances are ignored.
        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
        det=False),
    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])
config_generator = dict(
    type='TextRecogConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')])

View File

@ -0,0 +1,18 @@
# The transcriptions of the NAF dataset were produced by Tesseract OCR and
# are not accurate. The test/valid sets were hand corrected, but the train
# set was only partially corrected, so the labels are unreliable. It is
# better not to use them for recognition and text spotting.
_base_ = ['textdet.py']
data_root = 'data/naf'
# Override the detection converter to emit text spotting annotations.
data_converter = dict(
    type='TextSpottingDataConverter',
    parser=dict(
        # '¿' marks an unreadable character and '§' an illegible line in
        # the NAF transcriptions; such instances are ignored.
        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
        det=False),
    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])
config_generator = dict(
    type='TextSpottingConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textspotting_val.json', dataset_postfix='')])

View File

@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import os.path as osp
import re
import shutil
@ -61,6 +62,8 @@ class BaseDataConverter:
self.gatherer = self.pair_gather
elif gather_type == 'mono_gather':
self.gatherer = self.mono_gather
elif gather_type == 'naf_gather':
self.gatherer = self.naf_gather
else:
raise NotImplementedError
@ -181,11 +184,51 @@ class BaseDataConverter:
return files
def naf_gather(self, img_path: str, ann_path: str,
               **kwargs) -> List[Tuple]:
    """Gather dataset files for the NAF dataset.

    Specifically for the case that a split file contains the image names
    of the different splits. For example,

        img_001.jpg                           train: img_001.jpg
        img_002.jpg ---> data_split.json ---> test:  img_002.jpg
        img_003.jpg                           val:   img_003.jpg

    Images listed for the current split are moved out of the shared
    ``temp_images`` directory into ``img_path``.

    Args:
        img_path (str): Path to the images of the current split.
        ann_path (str): Path to the annotations.

    Returns:
        List[Tuple]: A list of tuples (img_path, ann_path).
    """
    import warnings
    split_file = osp.join(self.data_root, 'data_split.json')
    with open(split_file, 'r') as f:
        split_data = json.load(f)
    files = []
    # The split file names the split 'valid' while this converter
    # uses 'val'; rename the key accordingly.
    split_data['val'] = split_data.pop('valid')
    if not osp.exists(img_path):
        os.makedirs(img_path)
    for group in split_data[self.current_split]:
        for img_name in split_data[self.current_split][group]:
            src_img = osp.join(self.data_root, 'temp_images', img_name)
            dst_img = osp.join(img_path, img_name)
            if not osp.exists(src_img):
                # Bug fix: the original ``Warning(f'...')`` merely
                # instantiated an exception object and discarded it,
                # emitting nothing. Actually warn instead.
                warnings.warn(f'{src_img} does not exist!')
                continue
            # Move the image into the split-specific image directory.
            shutil.move(src_img, dst_img)
            # Each image has a JSON annotation file of the same stem.
            ann = osp.join(ann_path, img_name.replace('.jpg', '.json'))
            files.append((dst_img, ann))
    return files
def clean(self) -> None:
for d in self.delete:
delete_file = osp.join(self.data_root, d)
if osp.exists(delete_file):
shutil.rmtree(delete_file)
if osp.isdir(delete_file):
shutil.rmtree(delete_file)
else:
os.remove(delete_file)
@DATA_CONVERTERS.register_module()

View File

@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
import glob
import os
import os.path as osp
import shutil
@ -148,7 +149,14 @@ class NaiveDataObtainer:
for src, dst in mapping:
src = osp.join(self.data_root, src)
dst = osp.join(self.data_root, dst)
if osp.exists(src) and not osp.exists(dst):
if '*' in src:
mkdir_or_exist(dst)
for f in glob.glob(src):
if not osp.exists(osp.join(dst, osp.basename(f))):
shutil.move(f, dst)
elif osp.exists(src) and not osp.exists(dst):
shutil.move(src, dst)
def clean(self) -> None:

View File

@ -3,6 +3,7 @@ from .coco_parser import COCOTextDetAnnParser
from .funsd_parser import FUNSDTextDetAnnParser
from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
ICDARTxtTextRecogAnnParser)
from .naf_parser import NAFAnnParser
from .svt_parser import SVTTextDetAnnParser
from .totaltext_parser import TotaltextTextDetAnnParser
from .wildreceipt_parser import WildreceiptKIEAnnParser
@ -10,5 +11,6 @@ from .wildreceipt_parser import WildreceiptKIEAnnParser
__all__ = [
'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser'
'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
'NAFAnnParser'
]

View File

@ -0,0 +1,110 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
from typing import Dict, Iterator, List, Optional, Tuple

import numpy as np

from ..data_preparer import DATA_PARSERS
from .base import BaseParser
@DATA_PARSERS.register_module()
class NAFAnnParser(BaseParser):
    """NAF dataset parser.

    The original annotation format of this dataset is stored in json files,
    which have the following keys that will be used here:

    - 'textBBs': List of text bounding box objects
        - 'poly_points': list of [x, y] pairs, the box corners going
          top-left, top-right, bottom-right, bottom-left
        - 'id': id of the textBB, used to match with the text
    - 'transcriptions': Dict of transcription objects; use the 'id' key
      to match with the textBB.

    Some special characters are used in the transcription:
    "«text»" indicates that "text" had a strikethrough
    "¿" indicates the transcriber could not read a character
    "§" indicates the whole line or word was illegible
    "" (empty string) is if the field was blank

    Args:
        data_root (str): Path to the dataset root.
        ignore (list(str)): The text of the ignored instances.
            Defaults to ['#'].
        det (bool): Whether to parse the detection annotation. Defaults
            to True. If False, the parser will consider special cases in
            the NAF dataset where the transcription is not available.
        nproc (int): Number of processes to load the data. Defaults to 1.
    """

    def __init__(self,
                 data_root: str,
                 ignore: Optional[List[str]] = None,
                 det: bool = True,
                 nproc: int = 1) -> None:
        # Avoid a shared mutable default argument; fall back to ['#'],
        # matching the previous default behavior.
        self.ignore = ['#'] if ignore is None else ignore
        self.det = det
        super().__init__(data_root=data_root, nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Tuple:
        """Convert a single annotation file.

        Args:
            file (Tuple): A pair (image path, json annotation path).
            split (str): Name of the current split (unused here).

        Returns:
            Tuple: (img_file, instances), where ``instances`` is a list
            of dicts with keys 'poly', 'text' and 'ignore'.
        """
        img_file, json_file = file
        instances = list()
        for poly, text in self.loader(json_file):
            instances.append(
                dict(poly=poly, text=text, ignore=text in self.ignore))

        return img_file, instances

    def loader(self, file_path: str) -> Iterator[Tuple]:
        """Load the annotation of one NAF json file.

        Args:
            file_path (str): Path to the json file.

        Yields:
            Tuple: (poly, text), where ``poly`` is a flattened list of 8
            polygon coordinates and ``text`` is the transcription
            (``None`` in the detection-only case).
        """
        with open(file_path, 'r') as f:
            data = json.load(f)

        # 'textBBs' contains the printed texts of the table while
        # 'fieldBBs' contains the text filled in by humans.
        for box_type in ['textBBs', 'fieldBBs']:
            if not self.det and box_type == 'textBBs':
                # 'textBBs' is only used for the detection task.
                continue
            for anno in data[box_type]:
                if self.det:
                    # Skip blank fields.
                    if box_type == 'fieldBBs' and anno['type'] == 'blank':
                        continue
                    poly = np.array(anno['poly_points']).reshape(
                        1, 8)[0].tolist()
                    # The detection task only needs the polygon, so skip
                    # the transcription, which may be missing or empty.
                    text = None
                else:
                    # For tasks that need the transcription, the NAF
                    # dataset has several special cases:
                    # 1. The transcription for the whole image is not
                    #    available.
                    # 2. The transcription for a certain text is not
                    #    available.
                    # 3. If the length of the transcription is 0, it
                    #    should be ignored.
                    if 'transcriptions' not in data.keys():
                        break
                    if anno['id'] not in data['transcriptions'].keys():
                        continue

                    text = data['transcriptions'][anno['id']]
                    # Remove the unicode control character and the
                    # strikethrough markers «».
                    text = text.strip('\u202a')
                    text = text.replace('»', '').replace('«', '')
                    if len(text) == 0:
                        continue
                    poly = np.array(anno['poly_points']).reshape(
                        1, 8)[0].tolist()
                yield poly, text