mirror of https://github.com/open-mmlab/mmocr.git
* add naf converter * fix test * update * use fuzzy search instead * update * update
pull/1639/head^2
parent e3fd570687
commit b79382cd6b
@@ -142,3 +142,4 @@ mmocr/.mim
workdirs/
.history/
.dev/
data/
@ -0,0 +1,31 @@
|
|||
Name: 'NAF'
|
||||
Paper:
|
||||
Title: Deep Visual Template-Free Form Parsing
|
||||
URL: https://ieeexplore.ieee.org/abstract/document/8977962
|
||||
Venue: ICDAR
|
||||
Year: '2019'
|
||||
BibTeX: '@inproceedings{davis2019deep,
|
||||
title={Deep visual template-free form parsing},
|
||||
author={Davis, Brian and Morse, Bryan and Cohen, Scott and Price, Brian and Tensmeyer, Chris},
|
||||
booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
|
||||
pages={134--141},
|
||||
year={2019},
|
||||
organization={IEEE}}'
|
||||
Data:
|
||||
Website: https://github.com/herobd/NAF_dataset
|
||||
Language:
|
||||
- English
|
||||
Scene:
|
||||
- Document
|
||||
- Handwritten
|
||||
Granularity:
|
||||
- Word
|
||||
- Line
|
||||
Tasks:
|
||||
- textrecog
|
||||
- textdet
|
||||
- textspotting
|
||||
License:
|
||||
Type: CDLA
|
||||
Link: https://github.com/herobd/NAF_dataset/blob/master/LICENSE
|
||||
Format: .json
|
|
@@ -0,0 +1,6 @@
**Text Detection/Recognition/Spotting**

```json
{"fieldBBs": [{"poly_points": [[435, 1406], [466, 1406], [466, 1439], [435, 1439]], "type": "fieldCheckBox", "id": "f0", "isBlank": 1}, {"poly_points": [[435, 1444], [469, 1444], [469, 1478], [435, 1478]], "type": "fieldCheckBox", "id": "f1", "isBlank": 1}],
"textBBs": [{"poly_points": [[1183, 1337], [2028, 1345], [2032, 1395], [1186, 1398]], "type": "text", "id": "t0"}, {"poly_points": [[492, 1336], [809, 1338], [809, 1379], [492, 1378]], "type": "text", "id": "t1"}, {"poly_points": [[512, 1375], [798, 1376], [798, 1405], [512, 1404]], "type": "textInst", "id": "t2"}], "imageFilename": "007182398_00026.jpg", "transcriptions": {"f0": "\u00bf\u00bf\u00bf \u00bf\u00bf\u00bf 18/1/49 \u00bf\u00bf\u00bf\u00bf\u00bf", "f1": "U.S. Navy 53rd. Naval Const. Batt.", "t0": "APPLICATION FOR HEADSTONE OR MARKER", "t1": "ORIGINAL"}}
```
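The two lines inside the fence are a single annotation object split for display. A minimal sketch to inspect the keys the preparer relies on, assuming the sample is saved locally as `sample.json`:

```python
import json

# Assumes the object above is saved locally as 'sample.json'.
with open('sample.json', 'r') as f:
    anno = json.load(f)

for box in anno['textBBs'] + anno['fieldBBs']:
    # Transcriptions are keyed by box id and may be absent (e.g. 't2').
    text = anno['transcriptions'].get(box['id'])
    # Flatten the four [x, y] corners to [x1, y1, ..., x4, y4].
    poly = [coord for point in box['poly_points'] for coord in point]
    print(box['id'], poly, repr(text))
```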
@@ -0,0 +1,49 @@
data_root = 'data/naf'
cache_path = 'data/cache'

data_obtainer = dict(
    type='NaiveDataObtainer',
    cache_path=cache_path,
    data_root=data_root,
    files=[
        dict(
            url='https://github.com/herobd/NAF_dataset/releases/'
            'download/v1.0/labeled_images.tar.gz',
            save_name='naf_image.tar.gz',
            md5='6521cdc25c313a1f2928a16a77ad8f29',
            split=['train', 'test', 'val'],
            content=['image'],
            mapping=[['naf_image/labeled_images', 'temp_images/']]),
        dict(
            url='https://github.com/herobd/NAF_dataset/archive/'
            'refs/heads/master.zip',
            save_name='naf_anno.zip',
            md5='abf5af6266cc527d772231751bc884b3',
            split=['train', 'test', 'val'],
            content=['annotation'],
            mapping=[
                [
                    'naf_anno/NAF_dataset-master/groups/**/*.json',
                    'annotations/'
                ],
                [
                    'naf_anno/NAF_dataset-master/train_valid_test_split.json',
                    'data_split.json'
                ]
            ]),
    ])

data_converter = dict(
    type='TextDetDataConverter',
    splits=['train', 'test', 'val'],
    data_root=data_root,
    gatherer=dict(type='naf_gather'),
    parser=dict(type='NAFAnnParser', data_root=data_root, det=True),
    delete=['temp_images', 'data_split.json', 'annotations', 'naf_anno'],
    dumper=dict(type='JsonDumper'),
    nproc=1)

config_generator = dict(
    type='TextDetConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')])
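The `md5` values above let the obtainer verify cached archives before unpacking. A minimal sketch of such a checksum test with plain `hashlib` (not the mmocr implementation):

```python
import hashlib

def check_md5(path: str, expected: str, chunk_size: int = 1 << 20) -> bool:
    # Hash the archive in chunks and compare against the expected digest.
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest() == expected

# e.g. check_md5('data/cache/naf_image.tar.gz',
#                '6521cdc25c313a1f2928a16a77ad8f29')
```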
@@ -0,0 +1,19 @@
# The transcriptions of the NAF dataset come from Tesseract OCR and are not
# accurate. The test/valid sets were hand-corrected, but the train set was
# only partially corrected, so its labels are unreliable. It is better not
# to use them for recognition and text spotting.

_base_ = ['textdet.py']
data_root = 'data/naf'

data_converter = dict(
    type='TextRecogCropConverter',
    parser=dict(
        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
        det=False),
    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])

config_generator = dict(
    type='TextRecogConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')])
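`TextRecogCropConverter` reuses the detection polygons and crops each instance out of the page image. A rough standalone sketch of that cropping step, assuming Pillow and an axis-aligned bounding rectangle (the actual converter may differ):

```python
from PIL import Image  # assumes Pillow is available

def crop_instance(img: Image.Image, poly: list) -> Image.Image:
    # Take the axis-aligned bounding rectangle of the 8-value polygon
    # [x1, y1, x2, y2, x3, y3, x4, y4].
    xs, ys = poly[0::2], poly[1::2]
    return img.crop((min(xs), min(ys), max(xs), max(ys)))

# e.g. with the sample annotation's 't1' box:
# crop_instance(Image.open('007182398_00026.jpg'),
#               [492, 1336, 809, 1338, 809, 1379, 492, 1378])
```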
@@ -0,0 +1,18 @@
# The transcriptions of the NAF dataset come from Tesseract OCR and are not
# accurate. The test/valid sets were hand-corrected, but the train set was
# only partially corrected, so its labels are unreliable. It is better not
# to use them for recognition and text spotting.

_base_ = ['textdet.py']
data_root = 'data/naf'
data_converter = dict(
    type='TextSpottingDataConverter',
    parser=dict(
        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
        det=False),
    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])

config_generator = dict(
    type='TextSpottingConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textspotting_val.json', dataset_postfix='')])
@@ -1,5 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import os.path as osp
import re
import shutil
import warnings
@@ -61,6 +62,8 @@ class BaseDataConverter:
            self.gatherer = self.pair_gather
        elif gather_type == 'mono_gather':
            self.gatherer = self.mono_gather
        elif gather_type == 'naf_gather':
            self.gatherer = self.naf_gather
        else:
            raise NotImplementedError
@@ -181,11 +184,51 @@ class BaseDataConverter:

        return files

    def naf_gather(self, img_path: str, ann_path: str,
                   **kwargs) -> List[Tuple]:
        """Gather the dataset files from the NAF dataset. Specifically for
        the case where a split file contains the names of the images in
        each split. For example,

            img_001.jpg                           train: img_001.jpg
            img_002.jpg ---> data_split.json ---> test: img_002.jpg
            img_003.jpg                           val: img_003.jpg

        Args:
            img_path (str): Path to the images.
            ann_path (str): Path to the annotations.

        Returns:
            List[Tuple]: A list of tuples (img_path, ann_path).
        """
        split_file = osp.join(self.data_root, 'data_split.json')
        with open(split_file, 'r') as f:
            split_data = json.load(f)
        files = []
        # Rename the key 'valid' to 'val' to match the converter's splits
        split_data['val'] = split_data.pop('valid')
        if not osp.exists(img_path):
            os.makedirs(img_path)
        for group in split_data[self.current_split]:
            for img_name in split_data[self.current_split][group]:
                src_img = osp.join(self.data_root, 'temp_images', img_name)
                dst_img = osp.join(img_path, img_name)
                if not osp.exists(src_img):
                    warnings.warn(f'{src_img} does not exist!')
                    continue
                # Move the image to its split directory
                shutil.move(src_img, dst_img)
                ann = osp.join(ann_path, img_name.replace('.jpg', '.json'))
                files.append((dst_img, ann))
        return files
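For reference, a standalone sketch of the split remapping performed above, assuming `data_split.json` keeps the NAF repository's split → group → image-name layout (paths are illustrative):

```python
import json
import os.path as osp

# Illustrative paths; 'data/naf/annotations' is where the obtainer's
# mapping step placed the per-image json files.
with open('data/naf/data_split.json', 'r') as f:
    split_data = json.load(f)
split_data['val'] = split_data.pop('valid')  # NAF names this split 'valid'

pairs = []
for group, img_names in split_data['val'].items():
    for img_name in img_names:
        ann = osp.join('data/naf/annotations',
                       img_name.replace('.jpg', '.json'))
        pairs.append((img_name, ann))
print(len(pairs))
```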

    def clean(self) -> None:
        for d in self.delete:
            delete_file = osp.join(self.data_root, d)
            if osp.exists(delete_file):
                if osp.isdir(delete_file):
                    shutil.rmtree(delete_file)
                else:
                    os.remove(delete_file)


@DATA_CONVERTERS.register_module()
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
import glob
import os
import os.path as osp
import shutil
@@ -148,7 +149,14 @@ class NaiveDataObtainer:
        for src, dst in mapping:
            src = osp.join(self.data_root, src)
            dst = osp.join(self.data_root, dst)
            if '*' in src:
                mkdir_or_exist(dst)
                for f in glob.glob(src):
                    if not osp.exists(osp.join(dst, osp.basename(f))):
                        shutil.move(f, dst)
            elif osp.exists(src) and not osp.exists(dst):
                shutil.move(src, dst)

    def clean(self) -> None:
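One note on the wildcard branch above: `glob.glob` only expands `**` across multiple directory levels when `recursive=True` is passed; without it, `**` matches like a single `*`, which appears to be enough for the one level of group directories in the NAF mapping. A small sketch of both behaviours:

```python
import glob

base = 'data/naf/naf_anno/NAF_dataset-master/groups'  # illustrative path

# Without recursive=True, '**' behaves like '*': one directory level deep.
one_level = glob.glob(base + '/**/*.json')

# With recursive=True, '**' matches any depth (including zero levels).
any_depth = glob.glob(base + '/**/*.json', recursive=True)

print(len(one_level), len(any_depth))
```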
@@ -3,6 +3,7 @@ from .coco_parser import COCOTextDetAnnParser
from .funsd_parser import FUNSDTextDetAnnParser
from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                               ICDARTxtTextRecogAnnParser)
from .naf_parser import NAFAnnParser
from .svt_parser import SVTTextDetAnnParser
from .totaltext_parser import TotaltextTextDetAnnParser
from .wildreceipt_parser import WildreceiptKIEAnnParser

@@ -10,5 +11,6 @@ from .wildreceipt_parser import WildreceiptKIEAnnParser
__all__ = [
    'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
    'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
    'NAFAnnParser'
]
@@ -0,0 +1,110 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
from typing import Dict, Iterator, List, Tuple

import numpy as np

from ..data_preparer import DATA_PARSERS
from .base import BaseParser


@DATA_PARSERS.register_module()
class NAFAnnParser(BaseParser):
    """NAF dataset parser.

    The original annotations of this dataset are stored in json files,
    which have the following keys that will be used here:
    - 'textBBs': List of text bounding box objects
        - 'poly_points': list of [x, y] pairs, the box corners going
          top-left, top-right, bottom-right, bottom-left
        - 'id': id of the textBB, used to match with the text
    - 'transcriptions': Dict of transcription objects, use the 'id' key
      to match with the textBB.

    Some special characters are used in the transcriptions:
        "«text»" indicates that "text" had a strikethrough
        "¿" indicates the transcriber could not read a character
        "§" indicates the whole line or word was illegible
        "" (empty string) means the field was blank

    Args:
        data_root (str): Path to the dataset root.
        ignore (list(str)): The text of the ignored instances. Defaults to
            ['#'].
        det (bool): Whether to parse the detection annotation. Defaults to
            True. If False, the parser will handle the special cases of the
            NAF dataset where the transcription is not available.
        nproc (int): Number of processes to load the data. Defaults to 1.
    """

    def __init__(self,
                 data_root: str,
                 ignore: List[str] = ['#'],
                 det: bool = True,
                 nproc: int = 1) -> None:
        self.ignore = ignore
        self.det = det
        super().__init__(data_root=data_root, nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Dict:
        """Convert a single annotation file."""
        img_file, json_file = file
        instances = list()
        for poly, text in self.loader(json_file):
            instances.append(
                dict(poly=poly, text=text, ignore=text in self.ignore))

        return img_file, instances

    def loader(self, file_path: str) -> Iterator[Tuple[List[int], str]]:
        """Load the annotations of one json file of the NAF dataset.

        Args:
            file_path (str): Path to the json file.

        Yields:
            Tuple[List[int], str]: The polygon and the transcription of
                each instance.
        """
        with open(file_path, 'r') as f:
            data = json.load(f)

        # 'textBBs' contains the printed text of the table while 'fieldBBs'
        # contains the text filled in by humans.
        for box_type in ['textBBs', 'fieldBBs']:
            if not self.det:
                # 'textBBs' is only used for the detection task.
                if box_type == 'textBBs':
                    continue
            for anno in data[box_type]:
                if self.det:
                    # Skip blank fields
                    if box_type == 'fieldBBs':
                        if anno['type'] == 'blank':
                            continue
                    poly = np.array(anno['poly_points']).reshape(
                        1, 8)[0].tolist()
                    # Since the detection task only needs polys, we can skip
                    # the transcription part, which can be empty.
                    text = None
                else:
                    # For tasks that need transcriptions, the NAF dataset has
                    # several special cases:
                    # 1. The transcriptions for the whole image are not
                    #    available.
                    # 2. The transcription of a certain instance is not
                    #    available.
                    # 3. If the length of the transcription is 0, it should
                    #    be ignored.
                    if 'transcriptions' not in data.keys():
                        break
                    if anno['id'] not in data['transcriptions'].keys():
                        continue
                    text = data['transcriptions'][anno['id']]
                    text = text.strip(
                        '\u202a')  # Remove the unicode control character
                    text = text.replace('»', '').replace(
                        '«', '')  # Remove the strikethrough marks
                    if len(text) == 0:
                        continue
                    poly = np.array(anno['poly_points']).reshape(
                        1, 8)[0].tolist()
                yield poly, text
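A standalone sketch of the transcription cleanup that `loader` applies, handy for spot-checking raw NAF files (the strikethrough string below is illustrative):

```python
def clean_transcription(text: str) -> str:
    # Mirror the parser: strip the U+202A control character from the ends
    # and drop the strikethrough marks.
    text = text.strip('\u202a')
    return text.replace('«', '').replace('»', '')

# Hypothetical strikethrough example:
assert clean_transcription('«U.S. Navy» 53rd.') == 'U.S. Navy 53rd.'
```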