Mirror of https://github.com/open-mmlab/mmocr.git
* add naf converter
* fix test
* update
* use fuzzy search instead
* update
* update
commit b79382cd6b (parent e3fd570687)
.gitignore (vendored): 1 line added
@@ -142,3 +142,4 @@ mmocr/.mim
 workdirs/
 .history/
 .dev/
+data/
dataset_zoo/naf/metafile.yml (new file, 31 lines)
@@ -0,0 +1,31 @@
Name: 'NAF'
Paper:
  Title: Deep Visual Template-Free Form Parsing
  URL: https://ieeexplore.ieee.org/abstract/document/8977962
  Venue: ICDAR
  Year: '2019'
  BibTeX: '@inproceedings{davis2019deep,
    title={Deep visual template-free form parsing},
    author={Davis, Brian and Morse, Bryan and Cohen, Scott and Price, Brian and Tensmeyer, Chris},
    booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
    pages={134--141},
    year={2019},
    organization={IEEE}}'
Data:
  Website: https://github.com/herobd/NAF_dataset
  Language:
    - English
  Scene:
    - Document
    - Handwritten
  Granularity:
    - Word
    - Line
  Tasks:
    - textrecog
    - textdet
    - textspotting
  License:
    Type: CDLA
    Link: https://github.com/herobd/NAF_dataset/blob/master/LICENSE
  Format: .json
dataset_zoo/naf/sample_anno.md (new file, 6 lines)
@@ -0,0 +1,6 @@
**Text Detection/Recognition/Spotting**

```json
{"fieldBBs": [{"poly_points": [[435, 1406], [466, 1406], [466, 1439], [435, 1439]], "type": "fieldCheckBox", "id": "f0", "isBlank": 1}, {"poly_points": [[435, 1444], [469, 1444], [469, 1478], [435, 1478]], "type": "fieldCheckBox", "id": "f1", "isBlank": 1}],
"textBBs": [{"poly_points": [[1183, 1337], [2028, 1345], [2032, 1395], [1186, 1398]], "type": "text", "id": "t0"}, {"poly_points": [[492, 1336], [809, 1338], [809, 1379], [492, 1378]], "type": "text", "id": "t1"}, {"poly_points": [[512, 1375], [798, 1376], [798, 1405], [512, 1404]], "type": "textInst", "id": "t2"}], "imageFilename": "007182398_00026.jpg", "transcriptions": {"f0": "\u00bf\u00bf\u00bf \u00bf\u00bf\u00bf 18/1/49 \u00bf\u00bf\u00bf\u00bf\u00bf", "f1": "U.S. Navy 53rd. Naval Const. Batt.", "t0": "APPLICATION FOR HEADSTONE OR MARKER", "t1": "ORIGINAL"}}
```
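For orientation only, not part of the commit: a minimal sketch of how one `poly_points` entry from the sample above is flattened into the 8-value `poly` produced by the NAFAnnParser introduced later in this commit (numpy is the only assumption).

```python
import numpy as np

# First textBB ('t0') from the sample annotation above.
poly_points = [[1183, 1337], [2028, 1345], [2032, 1395], [1186, 1398]]

# Same flattening as in naf_parser.py: four corners -> [x1, y1, ..., x4, y4].
poly = np.array(poly_points).reshape(1, 8)[0].tolist()
print(poly)  # [1183, 1337, 2028, 1345, 2032, 1395, 1186, 1398]
```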
dataset_zoo/naf/textdet.py (new file, 49 lines)
@@ -0,0 +1,49 @@
data_root = 'data/naf'
cache_path = 'data/cache'

data_obtainer = dict(
    type='NaiveDataObtainer',
    cache_path=cache_path,
    data_root=data_root,
    files=[
        dict(
            url='https://github.com/herobd/NAF_dataset/releases/'
            'download/v1.0/labeled_images.tar.gz',
            save_name='naf_image.tar.gz',
            md5='6521cdc25c313a1f2928a16a77ad8f29',
            split=['train', 'test', 'val'],
            content=['image'],
            mapping=[['naf_image/labeled_images', 'temp_images/']]),
        dict(
            url='https://github.com/herobd/NAF_dataset/archive/'
            'refs/heads/master.zip',
            save_name='naf_anno.zip',
            md5='abf5af6266cc527d772231751bc884b3',
            split=['train', 'test', 'val'],
            content=['annotation'],
            mapping=[
                [
                    'naf_anno/NAF_dataset-master/groups/**/*.json',
                    'annotations/'
                ],
                [
                    'naf_anno/NAF_dataset-master/train_valid_test_split.json',
                    'data_split.json'
                ]
            ]),
    ])

data_converter = dict(
    type='TextDetDataConverter',
    splits=['train', 'test', 'val'],
    data_root=data_root,
    gatherer=dict(type='naf_gather'),
    parser=dict(type='NAFAnnParser', data_root=data_root, det=True),
    delete=['temp_images', 'data_split.json', 'annotations', 'naf_anno'],
    dumper=dict(type='JsonDumper'),
    nproc=1)

config_generator = dict(
    type='TextDetConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')])
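As a reading aid rather than part of the commit, here is a rough sketch of the intermediate layout that the two mapping entries above create under data_root before clean-up; the directory names come from the config, while what ends up inside them is an assumption.

```python
# Hypothetical intermediate layout under data/naf, inferred from the mapping
# and delete entries in dataset_zoo/naf/textdet.py above (contents assumed):
intermediate_layout = [
    'data/naf/temp_images/',     # images moved from naf_image/labeled_images
    'data/naf/annotations/',     # per-image *.json gathered from groups/**/
    'data/naf/data_split.json',  # renamed train_valid_test_split.json
]
# All three paths appear in `delete`, so the converter removes them once the
# final annotation files (e.g. textdet_val.json, as referenced by val_anns)
# have been dumped.
```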
dataset_zoo/naf/textrecog.py (new file, 19 lines)
@@ -0,0 +1,19 @@
# The transcriptions of the NAF dataset come from Tesseract OCR and are not
# accurate. The test/val annotations were hand-corrected, but the train set
# was only partially corrected, so the quality is low. It is better not to
# use them for recognition and text spotting.

_base_ = ['textdet.py']
data_root = 'data/naf'

data_converter = dict(
    type='TextRecogCropConverter',
    parser=dict(
        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
        det=False),
    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])

config_generator = dict(
    type='TextRecogConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')])
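Not part of the commit, just a sanity-check sketch: since `_base_ = ['textdet.py']`, this file is merged on top of the detection config, so `data_obtainer` is inherited unchanged while the keys set here (for instance the converter and generator types) override the base values. Assuming mmengine is available, the merged result can be inspected like so:

```python
from mmengine import Config

# _base_ paths are resolved relative to the config file itself.
cfg = Config.fromfile('dataset_zoo/naf/textrecog.py')
print(cfg.data_obtainer['type'])   # 'NaiveDataObtainer' (inherited from textdet.py)
print(cfg.data_converter['type'])  # 'TextRecogCropConverter' (overridden here)
```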
dataset_zoo/naf/textspotting.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# The transcriptions of the NAF dataset come from Tesseract OCR and are not
# accurate. The test/val annotations were hand-corrected, but the train set
# was only partially corrected, so the quality is low. It is better not to
# use them for recognition and text spotting.

_base_ = ['textdet.py']
data_root = 'data/naf'
data_converter = dict(
    type='TextSpottingDataConverter',
    parser=dict(
        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
        det=False),
    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])

config_generator = dict(
    type='TextSpottingConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textspotting_val.json', dataset_postfix='')])
Changes to the data converter (BaseDataConverter):
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
+import os
 import os.path as osp
 import re
 import shutil
@@ -61,6 +62,8 @@ class BaseDataConverter:
             self.gatherer = self.pair_gather
         elif gather_type == 'mono_gather':
             self.gatherer = self.mono_gather
+        elif gather_type == 'naf_gather':
+            self.gatherer = self.naf_gather
         else:
             raise NotImplementedError
@@ -181,11 +184,51 @@ class BaseDataConverter:

         return files

+    def naf_gather(self, img_path: str, ann_path: str,
+                   **kwargs) -> List[Tuple]:
+        """Gather the dataset files of the NAF dataset, which provides a
+        split file listing the image names that belong to each split.
+        For example,
+
+            img_001.jpg                           train: img_001.jpg
+            img_002.jpg ---> data_split.json ---> test:  img_002.jpg
+            img_003.jpg                           val:   img_003.jpg
+
+        Args:
+            img_path (str): Path to the images.
+            ann_path (str): Path to the annotations.
+
+        Returns:
+            List[Tuple]: A list of tuples (img_path, ann_path).
+        """
+        split_file = osp.join(self.data_root, 'data_split.json')
+        with open(split_file, 'r') as f:
+            split_data = json.load(f)
+        files = []
+        # Rename the key
+        split_data['val'] = split_data.pop('valid')
+        if not osp.exists(img_path):
+            os.makedirs(img_path)
+        for groups in split_data[self.current_split]:
+            for img_name in split_data[self.current_split][groups]:
+                src_img = osp.join(self.data_root, 'temp_images', img_name)
+                dst_img = osp.join(img_path, img_name)
+                if not osp.exists(src_img):
+                    Warning(f'{src_img} does not exist!')
+                    continue
+                # Move the image to the new path
+                shutil.move(src_img, dst_img)
+                ann = osp.join(ann_path, img_name.replace('.jpg', '.json'))
+                files.append((dst_img, ann))
+        return files
+
     def clean(self) -> None:
         for d in self.delete:
             delete_file = osp.join(self.data_root, d)
             if osp.exists(delete_file):
-                shutil.rmtree(delete_file)
+                if osp.isdir(delete_file):
+                    shutil.rmtree(delete_file)
+                else:
+                    os.remove(delete_file)


 @DATA_CONVERTERS.register_module()
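For clarity, not part of the commit: the shape of data_split.json that naf_gather assumes, inferred from the nested loops above. Each top-level split maps group names to lists of image files, and the upstream key 'valid' is renamed to 'val' before lookup; the group names below are placeholders.

```python
# Hypothetical content of data_split.json as naf_gather reads it.
data_split = {
    'train': {'group_0': ['img_001.jpg']},
    'valid': {'group_1': ['img_002.jpg']},  # popped and re-inserted as 'val'
    'test': {'group_2': ['img_003.jpg']},
}
```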
Changes to the data obtainer (NaiveDataObtainer):
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import glob
 import os
 import os.path as osp
 import shutil
@@ -148,7 +149,14 @@ class NaiveDataObtainer:
         for src, dst in mapping:
             src = osp.join(self.data_root, src)
             dst = osp.join(self.data_root, dst)
-            if osp.exists(src) and not osp.exists(dst):
+
+            if '*' in src:
+                mkdir_or_exist(dst)
+                for f in glob.glob(src):
+                    if not osp.exists(osp.join(dst, osp.basename(f))):
+                        shutil.move(f, dst)
+
+            elif osp.exists(src) and not osp.exists(dst):
                 shutil.move(src, dst)

     def clean(self) -> None:
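A standalone sketch of the new wildcard branch, not part of the commit: mapping entries whose source contains '*' are expanded with glob, and each match is moved into the destination directory, skipping files that are already there. `mkdir_or_exist` in the diff comes from mmengine; plain `os.makedirs` stands in for it here.

```python
import glob
import os
import os.path as osp
import shutil


def move_fuzzy_mapping(src_pattern: str, dst_dir: str) -> None:
    """Move every file matching ``src_pattern`` into ``dst_dir``."""
    os.makedirs(dst_dir, exist_ok=True)
    for f in glob.glob(src_pattern):
        # Skip files that already exist at the destination.
        if not osp.exists(osp.join(dst_dir, osp.basename(f))):
            shutil.move(f, dst_dir)


# e.g. the annotation mapping from dataset_zoo/naf/textdet.py:
# move_fuzzy_mapping('naf_anno/NAF_dataset-master/groups/**/*.json',
#                    'annotations/')
```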
Changes to the parsers package __init__ (exporting NAFAnnParser):
@@ -3,6 +3,7 @@ from .coco_parser import COCOTextDetAnnParser
 from .funsd_parser import FUNSDTextDetAnnParser
 from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                                ICDARTxtTextRecogAnnParser)
+from .naf_parser import NAFAnnParser
 from .svt_parser import SVTTextDetAnnParser
 from .totaltext_parser import TotaltextTextDetAnnParser
 from .wildreceipt_parser import WildreceiptKIEAnnParser
@@ -10,5 +11,6 @@ from .wildreceipt_parser import WildreceiptKIEAnnParser
 __all__ = [
     'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
     'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
-    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser'
+    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
+    'NAFAnnParser'
 ]
mmocr/datasets/preparers/parsers/naf_parser.py (new file, 110 lines)
@@ -0,0 +1,110 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
from typing import Dict, List, Tuple

import numpy as np

from ..data_preparer import DATA_PARSERS
from .base import BaseParser


@DATA_PARSERS.register_module()
class NAFAnnParser(BaseParser):
    """NAF dataset parser.

    The original annotation format of this dataset is stored in json files,
    which have the following keys that will be used here:

    - 'textBBs': List of text bounding box objects
        - 'poly_points': list of [x, y] pairs, the box corners going
          top-left, top-right, bottom-right, bottom-left
        - 'id': id of the textBB, used to match with the text
    - 'transcriptions': Dict of transcription objects; use the 'id' key
      to match with the textBB.

    Some special characters are used in the transcriptions:

    - "«text»" indicates that "text" had a strikethrough
    - "¿" indicates the transcriber could not read a character
    - "§" indicates the whole line or word was illegible
    - "" (empty string) means the field was blank

    Args:
        data_root (str): Path to the dataset root.
        ignore (list(str)): The text of the ignored instances.
            Defaults to ['#'].
        det (bool): Whether to parse the detection annotation. Defaults to
            True. If False, the parser will handle the special cases in the
            NAF dataset where the transcription is not available.
        nproc (int): Number of processes to load the data. Defaults to 1.
    """

    def __init__(self,
                 data_root: str,
                 ignore: List[str] = ['#'],
                 det: bool = True,
                 nproc: int = 1) -> None:
        self.ignore = ignore
        self.det = det
        super().__init__(data_root=data_root, nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Dict:
        """Convert a single annotation."""
        img_file, json_file = file
        instances = list()
        for poly, text in self.loader(json_file):
            instances.append(
                dict(poly=poly, text=text, ignore=text in self.ignore))

        return img_file, instances

    def loader(self, file_path: str) -> str:
        """Load the annotation of the NAF dataset.

        Args:
            file_path (str): Path to the json file.

        Returns:
            str: Complete annotation of the json file.
        """
        with open(file_path, 'r') as f:
            data = json.load(f)

        # 'textBBs' contains the printed texts of the table while 'fieldBBs'
        # contains the text filled in by humans.
        for box_type in ['textBBs', 'fieldBBs']:
            if not self.det:
                # 'textBBs' is only used for the detection task.
                if box_type == 'textBBs':
                    continue
            for anno in data[box_type]:
                # Skip blanks
                if self.det:
                    if box_type == 'fieldBBs':
                        if anno['type'] == 'blank':
                            continue
                    poly = np.array(anno['poly_points']).reshape(
                        1, 8)[0].tolist()
                    # Since the detection task only needs the poly, we can
                    # skip the transcription part, which can be empty.
                    text = None
                else:
                    # For tasks that need transcriptions, the NAF dataset has
                    # several special cases:
                    # 1. The transcriptions for the whole image are not
                    #    available.
                    # 2. The transcription for a certain text is not
                    #    available.
                    # 3. If the length of the transcription is 0, it should
                    #    be ignored.
                    if 'transcriptions' not in data.keys():
                        break
                    if anno['id'] not in data['transcriptions'].keys():
                        continue
                    text = data['transcriptions'][anno['id']]
                    text = text.strip(
                        '\u202a')  # Remove unicode control character
                    text = text.replace('»', '').replace(
                        '«', '')  # Remove strikethrough flag
                    if len(text) == 0:
                        continue
                    poly = np.array(anno['poly_points']).reshape(
                        1, 8)[0].tolist()
                yield poly, text
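A small usage sketch, not part of the commit: running the parser's loader in detection mode on the sample annotation from dataset_zoo/naf/sample_anno.md, assuming that file has been saved as sample.json and that BaseParser accepts the data_root and nproc arguments passed in __init__ above.

```python
parser = NAFAnnParser(data_root='data/naf', det=True)
for poly, text in parser.loader('sample.json'):
    print(poly, text)
# The first yielded instance comes from textBB 't0'; in det mode text is None:
# [1183, 1337, 2028, 1345, 2032, 1395, 1186, 1398] None
```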