From b79382cd6bc63dde72cb4ef0ff4f2b6e6f9c488c Mon Sep 17 00:00:00 2001
From: Qing Jiang
Date: Thu, 29 Dec 2022 15:19:49 +0800
Subject: [PATCH] [Feature] CodeCamp #115 Add NAF to dataset preparer (#1609)

* add naf converter

* fix test

* update

* use fuzzy search instead

* update

* update
---
 .gitignore                                   |   1 +
 dataset_zoo/naf/metafile.yml                 |  31 +++++
 dataset_zoo/naf/sample_anno.md               |   6 +
 dataset_zoo/naf/textdet.py                   |  49 ++++++++
 dataset_zoo/naf/textrecog.py                 |  19 +++
 dataset_zoo/naf/textspotting.py              |  18 +++
 mmocr/datasets/preparers/data_converter.py   |  46 +++++++-
 mmocr/datasets/preparers/data_obtainer.py    |  10 +-
 mmocr/datasets/preparers/parsers/__init__.py |   4 +-
 .../datasets/preparers/parsers/naf_parser.py | 110 ++++++++++++++++++
 10 files changed, 291 insertions(+), 3 deletions(-)
 create mode 100644 dataset_zoo/naf/metafile.yml
 create mode 100644 dataset_zoo/naf/sample_anno.md
 create mode 100644 dataset_zoo/naf/textdet.py
 create mode 100644 dataset_zoo/naf/textrecog.py
 create mode 100644 dataset_zoo/naf/textspotting.py
 create mode 100644 mmocr/datasets/preparers/parsers/naf_parser.py

diff --git a/.gitignore b/.gitignore
index 64efec76..63510208 100644
--- a/.gitignore
+++ b/.gitignore
@@ -142,3 +142,4 @@ mmocr/.mim
 workdirs/
 .history/
 .dev/
+data/
diff --git a/dataset_zoo/naf/metafile.yml b/dataset_zoo/naf/metafile.yml
new file mode 100644
index 00000000..c816c772
--- /dev/null
+++ b/dataset_zoo/naf/metafile.yml
@@ -0,0 +1,31 @@
+Name: 'NAF'
+Paper:
+  Title: Deep Visual Template-Free Form Parsing
+  URL: https://ieeexplore.ieee.org/abstract/document/8977962
+  Venue: ICDAR
+  Year: '2019'
+  BibTeX: '@inproceedings{davis2019deep,
+    title={Deep visual template-free form parsing},
+    author={Davis, Brian and Morse, Bryan and Cohen, Scott and Price, Brian and Tensmeyer, Chris},
+    booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
+    pages={134--141},
+    year={2019},
+    organization={IEEE}}'
+Data:
+  Website: https://github.com/herobd/NAF_dataset
+  Language:
+    - English
+  Scene:
+    - Document
+    - Handwritten
+  Granularity:
+    - Word
+    - Line
+  Tasks:
+    - textrecog
+    - textdet
+    - textspotting
+  License:
+    Type: CDLA
+    Link: https://github.com/herobd/NAF_dataset/blob/master/LICENSE
+  Format: .json
diff --git a/dataset_zoo/naf/sample_anno.md b/dataset_zoo/naf/sample_anno.md
new file mode 100644
index 00000000..9043e763
--- /dev/null
+++ b/dataset_zoo/naf/sample_anno.md
@@ -0,0 +1,6 @@
+**Text Detection/Recognition/Spotting**
+
+```json
+{"fieldBBs": [{"poly_points": [[435, 1406], [466, 1406], [466, 1439], [435, 1439]], "type": "fieldCheckBox", "id": "f0", "isBlank": 1}, {"poly_points": [[435, 1444], [469, 1444], [469, 1478], [435, 1478]], "type": "fieldCheckBox", "id": "f1", "isBlank": 1}],
+"textBBs": [{"poly_points": [[1183, 1337], [2028, 1345], [2032, 1395], [1186, 1398]], "type": "text", "id": "t0"}, {"poly_points": [[492, 1336], [809, 1338], [809, 1379], [492, 1378]], "type": "text", "id": "t1"}, {"poly_points": [[512, 1375], [798, 1376], [798, 1405], [512, 1404]], "type": "textInst", "id": "t2"}], "imageFilename": "007182398_00026.jpg", "transcriptions": {"f0": "\u00bf\u00bf\u00bf \u00bf\u00bf\u00bf 18/1/49 \u00bf\u00bf\u00bf\u00bf\u00bf", "f1": "U.S. Navy 53rd. Naval Const. Batt.", "t0": "APPLICATION FOR HEADSTONE OR MARKER", "t1": "ORIGINAL"}}
+```
diff --git a/dataset_zoo/naf/textdet.py b/dataset_zoo/naf/textdet.py
new file mode 100644
index 00000000..25261b71
--- /dev/null
+++ b/dataset_zoo/naf/textdet.py
@@ -0,0 +1,49 @@
+data_root = 'data/naf'
+cache_path = 'data/cache'
+
+data_obtainer = dict(
+    type='NaiveDataObtainer',
+    cache_path=cache_path,
+    data_root=data_root,
+    files=[
+        dict(
+            url='https://github.com/herobd/NAF_dataset/releases/'
+            'download/v1.0/labeled_images.tar.gz',
+            save_name='naf_image.tar.gz',
+            md5='6521cdc25c313a1f2928a16a77ad8f29',
+            split=['train', 'test', 'val'],
+            content=['image'],
+            mapping=[['naf_image/labeled_images', 'temp_images/']]),
+        dict(
+            url='https://github.com/herobd/NAF_dataset/archive/'
+            'refs/heads/master.zip',
+            save_name='naf_anno.zip',
+            md5='abf5af6266cc527d772231751bc884b3',
+            split=['train', 'test', 'val'],
+            content=['annotation'],
+            mapping=[
+                [
+                    'naf_anno/NAF_dataset-master/groups/**/*.json',
+                    'annotations/'
+                ],
+                [
+                    'naf_anno/NAF_dataset-master/train_valid_test_split.json',
+                    'data_split.json'
+                ]
+            ]),
+    ])
+
+data_converter = dict(
+    type='TextDetDataConverter',
+    splits=['train', 'test', 'val'],
+    data_root=data_root,
+    gatherer=dict(type='naf_gather'),
+    parser=dict(type='NAFAnnParser', data_root=data_root, det=True),
+    delete=['temp_images', 'data_split.json', 'annotations', 'naf_anno'],
+    dumper=dict(type='JsonDumper'),
+    nproc=1)
+
+config_generator = dict(
+    type='TextDetConfigGenerator',
+    data_root=data_root,
+    val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')])
diff --git a/dataset_zoo/naf/textrecog.py b/dataset_zoo/naf/textrecog.py
new file mode 100644
index 00000000..943bd8cd
--- /dev/null
+++ b/dataset_zoo/naf/textrecog.py
@@ -0,0 +1,19 @@
+# The transcriptions of the NAF dataset were generated with Tesseract OCR
+# and are not accurate. The test/valid sets were hand-corrected, but the
+# train set was only partially corrected, so the labels are unreliable. It
+# is better not to use them for recognition and text spotting.
+
+_base_ = ['textdet.py']
+data_root = 'data/naf'
+
+data_converter = dict(
+    type='TextRecogCropConverter',
+    parser=dict(
+        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
+        det=False),
+    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])
+
+config_generator = dict(
+    type='TextRecogConfigGenerator',
+    data_root=data_root,
+    val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')])
diff --git a/dataset_zoo/naf/textspotting.py b/dataset_zoo/naf/textspotting.py
new file mode 100644
index 00000000..97b50b4a
--- /dev/null
+++ b/dataset_zoo/naf/textspotting.py
@@ -0,0 +1,18 @@
+# The transcriptions of the NAF dataset were generated with Tesseract OCR
+# and are not accurate. The test/valid sets were hand-corrected, but the
+# train set was only partially corrected, so the labels are unreliable. It
+# is better not to use them for recognition and text spotting.
+
+_base_ = ['textdet.py']
+data_root = 'data/naf'
+data_converter = dict(
+    type='TextSpottingDataConverter',
+    parser=dict(
+        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
+        det=False),
+    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])
+
+config_generator = dict(
+    type='TextSpottingConfigGenerator',
+    data_root=data_root,
+    val_anns=[dict(ann_file='textspotting_val.json', dataset_postfix='')])
diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py
index 8640c586..a1d0b1f6 100644
--- a/mmocr/datasets/preparers/data_converter.py
+++ b/mmocr/datasets/preparers/data_converter.py
@@ -1,5 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
+import os
 import os.path as osp
 import re
 import shutil
+import warnings
@@ -61,6 +63,8 @@ class BaseDataConverter:
             self.gatherer = self.pair_gather
         elif gather_type == 'mono_gather':
             self.gatherer = self.mono_gather
+        elif gather_type == 'naf_gather':
+            self.gatherer = self.naf_gather
         else:
             raise NotImplementedError
 
@@ -181,11 +185,51 @@ class BaseDataConverter:
 
         return files
 
+    def naf_gather(self, img_path: str, ann_path: str,
+                   **kwargs) -> List[Tuple]:
+        """Gather the dataset files from the NAF dataset. Specifically for
+        the case where a split file contains the names of the different
+        splits. For example,
+
+            img_001.jpg                           train: img_001.jpg
+            img_002.jpg ---> data_split.json ---> test: img_002.jpg
+            img_003.jpg                           val: img_003.jpg
+
+        Args:
+            img_path (str): Path to the images.
+            ann_path (str): Path to the annotations.
+        Returns:
+            List[Tuple]: A list of tuples (img_path, ann_path).
+        """
+        split_file = osp.join(self.data_root, 'data_split.json')
+        with open(split_file, 'r') as f:
+            split_data = json.load(f)
+        files = []
+        # Rename the key
+        split_data['val'] = split_data.pop('valid')
+        if not osp.exists(img_path):
+            os.makedirs(img_path)
+        for groups in split_data[self.current_split]:
+            for img_name in split_data[self.current_split][groups]:
+                src_img = osp.join(self.data_root, 'temp_images', img_name)
+                dst_img = osp.join(img_path, img_name)
+                if not osp.exists(src_img):
+                    warnings.warn(f'{src_img} does not exist!')
+                    continue
+                # Move the image to the new path
+                shutil.move(src_img, dst_img)
+                ann = osp.join(ann_path, img_name.replace('.jpg', '.json'))
+                files.append((dst_img, ann))
+        return files
+
     def clean(self) -> None:
         for d in self.delete:
             delete_file = osp.join(self.data_root, d)
             if osp.exists(delete_file):
-                shutil.rmtree(delete_file)
+                if osp.isdir(delete_file):
+                    shutil.rmtree(delete_file)
+                else:
+                    os.remove(delete_file)
 
 
 @DATA_CONVERTERS.register_module()
diff --git a/mmocr/datasets/preparers/data_obtainer.py b/mmocr/datasets/preparers/data_obtainer.py
index 5d7492cd..98ffdfd1 100644
--- a/mmocr/datasets/preparers/data_obtainer.py
+++ b/mmocr/datasets/preparers/data_obtainer.py
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import glob
 import os
 import os.path as osp
 import shutil
@@ -148,7 +149,14 @@ class NaiveDataObtainer:
         for src, dst in mapping:
             src = osp.join(self.data_root, src)
             dst = osp.join(self.data_root, dst)
-            if osp.exists(src) and not osp.exists(dst):
+
+            if '*' in src:
+                mkdir_or_exist(dst)
+                for f in glob.glob(src):
+                    if not osp.exists(osp.join(dst, osp.basename(f))):
+                        shutil.move(f, dst)
+
+            elif osp.exists(src) and not osp.exists(dst):
                 shutil.move(src, dst)
 
     def clean(self) -> None:
diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py
index aa0ed8e5..cdd08de1 100644
--- a/mmocr/datasets/preparers/parsers/__init__.py
+++ b/mmocr/datasets/preparers/parsers/__init__.py
@@ -3,6 +3,7 @@ from .coco_parser import COCOTextDetAnnParser
 from .funsd_parser import FUNSDTextDetAnnParser
 from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                                ICDARTxtTextRecogAnnParser)
+from .naf_parser import NAFAnnParser
 from .svt_parser import SVTTextDetAnnParser
 from .totaltext_parser import TotaltextTextDetAnnParser
 from .wildreceipt_parser import WildreceiptKIEAnnParser
@@ -10,5 +11,6 @@ from .wildreceipt_parser import WildreceiptKIEAnnParser
 __all__ = [
     'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
     'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
-    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser'
+    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
+    'NAFAnnParser'
 ]
diff --git a/mmocr/datasets/preparers/parsers/naf_parser.py b/mmocr/datasets/preparers/parsers/naf_parser.py
new file mode 100644
index 00000000..1e9a6164
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/naf_parser.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+from typing import Generator, List, Tuple
+
+import numpy as np
+
+from ..data_preparer import DATA_PARSERS
+from .base import BaseParser
+
+
+@DATA_PARSERS.register_module()
+class NAFAnnParser(BaseParser):
+    """NAF dataset parser.
+
+    The original annotations of this dataset are stored in json files,
+    which contain the following keys that are used here:
+    - 'textBBs': List of text bounding box objects
+      - 'poly_points': list of [x, y] pairs, the box corners going
+        top-left, top-right, bottom-right, bottom-left
+      - 'id': id of the textBB, used to match with the text
+    - 'transcriptions': Dict of transcription objects, use the 'id' key
+      to match with the textBB.
+
+    Some special characters are used in the transcriptions:
+    "«text»" indicates that "text" had a strikethrough
+    "¿" indicates the transcriber could not read a character
+    "§" indicates the whole line or word was illegible
+    "" (empty string) means the field was blank
+
+    Args:
+        data_root (str): Path to the dataset root.
+        ignore (list(str)): The text of the ignored instances.
+            Default: ['#'].
+        det (bool): Whether to parse the detection annotation. Default: True.
+            If False, the parser will handle the special cases in the NAF
+            dataset where the transcription is not available.
+        nproc (int): Number of processes to load the data. Default: 1.
+ """ + + def __init__(self, + data_root: str, + ignore: List[str] = ['#'], + det: bool = True, + nproc: int = 1) -> None: + self.ignore = ignore + self.det = det + super().__init__(data_root=data_root, nproc=nproc) + + def parse_file(self, file: Tuple, split: str) -> Dict: + """Convert single annotation.""" + img_file, json_file = file + instances = list() + for poly, text in self.loader(json_file): + instances.append( + dict(poly=poly, text=text, ignore=text in self.ignore)) + + return img_file, instances + + def loader(self, file_path: str) -> str: + """Load the annotation of the NAF dataset. + + Args: + file_path (str): Path to the json file + + Retyrb: + str: Complete annotation of the json file + """ + with open(file_path, 'r') as f: + data = json.load(f) + + # 'textBBs' contains the printed texts of the table while 'fieldBBs' + # contains the text filled by human. + for box_type in ['textBBs', 'fieldBBs']: + if not self.det: + # 'textBBs' is only used for detection task. + if box_type == 'textBBs': + continue + for anno in data[box_type]: + # Skip blanks + if self.det: + if box_type == 'fieldBBs': + if anno['type'] == 'blank': + continue + poly = np.array(anno['poly_points']).reshape( + 1, 8)[0].tolist() + # Since detection task only need poly, we can skip the + # transcription part that can be empty. + text = None + else: + # For tasks that need transcription, NAF dataset has + # serval special cases: + # 1. The transcription for the whole image is not + # available. + # 2. The transcription for the certain text is not + # available. + # 3. If the length of the transcription is 0, it should + # be ignored. + if 'transcriptions' not in data.keys(): + break + if anno['id'] not in data['transcriptions'].keys(): + continue + text = data['transcriptions'][anno['id']] + text = text.strip( + '\u202a') # Remove unicode control character + text = text.replace('»', '').replace( + '«', '') # Remove strikethrough flag + if len(text) == 0: + continue + poly = np.array(anno['poly_points']).reshape( + 1, 8)[0].tolist() + yield poly, text