Mirror of https://github.com/open-mmlab/mmocr.git
* add naf converter
* fix test
* update
* use fuzzy search instead
* update
* update
commit b79382cd6b (parent e3fd570687)
.gitignore (vendored): 1 line added
@@ -142,3 +142,4 @@ mmocr/.mim
 workdirs/
 .history/
 .dev/
+data/
dataset_zoo/naf/metafile.yml (new file, 31 lines)
@@ -0,0 +1,31 @@
Name: 'NAF'
Paper:
  Title: Deep Visual Template-Free Form Parsing
  URL: https://ieeexplore.ieee.org/abstract/document/8977962
  Venue: ICDAR
  Year: '2019'
  BibTeX: '@inproceedings{davis2019deep,
    title={Deep visual template-free form parsing},
    author={Davis, Brian and Morse, Bryan and Cohen, Scott and Price, Brian and Tensmeyer, Chris},
    booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
    pages={134--141},
    year={2019},
    organization={IEEE}}'
Data:
  Website: https://github.com/herobd/NAF_dataset
  Language:
    - English
  Scene:
    - Document
    - Handwritten
  Granularity:
    - Word
    - Line
  Tasks:
    - textrecog
    - textdet
    - textspotting
  License:
    Type: CDLA
    Link: https://github.com/herobd/NAF_dataset/blob/master/LICENSE
  Format: .json
dataset_zoo/naf/sample_anno.md (new file, 6 lines)
@@ -0,0 +1,6 @@
**Text Detection/Recognition/Spotting**

```json
{"fieldBBs": [{"poly_points": [[435, 1406], [466, 1406], [466, 1439], [435, 1439]], "type": "fieldCheckBox", "id": "f0", "isBlank": 1}, {"poly_points": [[435, 1444], [469, 1444], [469, 1478], [435, 1478]], "type": "fieldCheckBox", "id": "f1", "isBlank": 1}],
"textBBs": [{"poly_points": [[1183, 1337], [2028, 1345], [2032, 1395], [1186, 1398]], "type": "text", "id": "t0"}, {"poly_points": [[492, 1336], [809, 1338], [809, 1379], [492, 1378]], "type": "text", "id": "t1"}, {"poly_points": [[512, 1375], [798, 1376], [798, 1405], [512, 1404]], "type": "textInst", "id": "t2"}], "imageFilename": "007182398_00026.jpg", "transcriptions": {"f0": "\u00bf\u00bf\u00bf \u00bf\u00bf\u00bf 18/1/49 \u00bf\u00bf\u00bf\u00bf\u00bf", "f1": "U.S. Navy 53rd. Naval Const. Batt.", "t0": "APPLICATION FOR HEADSTONE OR MARKER", "t1": "ORIGINAL"}}
```
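For orientation only, not part of the commit: a minimal sketch of how one `poly_points` entry from the sample above is flattened into the 8-value `poly` produced by the NAFAnnParser introduced later in this commit (numpy is the only assumption).

```python
import numpy as np

# First textBB ('t0') from the sample annotation above.
poly_points = [[1183, 1337], [2028, 1345], [2032, 1395], [1186, 1398]]

# Same flattening as in naf_parser.py: four corners -> [x1, y1, ..., x4, y4].
poly = np.array(poly_points).reshape(1, 8)[0].tolist()
print(poly)  # [1183, 1337, 2028, 1345, 2032, 1395, 1186, 1398]
```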
dataset_zoo/naf/textdet.py (new file, 49 lines)
@@ -0,0 +1,49 @@
data_root = 'data/naf'
cache_path = 'data/cache'

data_obtainer = dict(
    type='NaiveDataObtainer',
    cache_path=cache_path,
    data_root=data_root,
    files=[
        dict(
            url='https://github.com/herobd/NAF_dataset/releases/'
            'download/v1.0/labeled_images.tar.gz',
            save_name='naf_image.tar.gz',
            md5='6521cdc25c313a1f2928a16a77ad8f29',
            split=['train', 'test', 'val'],
            content=['image'],
            mapping=[['naf_image/labeled_images', 'temp_images/']]),
        dict(
            url='https://github.com/herobd/NAF_dataset/archive/'
            'refs/heads/master.zip',
            save_name='naf_anno.zip',
            md5='abf5af6266cc527d772231751bc884b3',
            split=['train', 'test', 'val'],
            content=['annotation'],
            mapping=[
                [
                    'naf_anno/NAF_dataset-master/groups/**/*.json',
                    'annotations/'
                ],
                [
                    'naf_anno/NAF_dataset-master/train_valid_test_split.json',
                    'data_split.json'
                ]
            ]),
    ])

data_converter = dict(
    type='TextDetDataConverter',
    splits=['train', 'test', 'val'],
    data_root=data_root,
    gatherer=dict(type='naf_gather'),
    parser=dict(type='NAFAnnParser', data_root=data_root, det=True),
    delete=['temp_images', 'data_split.json', 'annotations', 'naf_anno'],
    dumper=dict(type='JsonDumper'),
    nproc=1)

config_generator = dict(
    type='TextDetConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')])
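As a reading aid rather than part of the commit, here is a rough sketch of the intermediate layout that the two mapping entries above create under data_root before clean-up; the directory names come from the config, while what ends up inside them is an assumption.

```python
# Hypothetical intermediate layout under data/naf, inferred from the mapping
# and delete entries in dataset_zoo/naf/textdet.py above (contents assumed):
intermediate_layout = [
    'data/naf/temp_images/',     # images moved from naf_image/labeled_images
    'data/naf/annotations/',     # per-image *.json gathered from groups/**/
    'data/naf/data_split.json',  # renamed train_valid_test_split.json
]
# All three paths appear in `delete`, so the converter removes them once the
# final annotation files (e.g. textdet_val.json, as referenced by val_anns)
# have been dumped.
```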
dataset_zoo/naf/textrecog.py (new file, 19 lines)
@@ -0,0 +1,19 @@
# The transcriptions of the NAF dataset come from Tesseract OCR and are not
# accurate. The test/val annotations were hand-corrected, but the train set
# was only partially corrected, so the quality is low. It is better not to
# use them for recognition and text spotting.

_base_ = ['textdet.py']
data_root = 'data/naf'

data_converter = dict(
    type='TextRecogCropConverter',
    parser=dict(
        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
        det=False),
    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])

config_generator = dict(
    type='TextRecogConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')])
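Not part of the commit, just a sanity-check sketch: since `_base_ = ['textdet.py']`, this file is merged on top of the detection config, so `data_obtainer` is inherited unchanged while the keys set here (for instance the converter and generator types) override the base values. Assuming mmengine is available, the merged result can be inspected like so:

```python
from mmengine import Config

# _base_ paths are resolved relative to the config file itself.
cfg = Config.fromfile('dataset_zoo/naf/textrecog.py')
print(cfg.data_obtainer['type'])   # 'NaiveDataObtainer' (inherited from textdet.py)
print(cfg.data_converter['type'])  # 'TextRecogCropConverter' (overridden here)
```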
dataset_zoo/naf/textspotting.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# The transcriptions of the NAF dataset come from Tesseract OCR and are not
# accurate. The test/val annotations were hand-corrected, but the train set
# was only partially corrected, so the quality is low. It is better not to
# use them for recognition and text spotting.

_base_ = ['textdet.py']
data_root = 'data/naf'
data_converter = dict(
    type='TextSpottingDataConverter',
    parser=dict(
        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
        det=False),
    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])

config_generator = dict(
    type='TextSpottingConfigGenerator',
    data_root=data_root,
    val_anns=[dict(ann_file='textspotting_val.json', dataset_postfix='')])
Changes to the data converter (BaseDataConverter):
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
+import os
 import os.path as osp
 import re
 import shutil
@@ -61,6 +62,8 @@ class BaseDataConverter:
             self.gatherer = self.pair_gather
         elif gather_type == 'mono_gather':
             self.gatherer = self.mono_gather
+        elif gather_type == 'naf_gather':
+            self.gatherer = self.naf_gather
         else:
             raise NotImplementedError
@@ -181,11 +184,51 @@ class BaseDataConverter:

         return files

+    def naf_gather(self, img_path: str, ann_path: str,
+                   **kwargs) -> List[Tuple]:
+        """Gather the dataset files of the NAF dataset, which provides a
+        split file listing the image names that belong to each split.
+        For example,
+
+            img_001.jpg                           train: img_001.jpg
+            img_002.jpg ---> data_split.json ---> test:  img_002.jpg
+            img_003.jpg                           val:   img_003.jpg
+
+        Args:
+            img_path (str): Path to the images.
+            ann_path (str): Path to the annotations.
+
+        Returns:
+            List[Tuple]: A list of tuples (img_path, ann_path).
+        """
+        split_file = osp.join(self.data_root, 'data_split.json')
+        with open(split_file, 'r') as f:
+            split_data = json.load(f)
+        files = []
+        # Rename the key
+        split_data['val'] = split_data.pop('valid')
+        if not osp.exists(img_path):
+            os.makedirs(img_path)
+        for groups in split_data[self.current_split]:
+            for img_name in split_data[self.current_split][groups]:
+                src_img = osp.join(self.data_root, 'temp_images', img_name)
+                dst_img = osp.join(img_path, img_name)
+                if not osp.exists(src_img):
+                    Warning(f'{src_img} does not exist!')
+                    continue
+                # Move the image to the new path
+                shutil.move(src_img, dst_img)
+                ann = osp.join(ann_path, img_name.replace('.jpg', '.json'))
+                files.append((dst_img, ann))
+        return files
+
     def clean(self) -> None:
         for d in self.delete:
             delete_file = osp.join(self.data_root, d)
             if osp.exists(delete_file):
-                shutil.rmtree(delete_file)
+                if osp.isdir(delete_file):
+                    shutil.rmtree(delete_file)
+                else:
+                    os.remove(delete_file)


 @DATA_CONVERTERS.register_module()
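For clarity, not part of the commit: the shape of data_split.json that naf_gather assumes, inferred from the nested loops above. Each top-level split maps group names to lists of image files, and the upstream key 'valid' is renamed to 'val' before lookup; the group names below are placeholders.

```python
# Hypothetical content of data_split.json as naf_gather reads it.
data_split = {
    'train': {'group_0': ['img_001.jpg']},
    'valid': {'group_1': ['img_002.jpg']},  # popped and re-inserted as 'val'
    'test': {'group_2': ['img_003.jpg']},
}
```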
Changes to the data obtainer (NaiveDataObtainer):
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import glob
 import os
 import os.path as osp
 import shutil
@@ -148,7 +149,14 @@ class NaiveDataObtainer:
         for src, dst in mapping:
             src = osp.join(self.data_root, src)
             dst = osp.join(self.data_root, dst)
-            if osp.exists(src) and not osp.exists(dst):
+
+            if '*' in src:
+                mkdir_or_exist(dst)
+                for f in glob.glob(src):
+                    if not osp.exists(osp.join(dst, osp.basename(f))):
+                        shutil.move(f, dst)
+
+            elif osp.exists(src) and not osp.exists(dst):
                 shutil.move(src, dst)

     def clean(self) -> None:
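A standalone sketch of the new wildcard branch, not part of the commit: mapping entries whose source contains '*' are expanded with glob, and each match is moved into the destination directory, skipping files that are already there. `mkdir_or_exist` in the diff comes from mmengine; plain `os.makedirs` stands in for it here.

```python
import glob
import os
import os.path as osp
import shutil


def move_fuzzy_mapping(src_pattern: str, dst_dir: str) -> None:
    """Move every file matching ``src_pattern`` into ``dst_dir``."""
    os.makedirs(dst_dir, exist_ok=True)
    for f in glob.glob(src_pattern):
        # Skip files that already exist at the destination.
        if not osp.exists(osp.join(dst_dir, osp.basename(f))):
            shutil.move(f, dst_dir)


# e.g. the annotation mapping from dataset_zoo/naf/textdet.py:
# move_fuzzy_mapping('naf_anno/NAF_dataset-master/groups/**/*.json',
#                    'annotations/')
```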
Changes to the parsers package __init__ (exporting NAFAnnParser):
@@ -3,6 +3,7 @@ from .coco_parser import COCOTextDetAnnParser
 from .funsd_parser import FUNSDTextDetAnnParser
 from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                                ICDARTxtTextRecogAnnParser)
+from .naf_parser import NAFAnnParser
 from .svt_parser import SVTTextDetAnnParser
 from .totaltext_parser import TotaltextTextDetAnnParser
 from .wildreceipt_parser import WildreceiptKIEAnnParser
@@ -10,5 +11,6 @@ from .wildreceipt_parser import WildreceiptKIEAnnParser
 __all__ = [
     'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
     'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
-    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser'
+    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
+    'NAFAnnParser'
 ]
mmocr/datasets/preparers/parsers/naf_parser.py (new file, 110 lines)
@@ -0,0 +1,110 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
from typing import Dict, List, Tuple

import numpy as np

from ..data_preparer import DATA_PARSERS
from .base import BaseParser


@DATA_PARSERS.register_module()
class NAFAnnParser(BaseParser):
    """NAF dataset parser.

    The original annotation format of this dataset is stored in json files,
    which have the following keys that will be used here:

    - 'textBBs': List of text bounding box objects
        - 'poly_points': list of [x, y] pairs, the box corners going
          top-left, top-right, bottom-right, bottom-left
        - 'id': id of the textBB, used to match with the text
    - 'transcriptions': Dict of transcription objects; use the 'id' key
      to match with the textBB.

    Some special characters are used in the transcriptions:

    - "«text»" indicates that "text" had a strikethrough
    - "¿" indicates the transcriber could not read a character
    - "§" indicates the whole line or word was illegible
    - "" (empty string) means the field was blank

    Args:
        data_root (str): Path to the dataset root.
        ignore (list(str)): The text of the ignored instances.
            Defaults to ['#'].
        det (bool): Whether to parse the detection annotation. Defaults to
            True. If False, the parser will handle the special cases in the
            NAF dataset where the transcription is not available.
        nproc (int): Number of processes to load the data. Defaults to 1.
    """

    def __init__(self,
                 data_root: str,
                 ignore: List[str] = ['#'],
                 det: bool = True,
                 nproc: int = 1) -> None:
        self.ignore = ignore
        self.det = det
        super().__init__(data_root=data_root, nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Dict:
        """Convert a single annotation."""
        img_file, json_file = file
        instances = list()
        for poly, text in self.loader(json_file):
            instances.append(
                dict(poly=poly, text=text, ignore=text in self.ignore))

        return img_file, instances

    def loader(self, file_path: str) -> str:
        """Load the annotation of the NAF dataset.

        Args:
            file_path (str): Path to the json file.

        Returns:
            str: Complete annotation of the json file.
        """
        with open(file_path, 'r') as f:
            data = json.load(f)

        # 'textBBs' contains the printed texts of the table while 'fieldBBs'
        # contains the text filled in by humans.
        for box_type in ['textBBs', 'fieldBBs']:
            if not self.det:
                # 'textBBs' is only used for the detection task.
                if box_type == 'textBBs':
                    continue
            for anno in data[box_type]:
                # Skip blanks
                if self.det:
                    if box_type == 'fieldBBs':
                        if anno['type'] == 'blank':
                            continue
                    poly = np.array(anno['poly_points']).reshape(
                        1, 8)[0].tolist()
                    # Since the detection task only needs the poly, we can
                    # skip the transcription part, which can be empty.
                    text = None
                else:
                    # For tasks that need transcriptions, the NAF dataset has
                    # several special cases:
                    # 1. The transcriptions for the whole image are not
                    #    available.
                    # 2. The transcription for a certain text is not
                    #    available.
                    # 3. If the length of the transcription is 0, it should
                    #    be ignored.
                    if 'transcriptions' not in data.keys():
                        break
                    if anno['id'] not in data['transcriptions'].keys():
                        continue
                    text = data['transcriptions'][anno['id']]
                    text = text.strip(
                        '\u202a')  # Remove unicode control character
                    text = text.replace('»', '').replace(
                        '«', '')  # Remove strikethrough flag
                    if len(text) == 0:
                        continue
                    poly = np.array(anno['poly_points']).reshape(
                        1, 8)[0].tolist()
                yield poly, text
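A small usage sketch, not part of the commit: running the parser's loader in detection mode on the sample annotation from dataset_zoo/naf/sample_anno.md, assuming that file has been saved as sample.json and that BaseParser accepts the data_root and nproc arguments passed in __init__ above.

```python
parser = NAFAnnParser(data_root='data/naf', det=True)
for poly, text in parser.loader('sample.json'):
    print(poly, text)
# The first yielded instance comes from textBB 't0'; in det mode text is None:
# [1183, 1337, 2028, 1345, 2032, 1395, 1186, 1398] None
```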