From b79382cd6bc63dde72cb4ef0ff4f2b6e6f9c488c Mon Sep 17 00:00:00 2001
From: Qing Jiang
Date: Thu, 29 Dec 2022 15:19:49 +0800
Subject: [PATCH] [Feature] CodeCamp #115 Add NAF to dataset preparer (#1609)

* add naf converter

* fix test

* update

* use fuzzy search instead

* update

* update
---
 .gitignore                                   |   1 +
 dataset_zoo/naf/metafile.yml                 |  31 +++++
 dataset_zoo/naf/sample_anno.md               |   6 +
 dataset_zoo/naf/textdet.py                   |  49 ++++++++
 dataset_zoo/naf/textrecog.py                 |  19 +++
 dataset_zoo/naf/textspotting.py              |  18 +++
 mmocr/datasets/preparers/data_converter.py   |  46 +++++++-
 mmocr/datasets/preparers/data_obtainer.py    |  10 +-
 mmocr/datasets/preparers/parsers/__init__.py |   4 +-
 .../datasets/preparers/parsers/naf_parser.py | 110 ++++++++++++++++++
 10 files changed, 291 insertions(+), 3 deletions(-)
 create mode 100644 dataset_zoo/naf/metafile.yml
 create mode 100644 dataset_zoo/naf/sample_anno.md
 create mode 100644 dataset_zoo/naf/textdet.py
 create mode 100644 dataset_zoo/naf/textrecog.py
 create mode 100644 dataset_zoo/naf/textspotting.py
 create mode 100644 mmocr/datasets/preparers/parsers/naf_parser.py

diff --git a/.gitignore b/.gitignore
index 64efec76..63510208 100644
--- a/.gitignore
+++ b/.gitignore
@@ -142,3 +142,4 @@ mmocr/.mim
 workdirs/
 .history/
 .dev/
+data/
diff --git a/dataset_zoo/naf/metafile.yml b/dataset_zoo/naf/metafile.yml
new file mode 100644
index 00000000..c816c772
--- /dev/null
+++ b/dataset_zoo/naf/metafile.yml
@@ -0,0 +1,31 @@
+Name: 'NAF'
+Paper:
+  Title: Deep Visual Template-Free Form Parsing
+  URL: https://ieeexplore.ieee.org/abstract/document/8977962
+  Venue: ICDAR
+  Year: '2019'
+  BibTeX: '@inproceedings{davis2019deep,
+    title={Deep visual template-free form parsing},
+    author={Davis, Brian and Morse, Bryan and Cohen, Scott and Price, Brian and Tensmeyer, Chris},
+    booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)},
+    pages={134--141},
+    year={2019},
+    organization={IEEE}}'
+Data:
+  Website: https://github.com/herobd/NAF_dataset
+  Language:
+    - English
+  Scene:
+    - Document
+    - Handwritten
+  Granularity:
+    - Word
+    - Line
+  Tasks:
+    - textrecog
+    - textdet
+    - textspotting
+  License:
+    Type: CDLA
+    Link: https://github.com/herobd/NAF_dataset/blob/master/LICENSE
+  Format: .json
diff --git a/dataset_zoo/naf/sample_anno.md b/dataset_zoo/naf/sample_anno.md
new file mode 100644
index 00000000..9043e763
--- /dev/null
+++ b/dataset_zoo/naf/sample_anno.md
@@ -0,0 +1,6 @@
+**Text Detection/Recognition/Spotting**
+
+```json
+{"fieldBBs": [{"poly_points": [[435, 1406], [466, 1406], [466, 1439], [435, 1439]], "type": "fieldCheckBox", "id": "f0", "isBlank": 1}, {"poly_points": [[435, 1444], [469, 1444], [469, 1478], [435, 1478]], "type": "fieldCheckBox", "id": "f1", "isBlank": 1}],
+"textBBs": [{"poly_points": [[1183, 1337], [2028, 1345], [2032, 1395], [1186, 1398]], "type": "text", "id": "t0"}, {"poly_points": [[492, 1336], [809, 1338], [809, 1379], [492, 1378]], "type": "text", "id": "t1"}, {"poly_points": [[512, 1375], [798, 1376], [798, 1405], [512, 1404]], "type": "textInst", "id": "t2"}], "imageFilename": "007182398_00026.jpg", "transcriptions": {"f0": "\u00bf\u00bf\u00bf \u00bf\u00bf\u00bf 18/1/49 \u00bf\u00bf\u00bf\u00bf\u00bf", "f1": "U.S. Navy 53rd. Naval Const. Batt.", "t0": "APPLICATION FOR HEADSTONE OR MARKER", "t1": "ORIGINAL"}}
+```
diff --git a/dataset_zoo/naf/textdet.py b/dataset_zoo/naf/textdet.py
new file mode 100644
index 00000000..25261b71
--- /dev/null
+++ b/dataset_zoo/naf/textdet.py
@@ -0,0 +1,49 @@
+data_root = 'data/naf'
+cache_path = 'data/cache'
+
+data_obtainer = dict(
+    type='NaiveDataObtainer',
+    cache_path=cache_path,
+    data_root=data_root,
+    files=[
+        dict(
+            url='https://github.com/herobd/NAF_dataset/releases/'
+            'download/v1.0/labeled_images.tar.gz',
+            save_name='naf_image.tar.gz',
+            md5='6521cdc25c313a1f2928a16a77ad8f29',
+            split=['train', 'test', 'val'],
+            content=['image'],
+            mapping=[['naf_image/labeled_images', 'temp_images/']]),
+        dict(
+            url='https://github.com/herobd/NAF_dataset/archive/'
+            'refs/heads/master.zip',
+            save_name='naf_anno.zip',
+            md5='abf5af6266cc527d772231751bc884b3',
+            split=['train', 'test', 'val'],
+            content=['annotation'],
+            mapping=[
+                [
+                    'naf_anno/NAF_dataset-master/groups/**/*.json',
+                    'annotations/'
+                ],
+                [
+                    'naf_anno/NAF_dataset-master/train_valid_test_split.json',
+                    'data_split.json'
+                ]
+            ]),
+    ])
+
+data_converter = dict(
+    type='TextDetDataConverter',
+    splits=['train', 'test', 'val'],
+    data_root=data_root,
+    gatherer=dict(type='naf_gather'),
+    parser=dict(type='NAFAnnParser', data_root=data_root, det=True),
+    delete=['temp_images', 'data_split.json', 'annotations', 'naf_anno'],
+    dumper=dict(type='JsonDumper'),
+    nproc=1)
+
+config_generator = dict(
+    type='TextDetConfigGenerator',
+    data_root=data_root,
+    val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')])
diff --git a/dataset_zoo/naf/textrecog.py b/dataset_zoo/naf/textrecog.py
new file mode 100644
index 00000000..943bd8cd
--- /dev/null
+++ b/dataset_zoo/naf/textrecog.py
@@ -0,0 +1,19 @@
+# The transcriptions of the NAF dataset were generated with Tesseract OCR
+# and are not accurate. The test/valid sets were hand-corrected, but the
+# train set was only partially corrected, so the labels are unreliable. It
+# is better not to use them for recognition and text spotting.
+
+_base_ = ['textdet.py']
+data_root = 'data/naf'
+
+data_converter = dict(
+    type='TextRecogCropConverter',
+    parser=dict(
+        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
+        det=False),
+    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])
+
+config_generator = dict(
+    type='TextRecogConfigGenerator',
+    data_root=data_root,
+    val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')])
diff --git a/dataset_zoo/naf/textspotting.py b/dataset_zoo/naf/textspotting.py
new file mode 100644
index 00000000..97b50b4a
--- /dev/null
+++ b/dataset_zoo/naf/textspotting.py
@@ -0,0 +1,18 @@
+# The transcriptions of the NAF dataset were generated with Tesseract OCR
+# and are not accurate. The test/valid sets were hand-corrected, but the
+# train set was only partially corrected, so the labels are unreliable. It
+# is better not to use them for recognition and text spotting.
+
+_base_ = ['textdet.py']
+data_root = 'data/naf'
+data_converter = dict(
+    type='TextSpottingDataConverter',
+    parser=dict(
+        type='NAFAnnParser', data_root=data_root, ignore=['¿', '§'],
+        det=False),
+    delete=['temp_images', 'naf_anno', 'data_split.json', 'annotations'])
+
+config_generator = dict(
+    type='TextSpottingConfigGenerator',
+    data_root=data_root,
+    val_anns=[dict(ann_file='textspotting_val.json', dataset_postfix='')])
diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py
index 8640c586..a1d0b1f6 100644
--- a/mmocr/datasets/preparers/data_converter.py
+++ b/mmocr/datasets/preparers/data_converter.py
@@ -1,5 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
+import os
 import os.path as osp
 import re
 import shutil
+import warnings
@@ -61,6 +63,8 @@ class BaseDataConverter:
             self.gatherer = self.pair_gather
         elif gather_type == 'mono_gather':
             self.gatherer = self.mono_gather
+        elif gather_type == 'naf_gather':
+            self.gatherer = self.naf_gather
         else:
             raise NotImplementedError
 
@@ -181,11 +185,51 @@ class BaseDataConverter:
 
         return files
 
+    def naf_gather(self, img_path: str, ann_path: str,
+                   **kwargs) -> List[Tuple]:
+        """Gather the dataset files from the NAF dataset. Specifically for
+        the case where a split file contains the names of the different
+        splits. For example,
+
+            img_001.jpg                           train: img_001.jpg
+            img_002.jpg ---> data_split.json ---> test: img_002.jpg
+            img_003.jpg                           val: img_003.jpg
+
+        Args:
+            img_path (str): Path to the images.
+            ann_path (str): Path to the annotations.
+        Returns:
+            List[Tuple]: A list of tuples (img_path, ann_path).
+        """
+        split_file = osp.join(self.data_root, 'data_split.json')
+        with open(split_file, 'r') as f:
+            split_data = json.load(f)
+        files = []
+        # Rename the key
+        split_data['val'] = split_data.pop('valid')
+        if not osp.exists(img_path):
+            os.makedirs(img_path)
+        for groups in split_data[self.current_split]:
+            for img_name in split_data[self.current_split][groups]:
+                src_img = osp.join(self.data_root, 'temp_images', img_name)
+                dst_img = osp.join(img_path, img_name)
+                if not osp.exists(src_img):
+                    warnings.warn(f'{src_img} does not exist!')
+                    continue
+                # Move the image to the new path
+                shutil.move(src_img, dst_img)
+                ann = osp.join(ann_path, img_name.replace('.jpg', '.json'))
+                files.append((dst_img, ann))
+        return files
+
     def clean(self) -> None:
         for d in self.delete:
             delete_file = osp.join(self.data_root, d)
             if osp.exists(delete_file):
-                shutil.rmtree(delete_file)
+                if osp.isdir(delete_file):
+                    shutil.rmtree(delete_file)
+                else:
+                    os.remove(delete_file)
 
 
 @DATA_CONVERTERS.register_module()
diff --git a/mmocr/datasets/preparers/data_obtainer.py b/mmocr/datasets/preparers/data_obtainer.py
index 5d7492cd..98ffdfd1 100644
--- a/mmocr/datasets/preparers/data_obtainer.py
+++ b/mmocr/datasets/preparers/data_obtainer.py
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import glob
 import os
 import os.path as osp
 import shutil
@@ -148,7 +149,14 @@ class NaiveDataObtainer:
         for src, dst in mapping:
             src = osp.join(self.data_root, src)
             dst = osp.join(self.data_root, dst)
-            if osp.exists(src) and not osp.exists(dst):
+
+            if '*' in src:
+                mkdir_or_exist(dst)
+                for f in glob.glob(src):
+                    if not osp.exists(osp.join(dst, osp.basename(f))):
+                        shutil.move(f, dst)
+
+            elif osp.exists(src) and not osp.exists(dst):
                 shutil.move(src, dst)
 
     def clean(self) -> None:
diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py
index aa0ed8e5..cdd08de1 100644
--- a/mmocr/datasets/preparers/parsers/__init__.py
+++ b/mmocr/datasets/preparers/parsers/__init__.py
@@ -3,6 +3,7 @@ from .coco_parser import COCOTextDetAnnParser
 from .funsd_parser import FUNSDTextDetAnnParser
 from .icdar_txt_parser import (ICDARTxtTextDetAnnParser,
                                ICDARTxtTextRecogAnnParser)
+from .naf_parser import NAFAnnParser
 from .svt_parser import SVTTextDetAnnParser
 from .totaltext_parser import TotaltextTextDetAnnParser
 from .wildreceipt_parser import WildreceiptKIEAnnParser
@@ -10,5 +11,6 @@ from .wildreceipt_parser import WildreceiptKIEAnnParser
 __all__ = [
     'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser',
     'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser',
-    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser'
+    'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser',
+    'NAFAnnParser'
 ]
diff --git a/mmocr/datasets/preparers/parsers/naf_parser.py b/mmocr/datasets/preparers/parsers/naf_parser.py
new file mode 100644
index 00000000..1e9a6164
--- /dev/null
+++ b/mmocr/datasets/preparers/parsers/naf_parser.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+from typing import Generator, List, Tuple
+
+import numpy as np
+
+from ..data_preparer import DATA_PARSERS
+from .base import BaseParser
+
+
+@DATA_PARSERS.register_module()
+class NAFAnnParser(BaseParser):
+    """NAF dataset parser.
+
+    The original annotations of this dataset are stored in json files,
+    which contain the following keys that are used here:
+    - 'textBBs': List of text bounding box objects
+      - 'poly_points': list of [x, y] pairs, the box corners going
+        top-left, top-right, bottom-right, bottom-left
+      - 'id': id of the textBB, used to match with the text
+    - 'transcriptions': Dict of transcription objects, use the 'id' key
+      to match with the textBB.
+
+    Some special characters are used in the transcriptions:
+    "«text»" indicates that "text" had a strikethrough
+    "¿" indicates the transcriber could not read a character
+    "§" indicates the whole line or word was illegible
+    "" (empty string) means the field was blank
+
+    Args:
+        data_root (str): Path to the dataset root.
+        ignore (list(str)): The text of the ignored instances.
+            Default: ['#'].
+        det (bool): Whether to parse the detection annotation. Default: True.
+            If False, the parser will handle the special cases in the NAF
+            dataset where the transcription is not available.
+        nproc (int): Number of processes to load the data. Default: 1.
+ """ + + def __init__(self, + data_root: str, + ignore: List[str] = ['#'], + det: bool = True, + nproc: int = 1) -> None: + self.ignore = ignore + self.det = det + super().__init__(data_root=data_root, nproc=nproc) + + def parse_file(self, file: Tuple, split: str) -> Dict: + """Convert single annotation.""" + img_file, json_file = file + instances = list() + for poly, text in self.loader(json_file): + instances.append( + dict(poly=poly, text=text, ignore=text in self.ignore)) + + return img_file, instances + + def loader(self, file_path: str) -> str: + """Load the annotation of the NAF dataset. + + Args: + file_path (str): Path to the json file + + Retyrb: + str: Complete annotation of the json file + """ + with open(file_path, 'r') as f: + data = json.load(f) + + # 'textBBs' contains the printed texts of the table while 'fieldBBs' + # contains the text filled by human. + for box_type in ['textBBs', 'fieldBBs']: + if not self.det: + # 'textBBs' is only used for detection task. + if box_type == 'textBBs': + continue + for anno in data[box_type]: + # Skip blanks + if self.det: + if box_type == 'fieldBBs': + if anno['type'] == 'blank': + continue + poly = np.array(anno['poly_points']).reshape( + 1, 8)[0].tolist() + # Since detection task only need poly, we can skip the + # transcription part that can be empty. + text = None + else: + # For tasks that need transcription, NAF dataset has + # serval special cases: + # 1. The transcription for the whole image is not + # available. + # 2. The transcription for the certain text is not + # available. + # 3. If the length of the transcription is 0, it should + # be ignored. + if 'transcriptions' not in data.keys(): + break + if anno['id'] not in data['transcriptions'].keys(): + continue + text = data['transcriptions'][anno['id']] + text = text.strip( + '\u202a') # Remove unicode control character + text = text.replace('»', '').replace( + '«', '') # Remove strikethrough flag + if len(text) == 0: + continue + poly = np.array(anno['poly_points']).reshape( + 1, 8)[0].tolist() + yield poly, text