mirror of https://github.com/open-mmlab/mmocr.git
commit 13986f497d (parent d9bb3d6359)
Changed: tools/data/textdet, tools/data/textrecog

@ -31,6 +31,7 @@
| BID | [homepage](https://github.com/ricardobnjunior/Brazilian-Identity-Document-Dataset) | - | - | - |
| RCTW | [homepage](https://rctw.vlrlab.net/index.html) | - | - | - |
| HierText | [homepage](https://github.com/google-research-datasets/hiertext) | - | - | - |
| ArT | [homepage](https://rrc.cvc.uab.es/?ch=14) | - | - | - |

### Install AWS CLI (optional)
@ -941,3 +942,41 @@ inconsistency results in false examples in the training set. Therefore, users sh
│   ├── instances_training.json
│   └── instances_val.json
```

## ArT

- Step1: Download `train_images.tar.gz` and `train_labels.json` from the [homepage](https://rrc.cvc.uab.es/?ch=14&com=downloads) to `art/`

```bash
mkdir art && cd art
mkdir annotations

# Download ArT dataset
wget https://dataset-bj.cdn.bcebos.com/art/train_images.tar.gz --no-check-certificate
wget https://dataset-bj.cdn.bcebos.com/art/train_labels.json --no-check-certificate

# Extract
tar -xf train_images.tar.gz
mv train_images imgs
mv train_labels.json annotations/

# Remove unnecessary files
rm train_images.tar.gz
```

- Step2: Generate `instances_training.json` and `instances_val.json` (optional). Since the test annotations are not publicly available, you may specify `--val-ratio` to reserve part of the training data as a validation set. For example, with `--val-ratio 0.2`, 20% of the data is held out for validation.

```bash
# Annotations of the ArT test split are not publicly available; add --val-ratio 0.2 to split off a validation set
python tools/data/textdet/art_converter.py PATH/TO/art
```
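
For reference, the split produced by `--val-ratio` is deterministic rather than random: every ⌊1/ratio⌋-th image goes to the validation set. A minimal standalone sketch of the same logic (the function name and sample prefixes are illustrative, not part of the converter):

```python
import math


def split_prefixes(prefixes, ratio=0.2):
    """Round-robin split: every floor(1/ratio)-th item goes to val."""
    step = math.floor(1 / ratio)  # ratio=0.2 -> step=5
    trn, val = [], []
    for i, prefix in enumerate(prefixes):
        # Same test as the converter: indices 0, step, 2*step, ... go to val
        (trn if i % step else val).append(prefix)
    return trn, val


trn, val = split_prefixes([f'gt_{i}' for i in range(10)])
print(len(trn), len(val))  # -> 8 2
```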

- After running the above commands, the directory structure should be as follows:

```text
│── art
│   ├── annotations
│   ├── imgs
│   ├── instances_training.json
│   └── instances_val.json (optional)
```
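
The generated `instances_training.json` is a COCO-style annotation file. As a rough sketch of how it might be referenced from a detection config (the dataset type, paths, and empty pipelines here are assumptions; consult the configs shipped with your MMOCR version):

```python
# A minimal sketch, assuming MMOCR 0.x's COCO-style IcdarDataset
dataset_type = 'IcdarDataset'
data_root = 'data/art'  # wherever art/ was prepared

train = dict(
    type=dataset_type,
    ann_file=f'{data_root}/instances_training.json',
    img_prefix=f'{data_root}/imgs',
    pipeline=[])  # fill in a training pipeline

test = dict(
    type=dataset_type,
    ann_file=f'{data_root}/instances_val.json',
    img_prefix=f'{data_root}/imgs',
    pipeline=[])  # fill in a test pipeline
```
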
@ -35,6 +35,7 @@
| BID | [homepage](https://github.com/ricardobnjunior/Brazilian-Identity-Document-Dataset) | - | - | - |
| RCTW | [homepage](https://rctw.vlrlab.net/index.html) | - | - | - |
| HierText | [homepage](https://github.com/google-research-datasets/hiertext) | - | - | - |
| ArT | [homepage](https://rrc.cvc.uab.es/?ch=14) | - | - | - |

(*) Since the official homepage is currently unavailable, we provide an alternative link for quick reference. However, we do not guarantee the correctness of the dataset.
@ -1116,3 +1117,40 @@ should be as follows:
│   ├── train_label.jsonl
│   └── val_label.jsonl
```

## ArT

- Step1: Download `train_task2_images.tar.gz` and `train_task2_labels.json` from the [homepage](https://rrc.cvc.uab.es/?ch=14&com=downloads) to `art/`

```bash
mkdir art && cd art
mkdir annotations

# Download ArT dataset
wget https://dataset-bj.cdn.bcebos.com/art/train_task2_images.tar.gz
wget https://dataset-bj.cdn.bcebos.com/art/train_task2_labels.json

# Extract
tar -xf train_task2_images.tar.gz
mv train_task2_images crops
mv train_task2_labels.json annotations/

# Remove unnecessary files
rm train_task2_images.tar.gz
```

- Step2: Generate `train_label.jsonl` and `val_label.jsonl` (optional). Since the test annotations are not publicly available, you may specify `--val-ratio` to reserve part of the training data as a validation set. For example, with `--val-ratio 0.2`, 20% of the data is held out for validation.

```bash
# Annotations of the ArT test split are not publicly available; add --val-ratio 0.2 to split off a validation set
python tools/data/textrecog/art_converter.py PATH/TO/art
```
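
With the default `--format jsonl`, each output line is a standalone JSON object with `filename` and `text` keys, e.g. `{"filename": "crops/gt_2836_0.jpg", "text": "URDER"}` (reusing the sample from the converter's docstring). A quick sanity check after conversion:

```python
import json

# Print the first few entries of the generated label file
with open('art/train_label.jsonl', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i == 3:
            break
        entry = json.loads(line)
        print(entry['filename'], entry['text'])
```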

- After running the above commands, the directory structure should be as follows:

```text
│── art
│   ├── crops
│   ├── train_label.jsonl
│   └── val_label.jsonl (optional)
```
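
To consume these labels for recognition training, a config would point a dataset at `crops/` and the jsonl file. The sketch below assumes MMOCR 0.x's `OCRDataset` with an `AnnFileLoader`/`LineJsonParser` combination; the exact type names vary across versions, so double-check against your installation:

```python
# A minimal sketch; type names are assumptions and may differ per version
train = dict(
    type='OCRDataset',
    img_prefix='data/art/crops',
    ann_file='data/art/train_label.jsonl',
    loader=dict(
        type='AnnFileLoader',
        repeat=1,
        parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
    pipeline=[],  # fill in a recognition pipeline
    test_mode=False)
```
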
tools/data/textdet/art_converter.py
@ -0,0 +1,144 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import math
import os.path as osp

import mmcv

from mmocr.utils import convert_annotations


def parse_args():
    parser = argparse.ArgumentParser(
        description='Generate training and validation set of ArT')
    parser.add_argument('root_path', help='Root dir path of ArT')
    parser.add_argument(
        '--val-ratio', help='Split ratio for val set', default=0.0, type=float)
    args = parser.parse_args()
    return args


def collect_art_info(root_path, split, ratio, print_every=1000):
    """Collect the annotation information.

    The annotation format is as follows:
    {
        'gt_1726':  # 'gt_1726' is the image file name
        [
            {
                'transcription': '燎申集团',
                'points': [
                    [141, 199],
                    [237, 201],
                    [313, 236],
                    [357, 283],
                    [359, 300],
                    [309, 261],
                    [233, 230],
                    [140, 231]
                ],
                'language': 'Chinese',
                'illegibility': False
            },
            ...
        ],
        ...
    }

    Args:
        root_path (str): Root path to the dataset
        split (str): Dataset split, which should be 'train' or 'val'
        ratio (float): Split ratio for val set
        print_every (int): Print log info per iteration

    Returns:
        img_infos (list[dict]): A list of image and annotation information
    """

    annotation_path = osp.join(root_path, 'annotations/train_labels.json')
    if not osp.exists(annotation_path):
        raise Exception(
            f'{annotation_path} does not exist, please check and try again.')

    annotation = mmcv.load(annotation_path)
    img_prefixes = annotation.keys()

    trn_files, val_files = [], []
    if ratio > 0:
        for i, file in enumerate(img_prefixes):
            # Every floor(1/ratio)-th sample goes to the val set
            # (assumes 0 < ratio < 1)
            if i % math.floor(1 / ratio):
                trn_files.append(file)
            else:
                val_files.append(file)
    else:
        trn_files, val_files = img_prefixes, []
    print(f'training #{len(trn_files)}, val #{len(val_files)}')

    if split == 'train':
        img_prefixes = trn_files
    elif split == 'val':
        img_prefixes = val_files
    else:
        raise NotImplementedError

    img_infos = []
    for i, prefix in enumerate(img_prefixes):
        if i > 0 and i % print_every == 0:
            print(f'{i}/{len(img_prefixes)}')
        img_file = osp.join(root_path, 'imgs', prefix + '.jpg')
        # Skip images that do not exist
        if not osp.exists(img_file):
            continue
        img = mmcv.imread(img_file)

        img_info = dict(
            file_name=osp.basename(img_file),
            height=img.shape[0],
            width=img.shape[1],
            segm_file=osp.basename(annotation_path))

        anno_info = []
        for ann in annotation[prefix]:
            segmentation = []
            for x, y in ann['points']:
                segmentation.append(max(0, x))
                segmentation.append(max(0, y))
            xs, ys = segmentation[::2], segmentation[1::2]
            x, y = min(xs), min(ys)
            w, h = max(xs) - x, max(ys) - y
            bbox = [x, y, w, h]
            # Illegible or '###' instances are marked as crowd regions
            if ann['transcription'] == '###' or ann['illegibility']:
                iscrowd = 1
            else:
                iscrowd = 0
            anno = dict(
                iscrowd=iscrowd,
                category_id=1,
                bbox=bbox,
                area=w * h,
                segmentation=[segmentation])
            anno_info.append(anno)
        img_info.update(anno_info=anno_info)
        img_infos.append(img_info)

    return img_infos


def main():
    args = parse_args()
    root_path = args.root_path
    print('Processing training set...')
    training_infos = collect_art_info(root_path, 'train', args.val_ratio)
    convert_annotations(training_infos,
                        osp.join(root_path, 'instances_training.json'))
    if args.val_ratio > 0:
        print('Processing validation set...')
        val_infos = collect_art_info(root_path, 'val', args.val_ratio)
        convert_annotations(val_infos, osp.join(root_path,
                                                'instances_val.json'))
    print('Finish')


if __name__ == '__main__':
    main()

tools/data/textrecog/art_converter.py
@ -0,0 +1,129 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
import math
import os.path as osp

import mmcv

from mmocr.utils.fileio import list_to_file


def parse_args():
    parser = argparse.ArgumentParser(
        description='Generate training and validation set of ArT')
    parser.add_argument('root_path', help='Root dir path of ArT')
    parser.add_argument(
        '--val-ratio', help='Split ratio for val set', default=0.0, type=float)
    parser.add_argument(
        '--format',
        default='jsonl',
        help='Use jsonl or string to format annotations',
        choices=['jsonl', 'txt'])
    args = parser.parse_args()
    return args


def convert_art(root_path, split, ratio, format):
    """Collect the annotation information.

    The annotation format is as follows:
    {
        "gt_2836_0": [
            {
                "transcription": "URDER",
                "points": [
                    [25, 51],
                    [0, 2],
                    [21, 0],
                    [42, 43]
                ],
                "language": "Latin",
                "illegibility": false
            }
        ], ...
    }

    Args:
        root_path (str): The root path of the dataset
        split (str): The split of dataset, which should be 'train' or 'val'
        ratio (float): Split ratio for val set
        format (str): Annotation format, which should be 'txt' or 'jsonl'
    """

    annotation_path = osp.join(root_path,
                               'annotations/train_task2_labels.json')
    if not osp.exists(annotation_path):
        raise Exception(
            f'{annotation_path} does not exist, please check and try again.')

    annotation = mmcv.load(annotation_path)
    # The output label file
    dst_label_file = osp.join(root_path, f'{split}_label.{format}')

    img_prefixes = annotation.keys()

    trn_files, val_files = [], []
    if ratio > 0:
        for i, file in enumerate(img_prefixes):
            # Every floor(1/ratio)-th sample goes to the val set
            # (assumes 0 < ratio < 1)
            if i % math.floor(1 / ratio):
                trn_files.append(file)
            else:
                val_files.append(file)
    else:
        trn_files, val_files = img_prefixes, []
    print(f'training #{len(trn_files)}, val #{len(val_files)}')

    if split == 'train':
        img_prefixes = trn_files
    elif split == 'val':
        img_prefixes = val_files
    else:
        raise NotImplementedError

    labels = []
    for prefix in img_prefixes:
        text_label = annotation[prefix][0]['transcription']
        dst_img_name = prefix + '.jpg'

        if format == 'txt':
            labels.append(f'crops/{dst_img_name} {text_label}')
        elif format == 'jsonl':
            labels.append(
                json.dumps(
                    {
                        'filename': f'crops/{dst_img_name}',
                        'text': text_label
                    },
                    ensure_ascii=False))

    list_to_file(dst_label_file, labels)


def main():
    args = parse_args()
    root_path = args.root_path
    print('Processing training set...')
    convert_art(
        root_path=root_path,
        split='train',
        ratio=args.val_ratio,
        format=args.format)
    if args.val_ratio > 0:
        print('Processing validation set...')
        convert_art(
            root_path=root_path,
            split='val',
            ratio=args.val_ratio,
            format=args.format)
    print('Finish')


if __name__ == '__main__':
    main()