P3: Update textdet data conversion scripts

pull/1178/head
gaotongxiao 2022-06-13 07:48:12 +00:00
parent 3992f0d78e
commit 1af7f94a63
25 changed files with 104 additions and 137 deletions

View File

@ -7,8 +7,7 @@ from .box_util import (bezier_to_polygon, is_on_same_line, sort_points,
from .check_argument import (equal_len, is_2dlist, is_3dlist, is_none_or_type,
is_type_list, valid_boundary)
from .collect_env import collect_env
from .data_convert_util import (convert_annotations, dump_ocr_data,
recog_anno_to_imginfo)
from .data_convert_util import dump_ocr_data, recog_anno_to_imginfo
from .fileio import list_from_file, list_to_file
from .img_util import drop_orientation, is_not_png
from .lmdb_util import recog2lmdb
@ -25,13 +24,12 @@ from .string_util import StringStrip
__all__ = [
'Registry', 'build_from_cfg', 'get_root_logger', 'collect_env',
'is_3dlist', 'is_type_list', 'is_none_or_type', 'equal_len', 'is_2dlist',
'valid_boundary', 'drop_orientation', 'convert_annotations', 'is_not_png',
'list_to_file', 'list_from_file', 'is_on_same_line',
'stitch_boxes_into_lines', 'StringStrip', 'revert_sync_batchnorm',
'bezier_to_polygon', 'sort_points', 'recog2lmdb', 'dump_ocr_data',
'recog_anno_to_imginfo', 'rescale_polygons', 'rescale_polygon',
'rescale_bboxes', 'bbox2poly', 'crop_polygon', 'is_poly_inside_rect',
'poly2bbox', 'poly_intersection', 'poly_iou', 'poly_make_valid',
'poly_union', 'poly2shapely', 'polys2shapely', 'register_all_modules',
'dist_points2line', 'offset_polygon'
'valid_boundary', 'drop_orientation', 'is_not_png', 'list_to_file',
'list_from_file', 'is_on_same_line', 'stitch_boxes_into_lines',
'StringStrip', 'revert_sync_batchnorm', 'bezier_to_polygon', 'sort_points',
'recog2lmdb', 'dump_ocr_data', 'recog_anno_to_imginfo', 'rescale_polygons',
'rescale_polygon', 'rescale_bboxes', 'bbox2poly', 'crop_polygon',
'is_poly_inside_rect', 'poly2bbox', 'poly_intersection', 'poly_iou',
'poly_make_valid', 'poly_union', 'poly2shapely', 'polys2shapely',
'register_all_modules', 'dist_points2line', 'offset_polygon'
]

View File

@ -7,47 +7,6 @@ import mmcv
from mmocr.utils import is_type_list
# TODO: Remove this function once no converter depends on it
def convert_annotations(image_infos, out_json_name):
    """Assemble per-image annotation dicts into one COCO-style dict and dump
    it to ``out_json_name``.

    The input dicts are modified in place: every image dict receives an
    ``id`` and loses its ``anno_info`` entry, and every annotation dict
    receives ``image_id`` and ``id``.

    Args:
        image_infos (list): Image information dicts. Each one must carry an
            ``anno_info`` entry holding an iterable of annotation dicts.
        out_json_name (str): Non-empty filename of the output json.

    Returns:
        dict: The COCO-style dict with ``images`` and ``categories`` and,
        unless no annotation was collected, ``annotations``.
    """
    assert isinstance(image_infos, list)
    assert isinstance(out_json_name, str)
    assert out_json_name

    images = []
    annotations = []
    for img_id, image_info in enumerate(image_infos):
        image_info['id'] = img_id
        # Detach the annotations first so that the stored image dict no
        # longer contains them.
        image_annos = image_info.pop('anno_info')
        images.append(image_info)
        for anno_info in image_annos:
            anno_info['image_id'] = img_id
            # Annotation ids run consecutively over the whole dataset, which
            # is exactly the number of annotations collected so far.
            anno_info['id'] = len(annotations)
            annotations.append(anno_info)

    # Key order (images, categories, annotations) is kept so the dumped file
    # has the same layout as before.
    out_json = dict(
        images=images,
        categories=[dict(id=1, name='text')],
        annotations=annotations)

    # An empty annotation list is omitted from the output entirely.
    if not annotations:
        out_json.pop('annotations')

    mmcv.dump(out_json, out_json_name)
    return out_json
def dump_ocr_data(image_infos: Sequence[Dict], out_json_name: str,
task_name: str) -> Dict:
"""Dump the annotation in openmmlab style.

View File

@ -5,7 +5,7 @@ import os.path as osp
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir):
@ -175,9 +175,9 @@ def main():
image_infos = [image_infos]
splits = ['training']
for i, split in enumerate(splits):
convert_annotations(
image_infos[i],
osp.join(root_path, 'instances_' + split + '.json'))
dump_ocr_data(image_infos[i],
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')
if __name__ == '__main__':

View File

@ -5,7 +5,7 @@ import os.path as osp
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def parse_args():
@ -111,11 +111,12 @@ def main():
root_path = args.root_path
print('Processing training set...')
training_infos = collect_cocotext_info(root_path, 'train')
convert_annotations(training_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(training_infos,
osp.join(root_path, 'instances_training.json'), 'textdet')
print('Processing validation set...')
val_infos = collect_cocotext_info(root_path, 'val')
convert_annotations(val_infos, osp.join(root_path, 'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
print('Finish')

View File

@ -9,7 +9,7 @@ import mmcv
import numpy as np
from shapely.geometry import Polygon
from mmocr.utils import convert_annotations, list_from_file
from mmocr.utils import dump_ocr_data, list_from_file
def collect_files(img_dir, gt_dir, split):
@ -224,7 +224,7 @@ def main():
files = collect_files(
osp.join(img_dir, split), osp.join(gt_dir, split), split)
image_infos = collect_annotations(files, split, nproc=args.nproc)
convert_annotations(image_infos, osp.join(out_dir, json_name))
dump_ocr_data(image_infos, osp.join(out_dir, json_name), 'textdet')
if __name__ == '__main__':

View File

@ -6,7 +6,7 @@ import os.path as osp
import mmcv
import numpy as np
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir):
@ -152,9 +152,9 @@ def main():
osp.join(root_path, 'imgs', split),
osp.join(root_path, 'annotations', split))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')
if __name__ == '__main__':

View File

@ -6,7 +6,7 @@ import os.path as osp
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir):
@ -148,9 +148,9 @@ def main():
osp.join(root_path, 'imgs'),
osp.join(root_path, 'annotations', split))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')
if __name__ == '__main__':

View File

@ -6,7 +6,7 @@ import os.path as osp
import numpy as np
from shapely.geometry import Polygon
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_level_info(annotation):
@ -139,11 +139,12 @@ def main():
root_path = args.root_path
print('Processing training set...')
training_infos = collect_hiertext_info(root_path, args.level, 'train')
convert_annotations(training_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(training_infos,
osp.join(root_path, 'instances_training.json'), 'textdet')
print('Processing validation set...')
val_infos = collect_hiertext_info(root_path, args.level, 'val')
convert_annotations(val_infos, osp.join(root_path, 'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
print('Finish')

View File

@ -6,7 +6,7 @@ import os.path as osp
import mmcv
from PIL import Image
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def convert_gif(img_path):
@ -163,9 +163,9 @@ def main():
osp.join(root_path, 'imgs', split),
osp.join(root_path, 'annotations', split))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')
if __name__ == '__main__':

View File

@ -5,7 +5,7 @@ import os.path as osp
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir, split):
@ -156,9 +156,9 @@ def main():
osp.join(root_path, 'imgs', split),
osp.join(root_path, 'annotations', split), split)
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')
if __name__ == '__main__':

View File

@ -8,7 +8,7 @@ import mmcv
import numpy as np
from shapely.geometry import Polygon
from mmocr.utils import convert_annotations, list_from_file
from mmocr.utils import dump_ocr_data, list_from_file
def collect_files(img_dir, gt_dir):
@ -176,7 +176,7 @@ def main():
osp.join(img_dir, split), osp.join(gt_dir, split))
image_infos = collect_annotations(
files, args.dataset, nproc=args.nproc)
convert_annotations(image_infos, osp.join(out_dir, json_name))
dump_ocr_data(image_infos, osp.join(out_dir, json_name), 'textdet')
if __name__ == '__main__':

View File

@ -6,7 +6,7 @@ import xml.etree.ElementTree as ET
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir):
@ -196,9 +196,9 @@ def main():
image_infos = [image_infos]
splits = ['training']
for i, split in enumerate(splits):
convert_annotations(
dump_ocr_data(
list(filter(None, image_infos[i])),
osp.join(root_path, 'instances_' + split + '.json'))
osp.join(root_path, 'instances_' + split + '.json'), 'textdet')
if __name__ == '__main__':

View File

@ -6,7 +6,7 @@ import os.path as osp
import mmcv
import numpy as np
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def parse_args():
@ -141,8 +141,9 @@ def main():
with mmcv.Timer(print_tmpl='It takes {}s to convert IMGUR annotation'):
anno_infos = collect_imgur_info(
root_path, f'imgur5k_annotations_{split}.json')
convert_annotations(anno_infos,
osp.join(root_path, f'instances_{split}.json'))
dump_ocr_data(anno_infos,
osp.join(root_path, f'instances_{split}.json'),
'textdet')
if __name__ == '__main__':

View File

@ -7,7 +7,7 @@ import xml.etree.ElementTree as ET
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir, ratio):
@ -183,16 +183,17 @@ def main():
trn_infos = collect_annotations(trn_files, nproc=args.nproc)
with mmcv.Timer(
print_tmpl='It takes {}s to convert KAIST Training annotation'):
convert_annotations(trn_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(trn_infos, osp.join(root_path,
'instances_training.json'),
'textdet')
# Val set
if len(val_files) > 0:
val_infos = collect_annotations(val_files, nproc=args.nproc)
with mmcv.Timer(
print_tmpl='It takes {}s to convert KAIST Val annotation'):
convert_annotations(val_infos,
osp.join(root_path, 'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
if __name__ == '__main__':

View File

@ -5,7 +5,7 @@ import os.path as osp
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def parse_args():
@ -115,13 +115,13 @@ def main():
root_path = args.root_path
print('Processing training set...')
training_infos = collect_lsvt_info(root_path, 'train', args.val_ratio)
convert_annotations(training_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(training_infos,
osp.join(root_path, 'instances_training.json'), 'textdet')
if args.val_ratio > 0:
print('Processing validation set...')
val_infos = collect_lsvt_info(root_path, 'val', args.val_ratio)
convert_annotations(val_infos, osp.join(root_path,
'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
print('Finish')

View File

@ -6,7 +6,7 @@ import xml.etree.ElementTree as ET
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(data_dir):
@ -171,9 +171,9 @@ def main():
with mmcv.Timer(print_tmpl='It takes {}s to convert LV annotation'):
files = collect_files(osp.join(root_path, 'imgs', split))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')
if __name__ == '__main__':

View File

@ -8,7 +8,7 @@ import cv2
import mmcv
from PIL import Image
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir, ratio):
@ -185,16 +185,17 @@ def main():
trn_infos = collect_annotations(trn_files, nproc=args.nproc)
with mmcv.Timer(
print_tmpl='It takes {}s to convert MTWI Training annotation'):
convert_annotations(trn_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(trn_infos, osp.join(root_path,
'instances_training.json'),
'textdet')
# Val set
if len(val_files) > 0:
val_infos = collect_annotations(val_files, nproc=args.nproc)
with mmcv.Timer(
print_tmpl='It takes {}s to convert MTWI Val annotation'):
convert_annotations(val_infos,
osp.join(root_path, 'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
if __name__ == '__main__':

View File

@ -4,7 +4,7 @@ import os.path as osp
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir, split_info):
@ -186,9 +186,9 @@ def main():
osp.join(root_path, 'imgs'),
osp.join(root_path, 'annotations'), split_info[split])
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')
if __name__ == '__main__':

View File

@ -6,7 +6,7 @@ import os.path as osp
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir, ratio):
@ -175,16 +175,17 @@ def main():
with mmcv.Timer(
print_tmpl='It takes {}s to convert RCTW Training annotation'):
trn_infos = collect_annotations(trn_files, nproc=args.nproc)
convert_annotations(trn_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(trn_infos, osp.join(root_path,
'instances_training.json'),
'textdet')
# Val set
if len(val_files) > 0:
with mmcv.Timer(
print_tmpl='It takes {}s to convert RCTW Val annotation'):
val_infos = collect_annotations(val_files, nproc=args.nproc)
convert_annotations(val_infos,
osp.join(root_path, 'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
if __name__ == '__main__':

View File

@ -6,7 +6,7 @@ import os.path as osp
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir, ratio):
@ -189,16 +189,17 @@ def main():
trn_infos = collect_annotations(trn_files, nproc=args.nproc)
with mmcv.Timer(
print_tmpl='It takes {}s to convert ReCTS Training annotation'):
convert_annotations(trn_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(trn_infos, osp.join(root_path,
'instances_training.json'),
'textdet')
# Val set
if len(val_files) > 0:
val_infos = collect_annotations(val_files, nproc=args.nproc)
with mmcv.Timer(
print_tmpl='It takes {}s to convert ReCTS Val annotation'):
convert_annotations(val_infos,
osp.join(root_path, 'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
if __name__ == '__main__':

View File

@ -6,7 +6,7 @@ import os.path as osp
import mmcv
import numpy as np
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir):
@ -157,9 +157,9 @@ def main():
osp.join(root_path, 'imgs', split),
osp.join(root_path, 'annotations', split))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')
if __name__ == '__main__':

View File

@ -164,6 +164,7 @@ def parse_args():
return args
# TODO: Refactor synthtext
def main():
args = parse_args()
synthtext_path = args.synthtext_path

View File

@ -5,7 +5,7 @@ import os.path as osp
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def parse_args():
@ -63,11 +63,12 @@ def main():
root_path = args.root_path
print('Processing training set...')
training_infos = collect_textocr_info(root_path, 'TextOCR_0.1_train.json')
convert_annotations(training_infos,
osp.join(root_path, 'instances_training.json'))
dump_ocr_data(training_infos,
osp.join(root_path, 'instances_training.json'), 'textdet')
print('Processing validation set...')
val_infos = collect_textocr_info(root_path, 'TextOCR_0.1_val.json')
convert_annotations(val_infos, osp.join(root_path, 'instances_val.json'))
dump_ocr_data(val_infos, osp.join(root_path, 'instances_val.json'),
'textdet')
print('Finish')

View File

@ -12,7 +12,7 @@ import scipy.io as scio
import yaml
from shapely.geometry import Polygon
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir):
@ -401,7 +401,8 @@ def main():
files = collect_files(
osp.join(img_dir, split), osp.join(gt_dir, split))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(image_infos, osp.join(root_path, json_name))
dump_ocr_data(image_infos, osp.join(root_path, json_name),
'textdet')
if __name__ == '__main__':

View File

@ -5,7 +5,7 @@ import os.path as osp
import mmcv
from mmocr.utils import convert_annotations
from mmocr.utils import dump_ocr_data
def collect_files(img_dir, gt_dir):
@ -161,9 +161,9 @@ def main():
osp.join(root_path, 'imgs', split),
osp.join(root_path, 'annotations'))
image_infos = collect_annotations(files, nproc=args.nproc)
convert_annotations(
image_infos, osp.join(root_path,
'instances_' + split + '.json'))
dump_ocr_data(image_infos,
osp.join(root_path, 'instances_' + split + '.json'),
'textdet')
if __name__ == '__main__':