diff --git a/docs/datasets.md b/docs/datasets.md index a812427d..cb7f9b3c 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -120,7 +120,7 @@ The structure of the text detection dataset directory is organized as follows. python tools/data/textdet/textocr_converter.py /path/to/textocr ``` - For `Totaltext`: - - Step1: Download `totaltext.zip` from [github dataset](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Dataset) and `groundtruth_text.zip` from [github Groundtruth](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Groundtruth/Text) (We recommend downloading the text groundtruth with .mat format since our totaltext_converter.py supports groundtruth with .mat format only). + - Step1: Download `totaltext.zip` from [github dataset](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Dataset) and `groundtruth_text.zip` from [github Groundtruth](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Groundtruth/Text) (Our totaltext_converter.py supports groundtruth with both .mat and .txt format). ```bash mkdir totaltext && cd totaltext mkdir imgs && mkdir annotations @@ -339,7 +339,7 @@ python tools/data/utils/txt2lmdb.py -i data/mixture/Syn90k/label.txt -o data/mix - For `Totaltext`: - - Step1: Download `totaltext.zip` from [github dataset](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Dataset) and `groundtruth_text.zip` from [github Groundtruth](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Groundtruth/Text) (We recommend downloading the text groundtruth with .mat format since our totaltext_converter.py supports groundtruth with .mat format only). + - Step1: Download `totaltext.zip` from [github dataset](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Dataset) and `groundtruth_text.zip` from [github Groundtruth](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Groundtruth/Text) (Our totaltext_converter.py supports groundtruth with both .mat and .txt format). ```bash mkdir totaltext && cd totaltext mkdir imgs && mkdir annotations diff --git a/setup.cfg b/setup.cfg index c368a1d7..f8c1e67c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,7 +20,7 @@ line_length = 79 multi_line_output = 0 known_standard_library = setuptools known_first_party = mmocr -known_third_party = PIL,Polygon,cv2,imgaug,lanms,lmdb,matplotlib,mmcv,mmdet,numpy,packaging,pyclipper,pycocotools,pytest,rapidfuzz,scipy,shapely,skimage,titlecase,torch,torchvision +known_third_party = PIL,Polygon,cv2,imgaug,lanms,lmdb,matplotlib,mmcv,mmdet,numpy,packaging,pyclipper,pycocotools,pytest,rapidfuzz,scipy,shapely,skimage,titlecase,torch,torchvision,yaml no_lines_before = STDLIB,LOCALFOLDER default_section = THIRDPARTY diff --git a/tools/data/textdet/totaltext_converter.py b/tools/data/textdet/totaltext_converter.py index eb4b35d6..67e5b9b9 100644 --- a/tools/data/textdet/totaltext_converter.py +++ b/tools/data/textdet/totaltext_converter.py @@ -1,12 +1,14 @@ import argparse import glob +import os import os.path as osp -from functools import partial +import re import cv2 import mmcv import numpy as np import scipy.io as scio +import yaml from shapely.geometry import Polygon from mmocr.utils import convert_annotations, drop_orientation, is_not_png @@ -19,7 +21,6 @@ def collect_files(img_dir, gt_dir, split): img_dir(str): The image directory gt_dir(str): The groundtruth directory split(str): The split of dataset. Namely: training or test - Returns: files(list): The list of tuples (img_file, groundtruth_file) """ @@ -37,63 +38,44 @@ def collect_files(img_dir, gt_dir, split): for suffix in suffixes: imgs_list.extend(glob.glob(osp.join(img_dir, '*' + suffix))) - imgs_list = [ - drop_orientation(f) if is_not_png(f) else f for f in imgs_list - ] + imgs_list = sorted( + [drop_orientation(f) if is_not_png(f) else f for f in imgs_list]) + ann_list = sorted( + [osp.join(gt_dir, gt_file) for gt_file in os.listdir(gt_dir)]) - files = [] - if split == 'training': - for img_file in imgs_list: - gt_file = osp.join( - gt_dir, - 'poly_gt_' + osp.splitext(osp.basename(img_file))[0] + '.mat') - files.append((img_file, gt_file)) - assert len(files), f'No images found in {img_dir}' - print(f'Loaded {len(files)} images from {img_dir}') - elif split == 'test': - for img_file in imgs_list: - gt_file = osp.join( - gt_dir, - 'poly_gt_' + osp.splitext(osp.basename(img_file))[0] + '.mat') - files.append((img_file, gt_file)) - assert len(files), f'No images found in {img_dir}' - print(f'Loaded {len(files)} images from {img_dir}') + files = list(zip(imgs_list, ann_list)) + assert len(files), f'No images found in {img_dir}' + print(f'Loaded {len(files)} images from {img_dir}') return files -def collect_annotations(files, split, nproc=1): +def collect_annotations(files, nproc=1): """Collect the annotation information. Args: files(list): The list of tuples (image_file, groundtruth_file) - split(str): The split of dataset. Namely: training or test nproc(int): The number of process to collect annotations - Returns: images(list): The list of image information dicts """ assert isinstance(files, list) - assert isinstance(split, str) assert isinstance(nproc, int) - load_img_info_with_split = partial(load_img_info, split=split) if nproc > 1: images = mmcv.track_parallel_progress( - load_img_info_with_split, files, nproc=nproc) + load_img_info, files, nproc=nproc) else: - images = mmcv.track_progress(load_img_info_with_split, files) + images = mmcv.track_progress(load_img_info, files) return images -def get_contours(gt_path, split): - """Get the contours and words for each ground_truth file. +def get_contours_mat(gt_path): + """Get the contours and words for each ground_truth mat file. Args: gt_path(str): The relative path of the ground_truth mat file - split(str): The split of dataset: training or test - Returns: contours(list[lists]): A list of lists of contours for the text instances @@ -101,15 +83,11 @@ def get_contours(gt_path, split): for the text instances """ assert isinstance(gt_path, str) - assert isinstance(split, str) contours = [] words = [] data = scio.loadmat(gt_path) - if split == 'training': - data_polygt = data['polygt'] - elif split == 'test': - data_polygt = data['polygt'] + data_polygt = data['polygt'] for i, lines in enumerate(data_polygt): X = np.array(lines[1]) @@ -138,23 +116,150 @@ def get_contours(gt_path, split): return contours, words -def load_mat_info(img_info, gt_file, split): +def load_mat_info(img_info, gt_file): """Load the information of one ground truth in .mat format. Args: img_info(dict): The dict of only the image information gt_file(str): The relative path of the ground_truth mat file for one image - split(str): The split of dataset: training or test - Returns: img_info(dict): The dict of the img and annotation information """ assert isinstance(img_info, dict) assert isinstance(gt_file, str) - assert isinstance(split, str) - contours, words = get_contours(gt_file, split) + contours, words = get_contours_mat(gt_file) + anno_info = [] + for contour in contours: + if contour.shape[0] == 2: + continue + category_id = 1 + coordinates = np.array(contour).reshape(-1, 2) + polygon = Polygon(coordinates) + iscrowd = 0 + + area = polygon.area + # convert to COCO style XYWH format + min_x, min_y, max_x, max_y = polygon.bounds + bbox = [min_x, min_y, max_x - min_x, max_y - min_y] + + anno = dict( + iscrowd=iscrowd, + category_id=category_id, + bbox=bbox, + area=area, + segmentation=[contour]) + anno_info.append(anno) + + img_info.update(anno_info=anno_info) + + return img_info + + +def process_line(line, contours, words): + """Get the contours and words by processing each line in the gt file. + + Args: + line(str): The line in gt file containing annotation info + contours(list[lists]): A list of lists of contours + for the text instances + words(list[list]): A list of lists of words (string) + for the text instances + Returns: + contours(list[lists]): A list of lists of contours + for the text instances + words(list[list]): A list of lists of words (string) + for the text instances + """ + + line = '{' + line.replace('[[', '[').replace(']]', ']') + '}' + ann_dict = re.sub('([0-9]) +([0-9])', r'\1,\2', line) + ann_dict = re.sub('([0-9]) +([ 0-9])', r'\1,\2', ann_dict) + ann_dict = re.sub('([0-9]) -([0-9])', r'\1,-\2', ann_dict) + ann_dict = ann_dict.replace("[u',']", "[u'#']") + ann_dict = yaml.load(ann_dict) + + X = np.array([ann_dict['x']]) + Y = np.array([ann_dict['y']]) + + if len(ann_dict['transcriptions']) == 0: + word = '???' + else: + word = ann_dict['transcriptions'][0] + if len(ann_dict['transcriptions']) > 1: + for ann_word in ann_dict['transcriptions'][1:]: + word += ',' + ann_word + word = str(eval(word)) + words.append(word) + + point_num = len(X[0]) + + arr = np.concatenate([X, Y]).T + contour = [] + for i in range(point_num): + contour.append(arr[i][0]) + contour.append(arr[i][1]) + contours.append(np.asarray(contour)) + + return contours, words + + +def get_contours_txt(gt_path): + """Get the contours and words for each ground_truth txt file. + + Args: + gt_path(str): The relative path of the ground_truth mat file + Returns: + contours(list[lists]): A list of lists of contours + for the text instances + words(list[list]): A list of lists of words (string) + for the text instances + """ + assert isinstance(gt_path, str) + + contours = [] + words = [] + + with open(gt_path, 'r') as f: + tmp_line = '' + for idx, line in enumerate(f): + line = line.strip() + if idx == 0: + tmp_line = line + continue + if not line.startswith('x:'): + tmp_line += ' ' + line + continue + else: + complete_line = tmp_line + tmp_line = line + contours, words = process_line(complete_line, contours, words) + + if tmp_line != '': + contours, words = process_line(tmp_line, contours, words) + + for word in words: + + if word == '#': + word = '###' + continue + + return contours, words + + +def load_txt_info(gt_file, img_info): + """Load the information of one ground truth in .txt format. + + Args: + img_info(dict): The dict of only the image information + gt_file(str): The relative path of the ground_truth mat + file for one image + Returns: + img_info(dict): The dict of the img and annotation information + """ + + contours, words = get_contours_txt(gt_file) anno_info = [] for contour in contours: if contour.shape[0] == 2: @@ -188,7 +293,6 @@ def load_png_info(gt_file, img_info): Args: gt_file(str): The relative path of the ground_truth file for one image img_info(dict): The dict of only the image information - Returns: img_info(dict): The dict of the img and annotation information """ @@ -227,18 +331,15 @@ def load_png_info(gt_file, img_info): return img_info -def load_img_info(files, split): +def load_img_info(files): """Load the information of one image. Args: files(tuple): The tuple of (img_file, groundtruth_file) - split(str): The split of dataset: training or test - Returns: img_info(dict): The dict of the img and annotation information """ assert isinstance(files, tuple) - assert isinstance(split, str) img_file, gt_file = files # read imgs with ignoring orientations @@ -257,10 +358,10 @@ def load_img_info(files, split): # anno_info=anno_info, segm_file=osp.join(split_name, osp.basename(gt_file))) - if split == 'training': - img_info = load_mat_info(img_info, gt_file, split) - elif split == 'test': - img_info = load_mat_info(img_info, gt_file, split) + if osp.splitext(gt_file)[1] == '.mat': + img_info = load_mat_info(img_info, gt_file) + elif osp.splitext(gt_file)[1] == '.txt': + img_info = load_txt_info(gt_file, img_info) else: raise NotImplementedError @@ -303,7 +404,7 @@ def main(): print_tmpl='It takes {}s to convert totaltext annotation'): files = collect_files( osp.join(img_dir, split), osp.join(gt_dir, split), split) - image_infos = collect_annotations(files, split, nproc=args.nproc) + image_infos = collect_annotations(files, nproc=args.nproc) convert_annotations(image_infos, osp.join(out_dir, json_name)) diff --git a/tools/data/textrecog/totaltext_converter.py b/tools/data/textrecog/totaltext_converter.py index 369efe51..3ecf20c6 100644 --- a/tools/data/textrecog/totaltext_converter.py +++ b/tools/data/textrecog/totaltext_converter.py @@ -2,11 +2,12 @@ import argparse import glob import os import os.path as osp -from functools import partial +import re import mmcv import numpy as np import scipy.io as scio +import yaml from shapely.geometry import Polygon from mmocr.datasets.pipelines.crop import crop_img @@ -21,7 +22,6 @@ def collect_files(img_dir, gt_dir, split): img_dir(str): The image directory gt_dir(str): The groundtruth directory split(str): The split of dataset. Namely: training or test - Returns: files(list): The list of tuples (img_file, groundtruth_file) """ @@ -32,70 +32,52 @@ def collect_files(img_dir, gt_dir, split): # note that we handle png and jpg only. Pls convert others such as gif to # jpg or png offline - suffixes = ['.png', '.jpg', '.jpeg'] + suffixes = ['.png', '.PNG', '.jpg', '.JPG', '.jpeg', '.JPEG'] # suffixes = ['.png'] imgs_list = [] for suffix in suffixes: imgs_list.extend(glob.glob(osp.join(img_dir, '*' + suffix))) - imgs_list = [ - drop_orientation(f) if is_not_png(f) else f for f in imgs_list - ] + imgs_list = sorted( + [drop_orientation(f) if is_not_png(f) else f for f in imgs_list]) + ann_list = sorted( + [osp.join(gt_dir, gt_file) for gt_file in os.listdir(gt_dir)]) - files = [] - if split == 'training': - for img_file in imgs_list: - gt_file = osp.join( - gt_dir, - 'poly_gt_' + osp.splitext(osp.basename(img_file))[0] + '.mat') - files.append((img_file, gt_file)) - assert len(files), f'No images found in {img_dir}' - print(f'Loaded {len(files)} images from {img_dir}') - elif split == 'test': - for img_file in imgs_list: - gt_file = osp.join( - gt_dir, - 'poly_gt_' + osp.splitext(osp.basename(img_file))[0] + '.mat') - files.append((img_file, gt_file)) - assert len(files), f'No images found in {img_dir}' - print(f'Loaded {len(files)} images from {img_dir}') + files = [(img_file, gt_file) + for (img_file, gt_file) in zip(imgs_list, ann_list)] + assert len(files), f'No images found in {img_dir}' + print(f'Loaded {len(files)} images from {img_dir}') return files -def collect_annotations(files, split, nproc=1): +def collect_annotations(files, nproc=1): """Collect the annotation information. Args: files(list): The list of tuples (image_file, groundtruth_file) - split(str): The split of dataset. Namely: training or test nproc(int): The number of process to collect annotations - Returns: images(list): The list of image information dicts """ assert isinstance(files, list) - assert isinstance(split, str) assert isinstance(nproc, int) - load_img_info_with_split = partial(load_img_info, split=split) if nproc > 1: images = mmcv.track_parallel_progress( - load_img_info_with_split, files, nproc=nproc) + load_img_info, files, nproc=nproc) else: - images = mmcv.track_progress(load_img_info_with_split, files) + images = mmcv.track_progress(load_img_info, files) return images -def get_contours(gt_path, split): - """Get the contours and words for each ground_truth file. +def get_contours_mat(gt_path): + """Get the contours and words for each ground_truth mat file. Args: gt_path(str): The relative path of the ground_truth mat file - split(str): The split of dataset: training or test - Returns: contours(list[lists]): A list of lists of contours for the text instances @@ -103,17 +85,13 @@ def get_contours(gt_path, split): for the text instances """ assert isinstance(gt_path, str) - assert isinstance(split, str) contours = [] words = [] data = scio.loadmat(gt_path) - if split == 'training': - data_polygt = data['polygt'] - elif split == 'test': - data_polygt = data['polygt'] + data_polygt = data['polygt'] - for lines in data_polygt: + for i, lines in enumerate(data_polygt): X = np.array(lines[1]) Y = np.array(lines[3]) @@ -140,23 +118,140 @@ def get_contours(gt_path, split): return contours, words -def load_mat_info(img_info, gt_file, split): +def load_mat_info(img_info, gt_file): """Load the information of one ground truth in .mat format. Args: img_info(dict): The dict of only the image information gt_file(str): The relative path of the ground_truth mat file for one image - split(str): The split of dataset: training or test - Returns: img_info(dict): The dict of the img and annotation information """ assert isinstance(img_info, dict) assert isinstance(gt_file, str) - assert isinstance(split, str) - contours, words = get_contours(gt_file, split) + contours, words = get_contours_mat(gt_file) + anno_info = [] + for contour, word in zip(contours, words): + if contour.shape[0] == 2: + continue + coordinates = np.array(contour).reshape(-1, 2) + polygon = Polygon(coordinates) + + # convert to COCO style XYWH format + min_x, min_y, max_x, max_y = polygon.bounds + bbox = [min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y] + anno = dict(word=word, bbox=bbox) + anno_info.append(anno) + + img_info.update(anno_info=anno_info) + return img_info + + +def process_line(line, contours, words): + """Get the contours and words by processing each line in the gt file. + + Args: + line(str): The line in gt file containing annotation info + contours(list[lists]): A list of lists of contours + for the text instances + words(list[list]): A list of lists of words (string) + for the text instances + Returns: + contours(list[lists]): A list of lists of contours + for the text instances + words(list[list]): A list of lists of words (string) + for the text instances + """ + + line = '{' + line.replace('[[', '[').replace(']]', ']') + '}' + ann_dict = re.sub('([0-9]) +([0-9])', r'\1,\2', line) + ann_dict = re.sub('([0-9]) +([ 0-9])', r'\1,\2', ann_dict) + ann_dict = re.sub('([0-9]) -([0-9])', r'\1,-\2', ann_dict) + ann_dict = ann_dict.replace("[u',']", "[u'#']") + ann_dict = yaml.load(ann_dict) + + X = np.array([ann_dict['x']]) + Y = np.array([ann_dict['y']]) + + if len(ann_dict['transcriptions']) == 0: + word = '???' + else: + word = ann_dict['transcriptions'][0] + if len(ann_dict['transcriptions']) > 1: + for ann_word in ann_dict['transcriptions'][1:]: + word += ',' + ann_word + word = str(eval(word)) + words.append(word) + + point_num = len(X[0]) + + arr = np.concatenate([X, Y]).T + contour = [] + for i in range(point_num): + contour.append(arr[i][0]) + contour.append(arr[i][1]) + contours.append(np.asarray(contour)) + + return contours, words + + +def get_contours_txt(gt_path): + """Get the contours and words for each ground_truth txt file. + + Args: + gt_path(str): The relative path of the ground_truth mat file + Returns: + contours(list[lists]): A list of lists of contours + for the text instances + words(list[list]): A list of lists of words (string) + for the text instances + """ + assert isinstance(gt_path, str) + + contours = [] + words = [] + + with open(gt_path, 'r') as f: + tmp_line = '' + for idx, line in enumerate(f): + line = line.strip() + if idx == 0: + tmp_line = line + continue + if not line.startswith('x:'): + tmp_line += ' ' + line + continue + else: + complete_line = tmp_line + tmp_line = line + contours, words = process_line(complete_line, contours, words) + + if tmp_line != '': + contours, words = process_line(tmp_line, contours, words) + + for word in words: + + if word == '#': + word = '###' + continue + + return contours, words + + +def load_txt_info(gt_file, img_info): + """Load the information of one ground truth in .txt format. + + Args: + img_info(dict): The dict of only the image information + gt_file(str): The relative path of the ground_truth mat + file for one image + Returns: + img_info(dict): The dict of the img and annotation information + """ + + contours, words = get_contours_txt(gt_file) anno_info = [] for contour, word in zip(contours, words): if contour.shape[0] == 2: @@ -175,6 +270,14 @@ def load_mat_info(img_info, gt_file, split): def generate_ann(root_path, split, image_infos): + """Generate cropped annotations and label txt file. + + Args: + root_path(str): The relative path of the totaltext file + split(str): The split of dataset. Namely: training or test + image_infos(list[dict]): A list of dicts of the img and + annotation information + """ dst_image_root = osp.join(root_path, 'dst_imgs', split) if split == 'training': @@ -202,18 +305,15 @@ def generate_ann(root_path, split, image_infos): list_to_file(dst_label_file, lines) -def load_img_info(files, split): +def load_img_info(files): """Load the information of one image. Args: files(tuple): The tuple of (img_file, groundtruth_file) - split(str): The split of dataset: training or test - Returns: img_info(dict): The dict of the img and annotation information """ assert isinstance(files, tuple) - assert isinstance(split, str) img_file, gt_file = files # read imgs with ignoring orientations @@ -232,10 +332,10 @@ def load_img_info(files, split): # anno_info=anno_info, segm_file=osp.join(split_name, osp.basename(gt_file))) - if split == 'training': - img_info = load_mat_info(img_info, gt_file, split) - elif split == 'test': - img_info = load_mat_info(img_info, gt_file, split) + if osp.splitext(gt_file)[1] == '.mat': + img_info = load_mat_info(img_info, gt_file) + elif osp.splitext(gt_file)[1] == '.txt': + img_info = load_txt_info(gt_file, img_info) else: raise NotImplementedError @@ -278,7 +378,7 @@ def main(): print_tmpl='It takes {}s to convert totaltext annotation'): files = collect_files( osp.join(img_dir, split), osp.join(gt_dir, split), split) - image_infos = collect_annotations(files, split, nproc=args.nproc) + image_infos = collect_annotations(files, nproc=args.nproc) generate_ann(root_path, split, image_infos)