# Reconstructed from a git patch ("[Feature] add download and convert script
# of dataset") that added two scripts:
#   tools/dataset_converters/balloon2coco.py
#   tools/misc/download_dataset.py
import argparse
import os.path as osp
from itertools import repeat
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tarfile import TarFile
from zipfile import ZipFile

import torch

# --------------------------------------------------------------------------- #
# tools/dataset_converters/balloon2coco.py
# --------------------------------------------------------------------------- #


def convert_balloon_to_coco(ann_file, out_file, image_prefix):
    """Convert balloon dataset annotations (VIA region format) to COCO json.

    Args:
        ann_file (str): Path to the VIA ``via_region_data.json`` annotation
            file of one split.
        out_file (str): Path of the COCO-format json file to write.
        image_prefix (str): Directory containing the images referenced by
            ``ann_file``.
    """
    # Imported locally so the download tool below works without
    # mmcv/mmengine installed.
    import mmcv
    import mmengine

    data_infos = mmengine.load(ann_file)

    annotations = []
    images = []
    obj_count = 0
    for idx, v in enumerate(mmengine.track_iter_progress(data_infos.values())):
        filename = v['filename']
        img_path = osp.join(image_prefix, filename)
        # The VIA file does not store image size, so read it from the image.
        height, width = mmcv.imread(img_path).shape[:2]

        images.append(
            dict(id=idx, file_name=filename, height=height, width=width))

        for _, obj in v['regions'].items():
            # Balloon annotations carry no extra region attributes.
            assert not obj['region_attributes']
            obj = obj['shape_attributes']
            px = obj['all_points_x']
            py = obj['all_points_y']
            # Shift vertices to pixel centers, then flatten to COCO's
            # [x0, y0, x1, y1, ...] polygon layout.
            poly = [(x + 0.5, y + 0.5) for x, y in zip(px, py)]
            poly = [p for x in poly for p in x]

            x_min, y_min, x_max, y_max = (min(px), min(py), max(px), max(py))

            data_anno = dict(
                image_id=idx,
                id=obj_count,
                category_id=0,
                bbox=[x_min, y_min, x_max - x_min, y_max - y_min],
                area=(x_max - x_min) * (y_max - y_min),
                segmentation=[poly],
                iscrowd=0)
            annotations.append(data_anno)
            obj_count += 1

    coco_format_json = dict(
        images=images,
        annotations=annotations,
        categories=[{
            'id': 0,
            'name': 'balloon'
        }])
    mmengine.dump(coco_format_json, out_file)


if __name__ == '__main__':
    # Entry point of tools/dataset_converters/balloon2coco.py: convert the
    # train and val splits in place.
    convert_balloon_to_coco('data/balloon/train/via_region_data.json',
                            'data/balloon/train.json', 'data/balloon/train/')
    convert_balloon_to_coco('data/balloon/val/via_region_data.json',
                            'data/balloon/val.json', 'data/balloon/val/')

# --------------------------------------------------------------------------- #
# tools/misc/download_dataset.py
# --------------------------------------------------------------------------- #

# Mapping from dataset name (the --dataset-name CLI value) to the list of
# archive URLs that make up that dataset.
DATA2URL = dict(
    # TODO: Support for downloading Panoptic Segmentation of COCO
    coco2017=[
        'http://images.cocodataset.org/zips/train2017.zip',
        'http://images.cocodataset.org/zips/val2017.zip',
        'http://images.cocodataset.org/zips/test2017.zip',
        'http://images.cocodataset.org/annotations/' +
        'annotations_trainval2017.zip'
    ],
    lvis=[
        # NOTE: the second entry previously duplicated the train json; it
        # must point at the val annotations instead.
        'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip',  # noqa
        'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_val.json.zip',  # noqa
    ],
    voc2007=[
        'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar',  # noqa
        'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',  # noqa
        'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar',  # noqa
    ],
    balloon=[
        'https://github.com/matterport/Mask_RCNN/' +
        'releases/download/v2.1/balloon_dataset.zip'
    ])


def parse_args():
    """Parse the command-line arguments of the download tool."""
    parser = argparse.ArgumentParser(
        description='Download datasets for training')
    parser.add_argument(
        '--dataset-name', type=str, help='dataset name', default='coco2017')
    parser.add_argument(
        '--save-dir',
        type=str,
        help='the dir to save dataset',
        default='data/coco')
    parser.add_argument(
        '--unzip',
        action='store_true',
        help='whether unzip dataset or not, zipped files will be saved')
    parser.add_argument(
        '--delete',
        action='store_true',
        help='delete the download zipped files')
    parser.add_argument(
        '--threads', type=int, help='number of threading', default=4)
    args = parser.parse_args()
    return args


def download(url, dir, unzip=True, delete=False, threads=1):
    """Download one or more archives into ``dir`` and optionally extract them.

    Args:
        url (str | Path | Iterable[str]): A single URL/local path, or an
            iterable of them.
        dir (str | Path): Target directory for the downloaded files.
        unzip (bool): Extract ``.zip``/``.tar`` archives after download.
        delete (bool): Remove the archive after extraction.
        threads (int): Number of concurrent download threads.
    """

    def download_one(url, dir):
        f = dir / Path(url).name
        if Path(url).is_file():
            # A local path was given instead of a URL: just move it into dir.
            Path(url).rename(f)
        elif not f.exists():
            print(f'Downloading {url} to {f}')
            torch.hub.download_url_to_file(url, f, progress=True)
        if unzip and f.suffix in ('.zip', '.tar'):
            # NOTE(review): extractall trusts the archive's member paths;
            # acceptable here since the URLs are fixed, well-known mirrors.
            print(f'Unzipping {f.name}')
            if f.suffix == '.zip':
                ZipFile(f).extractall(path=dir)
            elif f.suffix == '.tar':
                TarFile(f).extractall(path=dir)
            if delete:
                f.unlink()
                print(f'Delete {f}')

    dir = Path(dir)
    if threads > 1:
        # imap feeds the tasks eagerly; close()+join() blocks until all
        # downloads have finished.
        pool = ThreadPool(threads)
        pool.imap(lambda x: download_one(*x), zip(url, repeat(dir)))
        pool.close()
        pool.join()
    else:
        for u in [url] if isinstance(url, (str, Path)) else url:
            download_one(u, dir)


def main():
    """CLI entry point: resolve the dataset name and download its archives."""
    args = parse_args()
    path = Path(args.save_dir)
    # exist_ok makes a separate existence check redundant.
    path.mkdir(parents=True, exist_ok=True)
    url = DATA2URL.get(args.dataset_name)
    if url is None:
        print('Only support COCO, VOC, balloon, and LVIS now!')
        return
    download(
        url,
        dir=path,
        unzip=args.unzip,
        delete=args.delete,
        threads=args.threads)


if __name__ == '__main__':
    main()