mirror of https://github.com/open-mmlab/mmyolo.git
[Feature] Add `extract_subcoco` script (#186)
* add extract_subcoco.py
* update
* fix comment
parent d75fbabcb0
commit ce73b03b72
docs/en/user_guides
docs/zh_cn/user_guides
tools/misc
@@ -231,3 +231,38 @@ python tools/model_converters/yolox_to_mmyolo.py --src yolox_s.pth --dst mmyolox
```

The converted `mmyolox.pt` can be used by MMYOLO.
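
For instance, a minimal sketch of pointing a config at the converted checkpoint, assuming the standard MMEngine `load_from` convention (the path below is hypothetical):

```python
# In an MMYOLO config file, initialize the model from the converted
# checkpoint (hypothetical path).
load_from = './work_dirs/mmyolox.pt'
```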

## Extract a subset of COCO

The training set of COCO2017 contains 118K images and the validation set contains 5K images, so the dataset is relatively large. Loading the JSON annotations while debugging or quickly verifying a program consumes extra resources and slows down startup.

The `extract_subcoco.py` script can extract a specified number of images: pass the `--num-img` argument to obtain a COCO subset with that many images.

Currently, only COCO2017 is supported. User-defined datasets in the standard COCO JSON format will be supported in the future.

The root folder must be organized as follows:

```text
├── root
│   ├── annotations
│   ├── train2017
│   ├── val2017
│   ├── test2017
```

1. Extract 10 training images and 10 validation images, using only the 5K validation set:

```shell
python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 10
```

2. Extract 20 training images from the training set and 20 validation images from the validation set:

```shell
python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 20 --use-training-set
```

3. Set the global seed to 1 (no seed is set by default):

```shell
python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 20 --use-training-set --seed 1
```
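
The extracted subset keeps the COCO directory layout, so an existing config can be pointed at it directly. A minimal sketch, assuming the usual `data_root` config field (the concrete path is hypothetical):

```python
# In a config file, swap the dataset root for the extracted subset
# (hypothetical path: use the OUT_DIR passed to extract_subcoco.py).
data_root = 'data/coco_subset/'
```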

@@ -230,3 +230,38 @@ python tools/model_converters/yolox_to_mmyolo.py --src yolox_s.pth --dst mmyolox
```

The converted `mmyolox.pt` can then be used in MMYOLO.

## Extract a COCO subset

The COCO2017 training set contains 118K images and the validation set contains 5K images, so the dataset is fairly large. Loading the JSON annotations while debugging or quickly checking that a program works correctly consumes considerable resources and slows down startup, which makes for a poor experience.

The `extract_subcoco.py` script can split off a specified number of images; use the `--num-img` argument to obtain a COCO subset with the desired number of images, which satisfies the need above.

Note: this script currently only supports the COCO2017 dataset; more general datasets in the standard COCO JSON format will be supported in the future.

The input root folder must be organized as follows:

```text
├── root
│   ├── annotations
│   ├── train2017
│   ├── val2017
│   ├── test2017
```

1. Extract 10 training images and 10 validation images using only the 5K validation set:

```shell
python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 10
```

2. Extract 20 training images from the training set and 20 validation images from the validation set:

```shell
python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 20 --use-training-set
```

3. Set a global seed (not set by default):

```shell
python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 20 --use-training-set --seed 1
```

@@ -0,0 +1,127 @@
# Copyright (c) OpenMMLab. All rights reserved.
"""Extract subsets from the coco2017 dataset.

This script is mainly used for quick debugging and for verifying the
correctness of a program.
The root folder must have the following format:

├── root
│   ├── annotations
│   ├── train2017
│   ├── val2017
│   ├── test2017

Currently, only COCO2017 is supported. User-defined datasets in the
standard COCO JSON format will be supported in the future.

Example:
    python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img ${NUM_IMG}
"""

import argparse
import os.path as osp
import shutil

import mmengine
import numpy as np
from pycocotools.coco import COCO


# TODO: Currently only supports coco2017
def _process_data(args,
                  in_dataset_type: str,
                  out_dataset_type: str,
                  year: str = '2017'):
    assert in_dataset_type in ('train', 'val')
    assert out_dataset_type in ('train', 'val')

    in_ann_file_name = f'annotations/instances_{in_dataset_type}{year}.json'
    out_ann_file_name = f'annotations/instances_{out_dataset_type}{year}.json'

    ann_path = osp.join(args.root, in_ann_file_name)
    json_data = mmengine.load(ann_path)

    # Keep the metadata and categories, but rebuild the image and
    # annotation lists from the sampled subset.
    new_json_data = {
        'info': json_data['info'],
        'licenses': json_data['licenses'],
        'categories': json_data['categories'],
        'images': [],
        'annotations': []
    }

    images = json_data['images']
    coco = COCO(ann_path)

    # Shuffle so that the subset is a random sample of the dataset.
    np.random.shuffle(images)

    progress_bar = mmengine.ProgressBar(args.num_img)

    # Take the first num_img shuffled images and collect their
    # annotations through the COCO API.
    for i in range(args.num_img):
        file_name = images[i]['file_name']
        image_path = osp.join(args.root, in_dataset_type + year, file_name)

        ann_ids = coco.getAnnIds(imgIds=[images[i]['id']])
        ann_info = coco.loadAnns(ann_ids)

        new_json_data['images'].append(images[i])
        new_json_data['annotations'].extend(ann_info)

        # Copy the image file into the subset folder.
        shutil.copy(image_path, osp.join(args.out_dir,
                                         out_dataset_type + year))

        progress_bar.update()

    mmengine.dump(new_json_data, osp.join(args.out_dir, out_ann_file_name))


def _make_dirs(out_dir):
    # Only annotations, train2017 and val2017 are generated;
    # test2017 is never written.
    mmengine.mkdir_or_exist(out_dir)
    mmengine.mkdir_or_exist(osp.join(out_dir, 'annotations'))
    mmengine.mkdir_or_exist(osp.join(out_dir, 'train2017'))
    mmengine.mkdir_or_exist(osp.join(out_dir, 'val2017'))


def parse_args():
    parser = argparse.ArgumentParser(description='Extract COCO subset')
    parser.add_argument('root', help='root path')
    parser.add_argument(
        'out_dir', type=str, help='directory where the COCO subset will be saved.')
    parser.add_argument(
        '--num-img', default=50, type=int, help='number of images to extract')
    parser.add_argument(
        '--use-training-set',
        action='store_true',
        help='Whether to extract the training subset from the training set. '
        'By default the training subset is extracted from the validation '
        'set, which is faster.')
    parser.add_argument('--seed', default=-1, type=int, help='random seed')
    args = parser.parse_args()
    return args


def main():
    args = parse_args()
    assert args.out_dir != args.root, \
        'The files would be overwritten in place, ' \
        'so the output folder must differ from the root folder!'

    seed = args.seed
    if seed != -1:
        print(f'Set the global seed: {seed}')
        np.random.seed(seed)

    _make_dirs(args.out_dir)

    print('====Start processing train dataset====')
    if args.use_training_set:
        _process_data(args, 'train', 'train')
    else:
        _process_data(args, 'val', 'train')
    print('\n====Start processing val dataset====')
    _process_data(args, 'val', 'val')
    print(f'\nResults saved to {args.out_dir}')


if __name__ == '__main__':
    main()
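
After extraction, a quick way to sanity-check the result is to reload the generated annotation file with pycocotools. A minimal sketch (the path assumes the `OUT_DIR` passed to the script):

```python
from pycocotools.coco import COCO

# Reload the subset annotations written by extract_subcoco.py
# (hypothetical OUT_DIR).
subset = COCO('out_dir/annotations/instances_train2017.json')
print(len(subset.getImgIds()))  # should equal the --num-img value
print(len(subset.getAnnIds()))  # annotations belonging to those images
```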