# mmsegmentation/tools/dataset_converters/refuge.py

# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import tempfile
import zipfile

import mmcv
import numpy as np
from mmengine.utils import mkdir_or_exist


def parse_args():
    """Parse the command-line arguments of the REFUGE converter.

    ``--raw_data_root`` is mandatory: the conversion cannot start without a
    directory to read the raw archives from, so a missing value is reported
    by argparse as a usage error instead of failing later with a traceback.

    Returns:
        argparse.Namespace: Namespace with the attributes ``raw_data_root``,
            ``tmp_dir`` and ``out_dir``. The two optional ones are ``None``
            when they are not given on the command line.
    """
    parser = argparse.ArgumentParser(
        description='Convert REFUGE dataset to mmsegmentation format')
    parser.add_argument(
        '--raw_data_root',
        required=True,
        help='the root path of raw data')

    parser.add_argument('--tmp_dir', help='path of the temporary directory')
    parser.add_argument('-o', '--out_dir', help='output path')
    return parser.parse_args()


def extract_img(root: str,
                cur_dir: str,
                out_dir: str,
                mode: str = 'train',
                file_type: str = 'img') -> None:
    """Extract a zip archive and re-save the images it contains as PNG.

    The archive is unpacked into ``root``. Every ``.jpg`` / ``.bmp`` file
    found in a leaf directory (a directory without sub-directories) is then
    written to ``<out_dir>/<file_type>/<mode>/<stem>.png``. Directories whose
    path relative to ``root`` contains ``Illustration`` or ``MACOSX`` are
    skipped.

    Args:
        root (str): Directory the archive is extracted into.
        cur_dir (str): Path of the zip file to extract.
        out_dir (str): Root directory the converted files are written to.
        mode (str, optional): Name of the sub-directory below ``file_type``
            the files are written to. Defaults to 'train'.
        file_type (str, optional): Name of the sub-directory below
            ``out_dir``. When it equals 'annotations' the files are treated
            as label masks: only the first channel is kept and its values
            are remapped (0 -> 1, 128 -> 2, 255 -> 0). Any other value
            leaves the image content untouched. Defaults to 'img'.
    """
    # Use a context manager so the archive handle is released as soon as
    # the extraction is done.
    with zipfile.ZipFile(cur_dir) as zip_file:
        zip_file.extractall(root)

    for sub_dir, dirs, files in os.walk(root):
        # Only leaf directories hold the files to convert.
        if dirs:
            continue
        # Match against the path relative to ``root`` so the filter does not
        # depend on the platform's path separator or on the name of the
        # extraction directory itself.
        rel_dir = osp.relpath(sub_dir, root)
        if 'Illustration' in rel_dir or 'MACOSX' in rel_dir:
            continue

        file_names = sorted(
            name for name in files if name.endswith(('.jpg', '.bmp')))
        for filename in file_names:
            img = mmcv.imread(osp.join(sub_dir, filename))

            if file_type == 'annotations':
                # Keep a single channel and turn the gray levels into class
                # indices (presumably 255 is the background - confirm
                # against the dataset description).
                img = img[:, :, 0]
                # The order matters: 0 -> 1 has to run before 255 -> 0,
                # otherwise the freshly zeroed pixels would be remapped too.
                img[img == 0] = 1
                img[img == 128] = 2
                img[img == 255] = 0
            mmcv.imwrite(
                img,
                osp.join(out_dir, file_type, mode,
                         osp.splitext(filename)[0] + '.png'))


def main():
    """Convert the raw REFUGE zip archives into the output directory layout.

    Steps:
        1. Parse the command-line arguments and check that the raw data
           directory exists.
        2. Create ``<out_dir>/{images,annotations}/{training,validation,test}``.
        3. For every ``.zip`` file located directly in the raw data
           directory, derive the split from the file name ('training',
           'test' or 'validation', matched case-insensitively) and the file
           type ('annotations' when the name contains 'Anno' or 'GT',
           'images' otherwise), then hand the archive to ``extract_img``.

    Archives are extracted into a temporary directory that is removed once
    all of them have been processed.

    Raises:
        FileNotFoundError: If ``--raw_data_root`` is missing or is not an
            existing directory.
    """
    args = parse_args()

    raw_data_root = args.raw_data_root
    if raw_data_root is None or not osp.isdir(raw_data_root):
        raise FileNotFoundError(
            '--raw_data_root must be an existing directory, '
            f'got {raw_data_root!r}')

    if args.out_dir is None:
        out_dir = osp.join('./data', 'REFUGE')
    else:
        out_dir = args.out_dir

    print('Making directories...')
    mkdir_or_exist(out_dir)
    for file_type in ('images', 'annotations'):
        mkdir_or_exist(osp.join(out_dir, file_type))
        for split in ('training', 'validation', 'test'):
            mkdir_or_exist(osp.join(out_dir, file_type, split))

    print('Generating images and annotations...')
    # Only the archives stored directly in raw_data_root are processed, so
    # take the first os.walk() entry instead of walking the whole tree.
    _, _, top_level_files = next(
        os.walk(raw_data_root), (raw_data_root, [], []))
    print('====================')

    zip_names = sorted(
        name for name in top_level_files if name.endswith('.zip'))
    if not zip_names:
        print(f'No .zip archives found in {raw_data_root}')

    # TemporaryDirectory(dir=...) needs the parent directory to exist.
    if args.tmp_dir is not None:
        mkdir_or_exist(args.tmp_dir)

    # The lookup order is significant: the first keyword found in the file
    # name decides the split.
    split_keywords = ('training', 'test', 'validation')
    with tempfile.TemporaryDirectory(dir=args.tmp_dir) as tmp_dir:
        for zip_name in zip_names:
            lowered = zip_name.lower()
            mode = next(
                (key for key in split_keywords if key in lowered), None)
            if mode is None:
                # An archive that names no split cannot be placed anywhere
                # in the output layout.
                print(f'Skipping {zip_name}: no split name '
                      '(training/test/validation) in the file name')
                continue

            # Strip the '.zip' suffix to name the extraction directory.
            file_root = osp.join(tmp_dir, zip_name[:-4])
            is_annotation = 'Anno' in zip_name or 'GT' in zip_name
            file_type = 'annotations' if is_annotation else 'images'
            extract_img(file_root, osp.join(raw_data_root, zip_name),
                        out_dir, mode, file_type)

    print('Done!')


# Run the conversion only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()