mmpretrain/tools/kfold-cross-valid.py

372 lines
13 KiB
Python

# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy
import os
import os.path as osp
import time
import warnings
from datetime import datetime
from pathlib import Path
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist
from mmcls import __version__
from mmcls.apis import init_random_seed, set_random_seed, train_model
from mmcls.datasets import build_dataset
from mmcls.models import build_classifier
from mmcls.utils import collect_env, get_root_logger, load_json_log
# Names of the evaluation metrics recognised in the JSON training logs. A log
# key is treated as a metric when it contains one of these names.
TEST_METRICS = (
    'precision',
    'recall',
    'f1_score',
    'support',
    'mAP',
    'CP',
    'CR',
    'CF1',
    'OP',
    'OR',
    'OF1',
    'accuracy',
)

# Usage examples; handed to argparse as the program description.
prog_description = """K-Fold cross-validation.
To start a 5-fold cross-validation experiment:
python tools/kfold-cross-valid.py $CONFIG --num-splits 5
To resume a 5-fold cross-validation from an interrupted experiment:
python tools/kfold-cross-valid.py $CONFIG --num-splits 5 --resume-from work_dirs/fold2/latest.pth
To summarize a 5-fold cross-validation:
python tools/kfold-cross-valid.py $CONFIG --num-splits 5 --summary
"""  # noqa: E501
def parse_args():
    """Parse the command-line arguments of the k-fold cross-validation tool.

    Besides parsing, this exports the local rank into the ``LOCAL_RANK``
    environment variable when that variable is not already set.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        SystemExit: Raised by argparse when ``--num-splits`` is missing or
            not positive, or when ``--fold`` is outside ``[0, num_splits)``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=prog_description)
    parser.add_argument('config', help='train config file path')
    # Training, resuming and summarizing all iterate over (or forward) the
    # number of folds, so a missing value is rejected here instead of
    # surfacing later as an obscure TypeError.
    parser.add_argument(
        '--num-splits',
        type=int,
        required=True,
        help='The number of all folds.')
    parser.add_argument(
        '--fold',
        type=int,
        help='The fold used to do validation. '
        'If specify, only do an experiment of the specified fold.')
    parser.add_argument(
        '--summary',
        action='store_true',
        help='Summarize the k-fold cross-validation results.')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    parser.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')

    # The device-selection options cannot be combined with each other.
    group_gpus = parser.add_mutually_exclusive_group()
    group_gpus.add_argument('--device', help='device used for training')
    group_gpus.add_argument(
        '--gpus',
        type=int,
        help='(Deprecated, please use --gpu-id) number of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='(Deprecated, please use --gpu-id) ids of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-id',
        type=int,
        default=0,
        help='id of gpu to use '
        '(only applicable to non-distributed training)')

    parser.add_argument('--seed', type=int, default=None, help='random seed')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # Distributed launchers pass the rank as either ``--local_rank`` or
    # ``--local-rank`` depending on the PyTorch release; accept both
    # spellings (both are stored as ``args.local_rank``).
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)

    args = parser.parse_args()

    if args.num_splits < 1:
        parser.error('--num-splits must be a positive integer.')
    # Folds are zero-based: valid indices are 0 .. num_splits - 1.
    if args.fold is not None and not 0 <= args.fold < args.num_splits:
        parser.error(f'--fold must be in [0, {args.num_splits}), '
                     f'but got {args.fold}.')

    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args
def copy_config(old_cfg):
    """Build a new ``Config`` whose internal state is a deep copy of
    ``old_cfg``.

    Args:
        old_cfg (Config): The config to duplicate.

    Returns:
        Config: A config that shares no state with ``old_cfg``.
    """
    duplicate = Config()
    # The three private fields are written through the parent class's
    # ``__setattr__`` rather than ``Config.__setattr__``.
    for field in ('_cfg_dict', '_filename', '_text'):
        cloned = copy.deepcopy(getattr(old_cfg, field))
        super(Config, duplicate).__setattr__(field, cloned)
    return duplicate
def train_single_fold(args, cfg, fold, distributed, seed):
    """Train (and optionally validate) the model on one fold.

    The train/val/test dataset configs in ``cfg`` are wrapped into
    ``KFoldDataset`` configs for ``fold``, the resulting config is dumped
    into a per-fold work directory and ``train_model`` is launched.

    Args:
        args: Parsed command-line arguments; ``config``, ``num_splits``,
            ``deterministic``, ``no_validate`` and ``device`` are read.
        cfg: Config of this fold. It is modified in place (``work_dir``,
            ``data`` and ``seed``), so callers should pass a private copy.
        fold (int): Zero-based index of the fold held out for validation.
        distributed (bool): Whether distributed training is enabled.
        seed (int): Base random seed. It is forwarded to ``KFoldDataset``
            unchanged, while ``seed + fold`` seeds the training run.
    """
    # create the work_dir for the fold
    cfg.work_dir = osp.join(cfg.work_dir, f'fold{fold}')
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))

    # wrap the dataset cfg
    train_dataset = dict(
        type='KFoldDataset',
        fold=fold,
        dataset=cfg.data.train,
        num_splits=args.num_splits,
        seed=seed,
    )
    val_dataset = dict(
        type='KFoldDataset',
        fold=fold,
        # Use the same dataset with training.
        dataset=copy.deepcopy(cfg.data.train),
        num_splits=args.num_splits,
        seed=seed,
        test_mode=True,
    )
    # Validation reads the training data source but with the val pipeline.
    val_dataset['dataset']['pipeline'] = cfg.data.val.pipeline
    cfg.data.train = train_dataset
    cfg.data.val = val_dataset
    cfg.data.test = val_dataset

    # dump config; ``splitext`` also copes with a file name without a dot
    stem, ext = osp.splitext(osp.basename(args.config))
    cfg.dump(osp.join(cfg.work_dir, f'{stem}_fold{fold}{ext}'))

    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()

    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg.pretty_text}')
    logger.info(
        f'-------- Cross-validation: [{fold+1}/{args.num_splits}] -------- ')

    # set random seeds
    # Use different seed in different folds
    fold_seed = seed + fold
    logger.info(f'Set random seed to {fold_seed}, '
                f'deterministic: {args.deterministic}')
    set_random_seed(fold_seed, deterministic=args.deterministic)
    cfg.seed = fold_seed
    meta['seed'] = fold_seed

    model = build_classifier(cfg.model)
    model.init_weights()

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        # ``cfg.data.train`` and ``cfg.data.val`` were replaced by the
        # KFoldDataset wrappers above, so the pipeline has to be read from
        # and written to the nested ``dataset`` entry. Item access is used
        # because it works for plain dicts as well as config dicts.
        val_dataset = copy.deepcopy(cfg.data.val)
        val_dataset['dataset']['pipeline'] = \
            cfg.data.train['dataset']['pipeline']
        datasets.append(build_dataset(val_dataset))

    # The k-fold position is stored in the meta dict so that ``main`` can
    # recover it from a checkpoint when resuming.
    meta.update(
        dict(
            mmcls_version=__version__,
            config=cfg.pretty_text,
            CLASSES=datasets[0].CLASSES,
            kfold=dict(fold=fold, num_splits=args.num_splits)))

    train_model(
        model,
        datasets,
        cfg,
        distributed=distributed,
        validate=(not args.no_validate),
        timestamp=timestamp,
        device='cpu' if args.device == 'cpu' else 'cuda',
        meta=meta)
def summary(args, cfg):
    """Collect the latest evaluation results of every fold and show them.

    For each fold directory ``<work_dir>/fold<k>`` the last ``*.log.json``
    file (in sorted file-name order) is loaded and searched backwards for
    the most recent epoch whose record contains evaluation metrics. Folds
    without any JSON log are left out of the collected data.

    Args:
        args: Parsed command-line arguments; ``num_splits`` is read here.
        cfg: Config whose ``work_dir`` contains the per-fold directories.
    """

    def is_metric_key(key):
        """Return True when ``key`` contains a known metric name."""
        return any(metric in key for metric in TEST_METRICS)

    results = dict()
    for fold in range(args.num_splits):
        work_dir = Path(cfg.work_dir) / f'fold{fold}'

        # Find the latest training log
        log_files = sorted(work_dir.glob('*.log.json'))
        if not log_files:
            continue
        log_file = log_files[-1]
        date = datetime.fromtimestamp(log_file.lstat().st_mtime)

        # Find the latest eval log. The result is tracked in dedicated
        # variables so that an empty log, or a log without evaluation
        # records, can never report a stale or unrelated epoch.
        json_log = load_json_log(log_file)
        eval_epoch, eval_log = None, {}
        for epoch in sorted(json_log, reverse=True):
            if any(is_metric_key(k) for k in json_log[epoch]):
                eval_epoch, eval_log = epoch, json_log[epoch]
                break

        results[fold] = {
            'date': date.strftime('%Y-%m-%d %H:%M:%S'),
            # '-' marks a fold that has a log but no evaluation record yet.
            'epoch': eval_epoch if eval_epoch is not None else '-',
            'metric': {
                k: v[0]  # the value is a list with only one item.
                for k, v in eval_log.items() if is_metric_key(k)
            },
        }

    show_summary(args, results)
def _format_metric_value(value):
    """Render one metric value as table text (floats with two decimals)."""
    if isinstance(value, float):
        return f'{value:.2f}'
    if isinstance(value, (list, tuple)):
        return str([_format_metric_value(item) for item in value])
    return str(value)


def show_summary(args, summary_data):
    """Print the cross-validation results as a ``rich`` table.

    Args:
        args: Parsed command-line arguments; ``num_splits`` determines the
            number of rows.
        summary_data (dict): Maps a zero-based fold index to a dict with the
            keys ``'metric'`` (metric name -> value), ``'epoch'`` and
            ``'date'``. Folds missing from the mapping are printed as a row
            that contains only the fold number.

    Raises:
        ImportError: If the ``rich`` package is not installed.
    """
    try:
        from rich.console import Console
        from rich.table import Table
    except ImportError as e:
        raise ImportError('Please run `pip install rich` to install '
                          'package `rich` to draw the table.') from e

    # Collect the metric columns from every available fold (first-seen
    # order) instead of relying on fold 0, which may not have been run.
    metrics = []
    for fold in sorted(summary_data):
        for name in summary_data[fold]['metric']:
            if name not in metrics:
                metrics.append(name)

    console = Console()
    table = Table(title=f'{args.num_splits}-fold Cross-validation Summary')
    table.add_column('Fold')
    for metric in metrics:
        table.add_column(metric)
    table.add_column('Epoch')
    table.add_column('Date')

    for fold in range(args.num_splits):
        row = [f'{fold+1}']
        fold_data = summary_data.get(fold)
        if fold_data is None:
            table.add_row(*row)
            continue
        row.extend(
            _format_metric_value(fold_data['metric'].get(metric, ''))
            for metric in metrics)
        row.append(str(fold_data['epoch']))
        row.append(fold_data['date'])
        table.add_row(*row)

    console.print(table)
def main():
    """Run, resume or summarize a k-fold cross-validation experiment.

    Raises:
        RuntimeError: If the checkpoint given by ``--resume-from`` carries no
            k-fold metadata.
        ValueError: If ``--num-splits`` differs from the number of splits
            recorded in the resume checkpoint.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])

    if args.summary:
        summary(args, cfg)
        return

    # resume from the previous experiment
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
        # Only the metadata is inspected here, so map the checkpoint onto
        # the CPU; this also works on machines without the original GPU.
        # NOTE: ``torch.load`` unpickles the file - only resume from
        # checkpoints you trust.
        checkpoint = torch.load(cfg.resume_from, map_location='cpu')
        resume_kfold = checkpoint.get('meta', {}).get('kfold', None)
        # Drop the loaded weights before training allocates its own memory.
        del checkpoint
        if resume_kfold is None:
            raise RuntimeError(
                'No "meta" key in checkpoints or no "kfold" in the meta dict. '
                'Please check if the resume checkpoint from a k-fold '
                'cross-valid experiment.')
        resume_fold = resume_kfold['fold']
        # Explicit check instead of ``assert`` so that it survives ``-O``.
        saved_splits = resume_kfold['num_splits']
        if args.num_splits != saved_splits:
            raise ValueError(
                f'`--num-splits` ({args.num_splits}) does not match the '
                f'number of splits stored in the checkpoint '
                f'({saved_splits}).')
    else:
        resume_fold = 0

    if args.gpus is not None:
        cfg.gpu_ids = range(1)
        warnings.warn('`--gpus` is deprecated because we only support '
                      'single GPU mode in non-distributed training. '
                      'Use `gpus=1` now.')
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids[0:1]
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
                      'Because we only support single GPU mode in '
                      'non-distributed training. Use the first GPU '
                      'in `gpu_ids` now.')
    if args.gpus is None and args.gpu_ids is None:
        cfg.gpu_ids = [args.gpu_id]

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)

    # init a unified random seed
    seed = init_random_seed(args.seed)

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))

    if args.fold is not None:
        folds = [args.fold]
    else:
        folds = range(resume_fold, args.num_splits)

    for fold in folds:
        # Every fold trains on its own copy because ``train_single_fold``
        # modifies the config in place.
        cfg_ = copy_config(cfg)
        # Only the fold recorded in the checkpoint is resumed; every other
        # fold starts from scratch.
        if fold != resume_fold:
            cfg_.resume_from = None
        train_single_fold(args, cfg_, fold, distributed, seed)

    if args.fold is None:
        summary(args, cfg)
# Run the tool only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()