mmpretrain/tools/train.py

# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy
import os
import os.path as osp
import time
import warnings

import mmcv
import torch
import torch.distributed as dist
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist

from mmcls import __version__
from mmcls.apis import init_random_seed, set_random_seed, train_model
from mmcls.datasets import build_dataset
from mmcls.models import build_classifier
from mmcls.utils import collect_env, get_root_logger, setup_multi_processes


def parse_args():
    """Parse the command-line arguments of the training script.

    Besides parsing, the parsed local rank is exported to the ``LOCAL_RANK``
    environment variable when that variable is not already set.

    Returns:
        argparse.Namespace: The parsed command-line arguments.
    """
    parser = argparse.ArgumentParser(description='Train a model')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    parser.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')
    # Only one of the device-selection options may be given at a time.
    group_gpus = parser.add_mutually_exclusive_group()
    group_gpus.add_argument(
        '--device', help='device used for training. (Deprecated)')
    group_gpus.add_argument(
        '--gpus',
        type=int,
        help='(Deprecated, please use --gpu-id) number of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='(Deprecated, please use --gpu-id) ids of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-id',
        type=int,
        default=0,
        help='id of gpu to use '
        '(only applicable to non-distributed training)')
    parser.add_argument(
        '--ipu-replicas',
        type=int,
        default=None,
        help='num of ipu replicas to use')
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    parser.add_argument(
        '--diff-seed',
        action='store_true',
        help='Whether or not set different seeds for different ranks')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # Launchers disagree on the spelling of this flag: older
    # ``torch.distributed.launch`` passes ``--local_rank`` while newer
    # releases pass ``--local-rank``. argparse does not treat them as
    # equivalent, so register both; the destination stays ``local_rank``.
    parser.add_argument(
        '--local_rank', '--local-rank', type=int, default=0)
    args = parser.parse_args()
    # Do not override a value already provided through the environment.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    return args


def main():
    """Entry point: load the config, set up the run and start training.

    The steps are, in order: parse CLI arguments, load the config and apply
    CLI overrides, optionally initialise the distributed environment, create
    the work directory and logger, set the random seed, build the model and
    dataset(s), and finally hand everything to ``train_model``.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # set multi-process settings
    setup_multi_processes(cfg)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    # Resolve the GPU selection. The deprecated options are still accepted
    # but always collapse to a single GPU.
    if args.gpus is not None:
        cfg.gpu_ids = range(1)
        warnings.warn('`--gpus` is deprecated because we only support '
                      'single GPU mode in non-distributed training. '
                      'Use `gpus=1` now.')
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids[0:1]
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
                      'Because we only support single GPU mode in '
                      'non-distributed training. Use the first GPU '
                      'in `gpu_ids` now.')
    if args.gpus is None and args.gpu_ids is None:
        cfg.gpu_ids = [args.gpu_id]

    # Requesting IPU replicas forces the IPU device.
    if args.ipu_replicas is not None:
        cfg.ipu_replicas = args.ipu_replicas
        args.device = 'ipu'

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        # In distributed mode gpu_ids is overwritten to span all ranks.
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
    # dump config
    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg.pretty_text}')

    # set random seeds
    seed = init_random_seed(args.seed)
    if args.diff_seed:
        # Take the rank from get_dist_info() instead of calling
        # torch.distributed.get_rank() directly: the latter raises when no
        # process group has been initialised (i.e. `--launcher none`).
        rank, _ = get_dist_info()
        seed = seed + rank
    logger.info(f'Set random seed to {seed}, '
                f'deterministic: {args.deterministic}')
    set_random_seed(seed, deterministic=args.deterministic)
    cfg.seed = seed
    meta['seed'] = seed

    model = build_classifier(cfg.model)
    model.init_weights()

    datasets = [build_dataset(cfg.data.train)]
    if len(cfg.workflow) == 2:
        # A two-stage workflow also needs the val split, built with the
        # training pipeline.
        val_dataset = copy.deepcopy(cfg.data.val)
        val_dataset.pipeline = cfg.data.train.pipeline
        datasets.append(build_dataset(val_dataset))

    # save mmcls version, config file content and class names in
    # runner as meta data
    meta.update(
        dict(
            mmcls_version=__version__,
            config=cfg.pretty_text,
            CLASSES=datasets[0].CLASSES))

    # start training
    train_model(
        model,
        datasets,
        cfg,
        distributed=distributed,
        validate=(not args.no_validate),
        timestamp=timestamp,
        device=args.device,
        meta=meta)


# Run training only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
[Docs] Add Copyright information. (#413) 2021-08-17 19:52:42 +08:00			`# Copyright (c) OpenMMLab. All rights reserved.`
init commit 2020-05-21 21:21:43 +08:00			`import argparse`
			`import copy`
			`import os`
			`import os.path as osp`
			`import time`
[Feature] Add ResNetV1c. (#692) * add ResNetV1c * add unit tests * fix lint * update docstring * fix lint 2022-02-23 11:36:33 +08:00			`import warnings`
init commit 2020-05-21 21:21:43 +08:00
			`import mmcv`
			`import torch`
[Feature] Add diff seeds to diff ranks. (#744) * [Feature] Add diff seeds to diff ranks. * lint 2022-03-25 15:31:13 +08:00			`import torch.distributed as dist`
init commit 2020-05-21 21:21:43 +08:00			`from mmcv import Config, DictAction`
Fix bug in gpu_ids in distributed training (#107) * update gpu_ids in distributed training * move linear scaling rule after getting correct gpu_ids * Remove support for autoscale_lr 2020-12-09 16:28:03 +08:00			`from mmcv.runner import get_dist_info, init_dist`
init commit 2020-05-21 21:21:43 +08:00
			`from mmcls import __version__`
[Enchance] Set a random seed when the user does not set a seed. (#554) 2021-12-02 18:09:55 +08:00			`from mmcls.apis import init_random_seed, set_random_seed, train_model`
init commit 2020-05-21 21:21:43 +08:00			`from mmcls.datasets import build_dataset`
Add classifiers, heads, necks and losses 2020-07-07 19:32:06 +08:00			`from mmcls.models import build_classifier`
[Enhance] Add setup multi-processing both in train and test. (#671) 2022-01-27 10:18:36 +08:00			`from mmcls.utils import collect_env, get_root_logger, setup_multi_processes`
init commit 2020-05-21 21:21:43 +08:00

			`def parse_args():`
			`parser = argparse.ArgumentParser(description='Train a model')`
			`parser.add_argument('config', help='train config file path')`
			`parser.add_argument('--work-dir', help='the dir to save logs and models')`
			`parser.add_argument(`
			`'--resume-from', help='the checkpoint file to resume from')`
			`parser.add_argument(`
			`'--no-validate',`
			`action='store_true',`
			`help='whether not to evaluate the checkpoint during training')`
			`group_gpus = parser.add_mutually_exclusive_group()`
[Enhance] New-style CPU training and inference. (#674) * [Enhance] New-style CPU training and inference. * Add version check in CPU training/test 2022-01-30 20:49:54 +08:00			`group_gpus.add_argument(`
			`'--device', help='device used for training. (Deprecated)')`
init commit 2020-05-21 21:21:43 +08:00			`group_gpus.add_argument(`
			`'--gpus',`
			`type=int,`
[Refactor] Use `--gpu-id` instead of `--gpu-ids` in non-distributed multi-gpu training/testing. (#688) 2022-02-17 02:17:01 +08:00			`help='(Deprecated, please use --gpu-id) number of gpus to use '`
init commit 2020-05-21 21:21:43 +08:00			`'(only applicable to non-distributed training)')`
			`group_gpus.add_argument(`
			`'--gpu-ids',`
			`type=int,`
			`nargs='+',`
[Refactor] Use `--gpu-id` instead of `--gpu-ids` in non-distributed multi-gpu training/testing. (#688) 2022-02-17 02:17:01 +08:00			`help='(Deprecated, please use --gpu-id) ids of gpus to use '`
			`'(only applicable to non-distributed training)')`
			`group_gpus.add_argument(`
			`'--gpu-id',`
			`type=int,`
			`default=0,`
			`help='id of gpu to use '`
init commit 2020-05-21 21:21:43 +08:00			`'(only applicable to non-distributed training)')`
[Enhance] Support training on IPU and add fine-tuning configs of ViT. (#723) * implement training and evaluation on IPU * fp16 SOTA * Tput reaches 5600 * 123 * add poptorch dataloder * change ipu_replicas to ipu-replicas * add noqa to config long line(website) * remove ipu dataloder test code * del one blank line in test_builder * refine the dataloder initialization * fix a typo * refine args for dataloder * remove an annoted line * process one more conflict * adjust code structure in mmcv.ipu * adjust ipu code structure in mmcv * IPUDataloader to IPUDataLoader * align with mmcv * adjust according to mmcv * mmcv code structre fixed Co-authored-by: hudi <dihu@graphcore.ai> 2022-04-29 22:22:19 +08:00			`parser.add_argument(`
			`'--ipu-replicas',`
			`type=int,`
			`default=None,`
			`help='num of ipu replicas to use')`
init commit 2020-05-21 21:21:43 +08:00			`parser.add_argument('--seed', type=int, default=None, help='random seed')`
[Feature] Add diff seeds to diff ranks. (#744) * [Feature] Add diff seeds to diff ranks. * lint 2022-03-25 15:31:13 +08:00			`parser.add_argument(`
			`'--diff-seed',`
			`action='store_true',`
			`help='Whether or not set different seeds for different ranks')`
init commit 2020-05-21 21:21:43 +08:00			`parser.add_argument(`
			`'--deterministic',`
			`action='store_true',`
			`help='whether to set deterministic options for CUDNN backend.')`
[Enhance] Rename the option `--options` in some tools to `--cfg-options`. (#425) 2021-09-01 12:49:03 +08:00			`parser.add_argument(`
			`'--cfg-options',`
			`nargs='+',`
			`action=DictAction,`
			`help='override some settings in the used config, the key-value pair '`
			`'in xxx=yyy format will be merged into config file. If the value to '`
			`'be overwritten is a list, it should be like key="[a,b]" or key=a,b '`
			`'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '`
			`'Note that the quotation marks are necessary and that no white space '`
			`'is allowed.')`
init commit 2020-05-21 21:21:43 +08:00			`parser.add_argument(`
			`'--launcher',`
			`choices=['none', 'pytorch', 'slurm', 'mpi'],`
			`default='none',`
			`help='job launcher')`
[Enhance] Use `--a-b` instead of `--a_b` in arguments. (#754) 2022-03-30 19:16:28 +08:00			`parser.add_argument('--local-rank', type=int, default=0)`
init commit 2020-05-21 21:21:43 +08:00			`args = parser.parse_args()`
			`if 'LOCAL_RANK' not in os.environ:`
			`os.environ['LOCAL_RANK'] = str(args.local_rank)`

			`return args`


			`def main():`
			`args = parse_args()`

			`cfg = Config.fromfile(args.config)`
[Enhance] Rename the option `--options` in some tools to `--cfg-options`. (#425) 2021-09-01 12:49:03 +08:00			`if args.cfg_options is not None:`
			`cfg.merge_from_dict(args.cfg_options)`
[Enhance] Add setup multi-processing both in train and test. (#671) 2022-01-27 10:18:36 +08:00
			`# set multi-process settings`
			`setup_multi_processes(cfg)`

init commit 2020-05-21 21:21:43 +08:00			`# set cudnn_benchmark`
			`if cfg.get('cudnn_benchmark', False):`
			`torch.backends.cudnn.benchmark = True`

			`# work_dir is determined in this priority: CLI > segment in file > filename`
			`if args.work_dir is not None:`
			`# update configs according to CLI args if args.work_dir is not None`
			`cfg.work_dir = args.work_dir`
			`elif cfg.get('work_dir', None) is None:`
			`# use config filename as default work_dir if cfg.work_dir is None`
			`cfg.work_dir = osp.join('./work_dirs',`
			`osp.splitext(osp.basename(args.config))[0])`
			`if args.resume_from is not None:`
			`cfg.resume_from = args.resume_from`
[Refactor] Use `--gpu-id` instead of `--gpu-ids` in non-distributed multi-gpu training/testing. (#688) 2022-02-17 02:17:01 +08:00			`if args.gpus is not None:`
			`cfg.gpu_ids = range(1)`
			warnings.warn('`--gpus` is deprecated because we only support '
			`'single GPU mode in non-distributed training. '`
			'Use `gpus=1` now.')
init commit 2020-05-21 21:21:43 +08:00			`if args.gpu_ids is not None:`
[Refactor] Use `--gpu-id` instead of `--gpu-ids` in non-distributed multi-gpu training/testing. (#688) 2022-02-17 02:17:01 +08:00			`cfg.gpu_ids = args.gpu_ids[0:1]`
			warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
			`'Because we only support single GPU mode in '`
			`'non-distributed training. Use the first GPU '`
			'in `gpu_ids` now.')
			`if args.gpus is None and args.gpu_ids is None:`
			`cfg.gpu_ids = [args.gpu_id]`
init commit 2020-05-21 21:21:43 +08:00
[Enhance] Support training on IPU and add fine-tuning configs of ViT. (#723) * implement training and evaluation on IPU * fp16 SOTA * Tput reaches 5600 * 123 * add poptorch dataloder * change ipu_replicas to ipu-replicas * add noqa to config long line(website) * remove ipu dataloder test code * del one blank line in test_builder * refine the dataloder initialization * fix a typo * refine args for dataloder * remove an annoted line * process one more conflict * adjust code structure in mmcv.ipu * adjust ipu code structure in mmcv * IPUDataloader to IPUDataLoader * align with mmcv * adjust according to mmcv * mmcv code structre fixed Co-authored-by: hudi <dihu@graphcore.ai> 2022-04-29 22:22:19 +08:00			`if args.ipu_replicas is not None:`
			`cfg.ipu_replicas = args.ipu_replicas`
			`args.device = 'ipu'`

init commit 2020-05-21 21:21:43 +08:00			`# init distributed env first, since logger depends on the dist info.`
			`if args.launcher == 'none':`
			`distributed = False`
			`else:`
			`distributed = True`
			`init_dist(args.launcher, **cfg.dist_params)`
Fix bug in gpu_ids in distributed training (#107) * update gpu_ids in distributed training * move linear scaling rule after getting correct gpu_ids * Remove support for autoscale_lr 2020-12-09 16:28:03 +08:00			`_, world_size = get_dist_info()`
			`cfg.gpu_ids = range(world_size)`
init commit 2020-05-21 21:21:43 +08:00
			`# create work_dir`
			`mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))`
dump config before training (#282) 2021-06-04 16:26:13 +08:00			`# dump config`
			`cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))`
init commit 2020-05-21 21:21:43 +08:00			`# init the logger before other steps`
			`timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())`
			`log_file = osp.join(cfg.work_dir, f'{timestamp}.log')`
			`logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)`

			`# init the meta dict to record some important information such as`
			`# environment info and seed, which will be logged`
			`meta = dict()`
			`# log env info`
			`env_info_dict = collect_env()`
			`env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])`
			`dash_line = '-' * 60 + '\n'`
			`logger.info('Environment info:\n' + dash_line + env_info + '\n' +`
			`dash_line)`
			`meta['env_info'] = env_info`

			`# log some basic info`
			`logger.info(f'Distributed training: {distributed}')`
			`logger.info(f'Config:\n{cfg.pretty_text}')`

			`# set random seeds`
[Enchance] Set a random seed when the user does not set a seed. (#554) 2021-12-02 18:09:55 +08:00			`seed = init_random_seed(args.seed)`
[Feature] Add diff seeds to diff ranks. (#744) * [Feature] Add diff seeds to diff ranks. * lint 2022-03-25 15:31:13 +08:00			`seed = seed + dist.get_rank() if args.diff_seed else seed`
[Enchance] Set a random seed when the user does not set a seed. (#554) 2021-12-02 18:09:55 +08:00			`logger.info(f'Set random seed to {seed}, '`
			`f'deterministic: {args.deterministic}')`
			`set_random_seed(seed, deterministic=args.deterministic)`
			`cfg.seed = seed`
			`meta['seed'] = seed`
init commit 2020-05-21 21:21:43 +08:00
Add classifiers, heads, necks and losses 2020-07-07 19:32:06 +08:00			`model = build_classifier(cfg.model)`
[WIP] Refactoring weights initialization (#270) * [WIP] Refactoring weights initialization * fix lint and constant init cfg * fix pretrained bug * fix typo * fix isort * revise model utils 2021-06-10 10:54:34 +08:00			`model.init_weights()`
init commit 2020-05-21 21:21:43 +08:00
			`datasets = [build_dataset(cfg.data.train)]`
			`if len(cfg.workflow) == 2:`
			`val_dataset = copy.deepcopy(cfg.data.val)`
			`val_dataset.pipeline = cfg.data.train.pipeline`
			`datasets.append(build_dataset(val_dataset))`
[Refactor] Save class names in best checkpoint created by evaluation hook (#641) * Save class names in best checkpoint created by evaluation hook * Save meta info into runner instaed of checkpoint_config 2022-01-13 12:20:59 +08:00
			`# save mmcls version, config file content and class names in`
			`# runner as meta data`
			`meta.update(`
			`dict(`
init commit 2020-05-21 21:21:43 +08:00			`mmcls_version=__version__,`
			`config=cfg.pretty_text,`
[Refactor] Save class names in best checkpoint created by evaluation hook (#641) * Save class names in best checkpoint created by evaluation hook * Save meta info into runner instaed of checkpoint_config 2022-01-13 12:20:59 +08:00			`CLASSES=datasets[0].CLASSES))`

init commit 2020-05-21 21:21:43 +08:00			`# add an attribute for visualization convenience`
			`train_model(`
			`model,`
			`datasets,`
			`cfg,`
			`distributed=distributed,`
			`validate=(not args.no_validate),`
			`timestamp=timestamp,`
[Enhance] Support training on IPU and add fine-tuning configs of ViT. (#723) * implement training and evaluation on IPU * fp16 SOTA * Tput reaches 5600 * 123 * add poptorch dataloder * change ipu_replicas to ipu-replicas * add noqa to config long line(website) * remove ipu dataloder test code * del one blank line in test_builder * refine the dataloder initialization * fix a typo * refine args for dataloder * remove an annoted line * process one more conflict * adjust code structure in mmcv.ipu * adjust ipu code structure in mmcv * IPUDataloader to IPUDataLoader * align with mmcv * adjust according to mmcv * mmcv code structre fixed Co-authored-by: hudi <dihu@graphcore.ai> 2022-04-29 22:22:19 +08:00			`device=args.device,`
init commit 2020-05-21 21:21:43 +08:00			`meta=meta)`


			`if __name__ == '__main__':`
			`main()`