diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst
index d079936a..2e14a9b3 100644
--- a/docs/zh_cn/index.rst
+++ b/docs/zh_cn/index.rst
@@ -59,6 +59,12 @@
    design/visualization.md
    design/logging.md
 
+.. toctree::
+   :maxdepth: 1
+   :caption: Migration Guide
+
+   migration/migrate_runner_from_mmcv.md
+
 .. toctree::
    :maxdepth: 2
    :caption: API Documentation
diff --git a/docs/zh_cn/migration/migrate_runner_from_mmcv.md b/docs/zh_cn/migration/migrate_runner_from_mmcv.md
new file mode 100644
index 00000000..223e1854
--- /dev/null
+++ b/docs/zh_cn/migration/migrate_runner_from_mmcv.md
@@ -0,0 +1,1478 @@
+# Migrating the Runner from MMCV to MMEngine
+
+## Introduction
+
+As the number of supported deep learning tasks grows and users' needs keep expanding, we have higher requirements on the flexibility and versatility of the existing Runner in MMCV.
+Therefore, MMEngine implements a more general and flexible Runner on top of MMCV to support more complicated training processes.
+The Runner in MMEngine has a larger scope and takes on more responsibilities: we abstract the [training loops (EpochBasedTrainLoop/IterBasedTrainLoop)](mmengine.runner.EpochBasedTrainLoop), the [validation loop (ValLoop)](mmengine.runner.ValLoop) and the [test loop (TestLoop)](mmengine.runner.TestLoop) so that users can conveniently customize the execution flow of their models.
+
+We will first introduce how to migrate the entry point of an algorithm library from MMCV to MMEngine, so as to simplify and unify the entry-point code as much as possible.
+Then we will introduce in detail the differences between MMCV and MMEngine in building the Runner and its internal components for training.
+Before starting the migration, we recommend reading the [Runner tutorial](../tutorials/runner.md) first.
+
+## Entry point
+
+Taking MMDet as an example, we first show the differences in the config files and the training launch scripts before and after the refactoring based on MMEngine:
+
+### Migrating the config file
+
+<table class="docutils">
+<thead>
+  <tr>
+    <th>Overview of the config file based on the MMCV Runner</th>
+    <th>Overview of the config file based on the MMEngine Runner</th>
+  </tr>
+</thead>
+<tbody>
+<tr>
+  <td>default_runtime.py
+
+```python
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+custom_hooks = [dict(type='NumClassCheckHook')]
+
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+
+opencv_num_threads = 0
+mp_start_method = 'fork'
+auto_scale_lr = dict(enable=False, base_batch_size=16)
+```
+
+</td>
+  <td>
+
+```python
+default_scope = 'mmdet'
+
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='DetVisualizationHook'))
+
+env_cfg = dict(
+    cudnn_benchmark=False,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
+
+log_level = 'INFO'
+load_from = None
+resume = False
+```
+
+</td>
+</tr>
+<tr>
+  <td>scheduler.py
+
+```python
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
+```
+
+</td>
+  <td>
+
+```python
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+#   or not by default.
+# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
+```
+
+</td>
+</tr>
+<tr>
+  <td>coco_detection.py
+
+```python
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')
+```
+
+</td>
+  <td>
+
+```python
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+
+file_client_args = dict(backend='disk')
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', file_client_args=file_client_args),
+    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+    # If you don't have a gt annotation, delete the pipeline
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type='CocoMetric',
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    format_only=False)
+test_evaluator = val_evaluator
+```
+
+</td>
+</tr>
+</tbody>
+</table>
+
+The Runner in MMEngine provides more customizable components, including the configuration of the training, validation and test procedures as well as of the dataloaders, so the config file is somewhat longer than before.
+To make the config easy to understand and read, we follow the WYSIWYG (what you see is what you get) principle and reorganize the hierarchy of the component configs, so that most first-level fields correspond to key attributes of the Runner, such as the dataloaders, the evaluators, the loop configs and the hook configs.
+These configs all have default values in the OpenMMLab 2.0 algorithm libraries, so users usually do not need to care about most of the parameters.
+
+### Migrating the launch script
+
+Compared with the MMCV Runner, the MMEngine Runner takes on more responsibilities, such as building the `DataLoader` and building the distributed model. Therefore we need to specify more parameters in the config file, e.g. the `sampler` and `batch_sampler` of the `DataLoader`, and no longer need to implement the code that builds the `DataLoader` in the training launch script. Taking the training launch script of MMDet as an example:
+
+<table class="docutils">
+<thead>
+  <tr>
+    <th>Launch script based on the MMCV Runner</th>
+    <th>Launch script based on the MMEngine Runner</th>
+  </tr>
+</thead>
+<tbody>
+<tr>
+  <td>tools/train.py
+
+```python
+args = parse_args()
+
+cfg = Config.fromfile(args.config)
+
+# replace the ${key} with the value of cfg.key
+cfg = replace_cfg_vals(cfg)
+
+# update data root according to MMDET_DATASETS
+update_data_root(cfg)
+
+if args.cfg_options is not None:
+    cfg.merge_from_dict(args.cfg_options)
+
+if args.auto_scale_lr:
+    if 'auto_scale_lr' in cfg and \
+            'enable' in cfg.auto_scale_lr and \
+            'base_batch_size' in cfg.auto_scale_lr:
+        cfg.auto_scale_lr.enable = True
+    else:
+        warnings.warn('Can not find "auto_scale_lr" or '
+                      '"auto_scale_lr.enable" or '
+                      '"auto_scale_lr.base_batch_size" in your'
+                      ' configuration file. Please update all the '
+                      'configuration files to mmdet >= 2.24.1.')
+
+# set multi-process settings
+setup_multi_processes(cfg)
+
+# set cudnn_benchmark
+if cfg.get('cudnn_benchmark', False):
+    torch.backends.cudnn.benchmark = True
+
+# work_dir is determined in this priority: CLI > segment in file > filename
+if args.work_dir is not None:
+    # update configs according to CLI args if args.work_dir is not None
+    cfg.work_dir = args.work_dir
+elif cfg.get('work_dir', None) is None:
+    # use config filename as default work_dir if cfg.work_dir is None
+    cfg.work_dir = osp.join('./work_dirs',
+                            osp.splitext(osp.basename(args.config))[0])
+
+if args.resume_from is not None:
+    cfg.resume_from = args.resume_from
+cfg.auto_resume = args.auto_resume
+if args.gpus is not None:
+    cfg.gpu_ids = range(1)
+    warnings.warn('`--gpus` is deprecated because we only support '
+                  'single GPU mode in non-distributed training. '
+                  'Use `gpus=1` now.')
+if args.gpu_ids is not None:
+    cfg.gpu_ids = args.gpu_ids[0:1]
+    warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
+                  'Because we only support single GPU mode in '
+                  'non-distributed training. Use the first GPU '
+                  'in `gpu_ids` now.')
+if args.gpus is None and args.gpu_ids is None:
+    cfg.gpu_ids = [args.gpu_id]
+
+# init distributed env first, since logger depends on the dist info.
+if args.launcher == 'none':
+    distributed = False
+else:
+    distributed = True
+    init_dist(args.launcher, **cfg.dist_params)
+    # re-set gpu_ids with distributed training mode
+    _, world_size = get_dist_info()
+    cfg.gpu_ids = range(world_size)
+
+# create work_dir
+mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+# dump config
+cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+# init the logger before other steps
+timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
+
+# init the meta dict to record some important information such as
+# environment info and seed, which will be logged
+meta = dict()
+# log env info
+env_info_dict = collect_env()
+env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+dash_line = '-' * 60 + '\n'
+logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+            dash_line)
+meta['env_info'] = env_info
+meta['config'] = cfg.pretty_text
+# log some basic info
+logger.info(f'Distributed training: {distributed}')
+logger.info(f'Config:\n{cfg.pretty_text}')
+
+cfg.device = get_device()
+# set random seeds
+seed = init_random_seed(args.seed, device=cfg.device)
+seed = seed + dist.get_rank() if args.diff_seed else seed
+logger.info(f'Set random seed to {seed}, '
+            f'deterministic: {args.deterministic}')
+set_random_seed(seed, deterministic=args.deterministic)
+cfg.seed = seed
+meta['seed'] = seed
+meta['exp_name'] = osp.basename(args.config)
+
+model = build_detector(
+    cfg.model,
+    train_cfg=cfg.get('train_cfg'),
+    test_cfg=cfg.get('test_cfg'))
+model.init_weights()
+
+datasets = []
+train_detector(
+    model,
+    datasets,
+    cfg,
+    distributed=distributed,
+    validate=(not args.no_validate),
+    timestamp=timestamp,
+    meta=meta)
+```
+
+</td>
+  <td>
+
+```python
+args = parse_args()
+
+# register all modules in mmdet into the registries
+# do not init the default scope here because it will be init in the runner
+register_all_modules(init_default_scope=False)
+
+# load config
+cfg = Config.fromfile(args.config)
+cfg.launcher = args.launcher
+if args.cfg_options is not None:
+    cfg.merge_from_dict(args.cfg_options)
+
+# work_dir is determined in this priority: CLI > segment in file > filename
+if args.work_dir is not None:
+    # update configs according to CLI args if args.work_dir is not None
+    cfg.work_dir = args.work_dir
+elif cfg.get('work_dir', None) is None:
+    # use config filename as default work_dir if cfg.work_dir is None
+    cfg.work_dir = osp.join('./work_dirs',
+                            osp.splitext(osp.basename(args.config))[0])
+
+# enable automatic-mixed-precision training
+if args.amp is True:
+    optim_wrapper = cfg.optim_wrapper.type
+    if optim_wrapper == 'AmpOptimWrapper':
+        print_log(
+            'AMP training is already enabled in your config.',
+            logger='current',
+            level=logging.WARNING)
+    else:
+        assert optim_wrapper == 'OptimWrapper', (
+            '`--amp` is only supported when the optimizer wrapper type is '
+            f'`OptimWrapper` but got {optim_wrapper}.')
+        cfg.optim_wrapper.type = 'AmpOptimWrapper'
+        cfg.optim_wrapper.loss_scale = 'dynamic'
+
+# enable automatically scaling LR
+if args.auto_scale_lr:
+    if 'auto_scale_lr' in cfg and \
+            'enable' in cfg.auto_scale_lr and \
+            'base_batch_size' in cfg.auto_scale_lr:
+        cfg.auto_scale_lr.enable = True
+    else:
+        raise RuntimeError('Can not find "auto_scale_lr" or '
+                           '"auto_scale_lr.enable" or '
+                           '"auto_scale_lr.base_batch_size" in your'
+                           ' configuration file.')
+
+cfg.resume = args.resume
+
+# build the runner from config
+if 'runner_type' not in cfg:
+    # build the default runner
+    runner = Runner.from_cfg(cfg)
+else:
+    # build customized runner from the registry
+    # if 'runner_type' is set in the cfg
+    runner = RUNNERS.build(cfg)
+
+# start training
+runner.train()
+```
+
+</td>
+</tr>
+<tr>
+  <td>apis/train.py
+
+```python
+def init_random_seed(...):
+    ...
+
+def set_random_seed(...):
+    ...
+
+# define function tools.
+...
+
+
+def train_detector(model,
+                   dataset,
+                   cfg,
+                   distributed=False,
+                   validate=False,
+                   timestamp=None,
+                   meta=None):
+
+    cfg = compat_cfg(cfg)
+    logger = get_root_logger(log_level=cfg.log_level)
+
+    # put model on gpus
+    if distributed:
+        find_unused_parameters = cfg.get('find_unused_parameters', False)
+        # Sets the `find_unused_parameters` parameter in
+        # torch.nn.parallel.DistributedDataParallel
+        model = build_ddp(
+            model,
+            cfg.device,
+            device_ids=[int(os.environ['LOCAL_RANK'])],
+            broadcast_buffers=False,
+            find_unused_parameters=find_unused_parameters)
+    else:
+        model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids)
+
+    # build optimizer
+    auto_scale_lr(cfg, distributed, logger)
+    optimizer = build_optimizer(model, cfg.optimizer)
+
+    runner = build_runner(
+        cfg.runner,
+        default_args=dict(
+            model=model,
+            optimizer=optimizer,
+            work_dir=cfg.work_dir,
+            logger=logger,
+            meta=meta))
+
+    # an ugly workaround to make .log and .log.json filenames the same
+    runner.timestamp = timestamp
+
+    # fp16 setting
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        optimizer_config = Fp16OptimizerHook(
+            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
+    elif distributed and 'type' not in cfg.optimizer_config:
+        optimizer_config = OptimizerHook(**cfg.optimizer_config)
+    else:
+        optimizer_config = cfg.optimizer_config
+
+    # register hooks
+    runner.register_training_hooks(
+        cfg.lr_config,
+        optimizer_config,
+        cfg.checkpoint_config,
+        cfg.log_config,
+        cfg.get('momentum_config', None),
+        custom_hooks_config=cfg.get('custom_hooks', None))
+
+    if distributed:
+        if isinstance(runner, EpochBasedRunner):
+            runner.register_hook(DistSamplerSeedHook())
+
+    # register eval hooks
+    if validate:
+        val_dataloader_default_args = dict(
+            samples_per_gpu=1,
+            workers_per_gpu=2,
+            dist=distributed,
+            shuffle=False,
+            persistent_workers=False)
+
+        val_dataloader_args = {
+            **val_dataloader_default_args,
+            **cfg.data.get('val_dataloader', {})
+        }
+        # Support batch_size > 1 in validation
+
+        if val_dataloader_args['samples_per_gpu'] > 1:
+            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+            cfg.data.val.pipeline = replace_ImageToTensor(
+                cfg.data.val.pipeline)
+        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
+
+        val_dataloader = build_dataloader(val_dataset, **val_dataloader_args)
+        eval_cfg = cfg.get('evaluation', {})
+        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
+        eval_hook = DistEvalHook if distributed else EvalHook
+        # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the
+        # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'.
+        runner.register_hook(
+            eval_hook(val_dataloader, **eval_cfg), priority='LOW')
+
+    resume_from = None
+    if cfg.resume_from is None and cfg.get('auto_resume'):
+        resume_from = find_latest_checkpoint(cfg.work_dir)
+    if resume_from is not None:
+        cfg.resume_from = resume_from
+
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run(data_loaders, cfg.workflow)
+```
+
+</td>
+  <td>
+
+```python
+# `apis/train.py` is removed in `mmengine`
+```
+
+</td>
+</tr>
+</tbody>
+</table>
+
+The table above compares the MMDet launch scripts based on the MMCV Runner and on the MMEngine Runner.
+The algorithm libraries in OpenMMLab 1.x each implemented their own runner building and training pipeline, which contained a lot of redundant code. The MMEngine Runner therefore implements these routine steps internally to unify the execution flow across algorithm libraries, such as setting the random seed, initializing the distributed environment and building the `DataLoader`. Downstream algorithm libraries no longer need to implement this code in the training launch script; configuring the Runner's constructor arguments is enough to execute the corresponding steps.
+The launch script based on the MMEngine Runner not only simplifies the code in `tools/train.py`, it even allows `apis/train.py` to be removed entirely, which greatly simplifies the training launch script. Likewise, when developing our own codebase on top of MMEngine, we can set the random seed and initialize the distributed environment by configuring the Runner's arguments, without implementing the corresponding code ourselves.
+
+## Migrating the Runner
+
+This section mainly introduces the differences between the MMCV Runner and the MMEngine Runner in the training, validation and test flows.
+When training and testing models with the MMCV Runner and the MMEngine Runner, the following steps differ noticeably:
+
+01. [Prepare the logger](#prepare-the-logger)
+02. [Set the random seed](#set-the-random-seed)
+03. [Initialize environment variables](#initialize-the-training-environment)
+04. [Prepare the data](#prepare-the-data)
+05. [Prepare the model](#prepare-the-model)
+06. [Prepare the optimizer](#prepare-the-optimizer)
+07. [Prepare the hooks](#prepare-the-training-hooks)
+08. [Prepare the validation/test modules](#prepare-the-validation-module)
+09. [Build the Runner](#build-the-runner)
+10. [Start training](#train-with-the-runner), [start testing](#test-with-the-runner)
+11. [Migrate a custom training flow](#migrating-a-custom-execution-flow)
+
+The rest of this tutorial describes the differences in each of these steps in detail.
+
+### Prepare the logger
+
+**Preparing the logger in MMCV**
+
+MMCV needs to call the `get_logger` interface in the training script to obtain a logger and use it to output and record the training environment.
+
+```python
+logger = get_logger(name='custom', log_file=log_file, log_level=cfg.log_level)
+env_info_dict = collect_env()
+env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+dash_line = '-' * 60 + '\n'
+logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+            dash_line)
+```
+
+The logger also has to be passed in when constructing the Runner.
+
+```python
+runner = Runner(
+    ...
+    logger=logger
+    ...)
+```
+
+**Preparing the logger in MMEngine**
+
+Pass the log level of the logger when building the Runner. The Runner then creates the logger automatically and uses it to output and record the training environment.
+
+```python
+log_level = 'INFO'
+```
+
+### Set the random seed
+
+**Setting the random seed in MMCV**
+
+Set the random seed manually in the training script:
+
+```python
+...
+seed = init_random_seed(args.seed, device=cfg.device)
+seed = seed + dist.get_rank() if args.diff_seed else seed
+logger.info(f'Set random seed to {seed}, '
+            f'deterministic: {args.deterministic}')
+set_random_seed(seed, deterministic=args.deterministic)
+...
+```
+
+**Setting the random seed in MMEngine**
+
+Configure the `randomness` argument of the Runner; see the [Runner API documentation](mmengine.runner.Runner.set_randomness) for the configuration rules and the sketch below for an example.
+
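+For example, the whole MMCV snippet above collapses into a single constructor argument. A minimal sketch (the other Runner arguments are elided):
+
+```python
+from mmengine.runner import Runner
+
+runner = Runner(
+    randomness=dict(seed=1, deterministic=False, diff_rank_seed=False),
+    # ... model, work_dir, dataloaders and the other arguments
+)
+```
+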
+**Config changes in the OpenMMLab algorithm libraries**
+
+<table class="docutils">
+<thead>
+  <tr>
+    <th>MMCV config</th>
+    <th>MMEngine config</th>
+  </tr>
+</thead>
+<tbody>
+<tr>
+  <td>
+
+```python
+seed = 1
+deterministic = False
+diff_seed = False
+```
+
+</td>
+  <td>
+
+```python
+randomness = dict(seed=1,
+                  deterministic=False,
+                  diff_rank_seed=False)
+```
+
+</td>
+</tr>
+</tbody>
+</table>
+
+In this tutorial, we configure `randomness` as:
+
+```python
+randomness = dict(seed=5)
+```
+
+### Initialize the training environment
+
+**Initializing the training environment in MMCV**
+
+MMCV needs to configure environment variables such as the multi-processing start method and the multi-process communication backend in the training script, initialize the distributed environment before the Runner is built, and wrap the model for distributed training:
+
+```python
+...
+setup_multi_processes(cfg)
+init_dist(cfg.launcher, **cfg.dist_params)
+model = MMDistributedDataParallel(
+    model,
+    device_ids=[int(os.environ['LOCAL_RANK'])],
+    broadcast_buffers=False,
+    find_unused_parameters=find_unused_parameters)
+```
+
+**Initializing the training environment in MMEngine**
+
+MMEngine selects the multi-processing start method and the multi-process communication backend through `env_cfg`, whose default value is `dict(dist_cfg=dict(backend='nccl'))`. See the [Runner API documentation](mmengine.runner.Runner.setup_env) for the configuration rules.
+
+The Runner accepts a `launcher` argument. If its value is not `'none'`, the Runner automatically performs the distributed initialization and the distributed model wrapping during construction. In other words, with the MMEngine Runner we do not need to do anything distributed-related outside the Runner; configuring the `launcher` argument to choose the launch method is enough.
+
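+As a minimal sketch (with the required arguments such as `work_dir` and the dataloaders elided), the `init_dist` and `MMDistributedDataParallel` calls above therefore reduce to two constructor arguments:
+
+```python
+from mmengine.runner import Runner
+
+runner = Runner(
+    model=model,  # the model *before* distributed wrapping
+    launcher='pytorch',  # one of 'none', 'pytorch', 'slurm' and 'mpi'
+    env_cfg=dict(dist_cfg=dict(backend='nccl')),
+    # ... work_dir, dataloaders and the other arguments
+)
+```
+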
+**Config changes in the OpenMMLab algorithm libraries**
+
+<table class="docutils">
+<thead>
+  <tr>
+    <th>MMCV config</th>
+    <th>MMEngine config</th>
+  </tr>
+</thead>
+<tbody>
+<tr>
+  <td>
+
+```python
+launcher = 'pytorch'  # enable distributed training
+dist_params = dict(backend='nccl')  # choose the communication backend
+```
+
+</td>
+  <td>
+
+```python
+launcher = 'pytorch'
+env_cfg = dict(dist_cfg=dict(backend='nccl'))
+```
+
+</td>
+</tr>
+</tbody>
+</table>
+
+In this tutorial, we configure `env_cfg` as:
+
+```python
+env_cfg = dict(dist_cfg=dict(backend='nccl'))
+```
+
+### Prepare the data
+
+Both the MMCV Runner and the MMEngine Runner accept built `DataLoader` instances, so there is no difference in how the data is prepared:
+
+```python
+import torchvision.transforms as transforms
+from torch.utils.data import DataLoader
+from torchvision.datasets import CIFAR10
+
+transform = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+])
+
+train_dataset = CIFAR10(
+    root='data', train=True, download=True, transform=transform)
+train_dataloader = DataLoader(
+    train_dataset, batch_size=128, shuffle=True, num_workers=2)
+
+val_dataset = CIFAR10(
+    root='data', train=False, download=True, transform=transform)
+val_dataloader = DataLoader(
+    val_dataset, batch_size=128, shuffle=False, num_workers=2)
+```
+
+**Config changes in the OpenMMLab algorithm libraries**
+
+<table class="docutils">
+<thead>
+  <tr>
+    <th>MMCV config</th>
+    <th>MMEngine config</th>
+  </tr>
+</thead>
+<tbody>
+<tr>
+  <td>
+
+```python
+data = dict(
+    samples_per_gpu=2,  # batch size of a single GPU
+    workers_per_gpu=2,  # number of worker processes of the DataLoader
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+```
+
+</td>
+  <td>
+
+```python
+train_dataloader = dict(
+    batch_size=2,  # samples_per_gpu -> batch_size
+    num_workers=2,
+    # whether to keep the worker processes alive after the DataLoader is exhausted
+    persistent_workers=True,
+    # configurable sampler
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    # configurable batch_sampler
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017.json',
+        data_prefix=dict(img='train2017/'),
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    batch_size=1,  # batch size during validation
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,  # whether to drop the last incomplete batch
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline))
+
+test_dataloader = val_dataloader
+```
+
+</td>
+</tr>
+</tbody>
+</table>
+
+Compared with the MMCV-style config, the MMEngine-style config is more complicated, but also more flexible. The building of the `DataLoader` is handled by the `Runner`, so it no longer needs to be implemented by each algorithm library.
+
+### Prepare the model
+
+See [Migrating the model from MMCV to MMEngine](./migrate_model_from_mmcv.md) for details.
+
+```python
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import BaseModel
+
+
+class Model(BaseModel):
+
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, img, label, mode):
+        feat = self.pool(F.relu(self.conv1(img)))
+        feat = self.pool(F.relu(self.conv2(feat)))
+        feat = feat.view(-1, 16 * 5 * 5)
+        feat = F.relu(self.fc1(feat))
+        feat = F.relu(self.fc2(feat))
+        feat = self.fc3(feat)
+        if mode == 'loss':
+            loss = self.loss_fn(feat, label)
+            return dict(loss=loss)
+        else:
+            return [feat.argmax(1)]
+
+model = Model()
+```
+
+Note that for distributed training, the MMCV Runner must receive the model after distributed wrapping, while the MMEngine Runner accepts the model before distributed wrapping and wraps it during Runner instantiation.
+
+### Prepare the optimizer
+
+**Preparing the optimizer in MMCV**
+
+The MMCV Runner can directly accept a PyTorch optimizer when it is constructed, e.g.
+
+```python
+optimizer = SGD(model.parameters(), lr=0.1, momentum=0.9)
+```
+
+For optimizers with complicated configurations, MMCV builds them based on an optimizer constructor:
+
+```python
+optimizer_cfg = dict(
+    optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001),
+    paramwise_cfg=dict(norm_decay_mult=0))
+
+def build_optimizer_constructor(cfg):
+    constructor_type = cfg.get('type')
+    if constructor_type in OPTIMIZER_BUILDERS:
+        return build_from_cfg(cfg, OPTIMIZER_BUILDERS)
+    elif constructor_type in MMCV_OPTIMIZER_BUILDERS:
+        return build_from_cfg(cfg, MMCV_OPTIMIZER_BUILDERS)
+    else:
+        raise KeyError(f'{constructor_type} is not registered '
+                       'in the optimizer builder registry.')
+
+
+def build_optimizer(model, cfg):
+    optimizer_cfg = copy.deepcopy(cfg)
+    constructor_type = optimizer_cfg.pop('constructor',
+                                         'DefaultOptimizerConstructor')
+    paramwise_cfg = optimizer_cfg.pop('paramwise_cfg', None)
+    optim_constructor = build_optimizer_constructor(
+        dict(
+            type=constructor_type,
+            optimizer_cfg=optimizer_cfg,
+            paramwise_cfg=paramwise_cfg))
+    optimizer = optim_constructor(model)
+    return optimizer
+
+optimizer = build_optimizer(model, optimizer_cfg)
+```
+
+**Preparing the optimizer in MMEngine**
+
+The MMEngine Runner is built with an `optim_wrapper` argument, i.e. an [optimizer wrapper](mmengine.optim.OptimWrapper) instance or an optimizer wrapper config. Even for optimizer wrappers with complicated configurations, MMEngine only needs the `optim_wrapper` config. See the [Runner API documentation](mmengine.runner.Runner.build_optim_wrapper) for a detailed introduction to `optim_wrapper`.
+
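+For example, assuming the same SGD plus `paramwise_cfg` setup as the MMCV snippet above, the whole `build_optimizer` machinery reduces to a config dict (a minimal sketch; the keys mirror the comparison table below):
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001),
+    paramwise_cfg=dict(norm_decay_mult=0))
+```
+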
+**Config changes in the OpenMMLab algorithm libraries**
+
+<table class="docutils">
+<thead>
+  <tr>
+    <th>MMCV config</th>
+    <th>MMEngine config</th>
+  </tr>
+</thead>
+<tbody>
+<tr>
+  <td>
+
+```python
+optimizer = dict(
+    constructor='CustomConstructor',
+    type='AdamW',  # optimizer settings are first-level fields
+    lr=0.0001,  # optimizer settings are first-level fields
+    betas=(0.9, 0.999),  # optimizer settings are first-level fields
+    weight_decay=0.05,  # optimizer settings are first-level fields
+    paramwise_cfg={  # arguments of the constructor
+        'decay_rate': 0.95,
+        'decay_type': 'layer_wise',
+        'num_layers': 6
+    })
+# MMCV also needs to configure `optimizer_config`
+# to build the optimizer hook, while MMEngine does not
+optimizer_config = dict(grad_clip=None)
+```
+
+</td>
+  <td>
+
+```python
+optim_wrapper = dict(
+    constructor='CustomConstructor',
+    type='OptimWrapper',  # specify the type of the optimizer wrapper
+    optimizer=dict(  # optimizer settings are gathered in `optimizer`
+        type='AdamW',
+        lr=0.0001,
+        betas=(0.9, 0.999),
+        weight_decay=0.05),
+    paramwise_cfg={
+        'decay_rate': 0.95,
+        'decay_type': 'layer_wise',
+        'num_layers': 6
+    })
+```
+
+</td>
+</tr>
+</tbody>
+</table>
+
+```{note}
+For high-level tasks such as detection and classification, MMCV needs to configure `optimizer_config` to build the optimizer hook, while MMEngine does not.
+```
+
+The `optim_wrapper` used in this tutorial is:
+
+```python
+from torch.optim import SGD
+
+optimizer = SGD(model.parameters(), lr=0.1, momentum=0.9)
+optim_wrapper = dict(optimizer=optimizer)
+```
+
+### Prepare the training hooks
+
+**Preparing training hooks in MMCV:**
+
+The configuration of the commonly used training hooks in MMCV is as follows:
+
+```python
+# learning rate scheduler config
+lr_config = dict(policy='step', step=[2, 3])
+# configuration of optimizer
+optimizer_config = dict(grad_clip=None)
+# configuration of saving checkpoints periodically
+checkpoint_config = dict(interval=1)
+# save log periodically and multiple hooks can be used simultaneously
+log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
+# register hooks to runner and those hooks will be invoked automatically
+runner.register_training_hooks(
+    lr_config=lr_config,
+    optimizer_config=optimizer_config,
+    checkpoint_config=checkpoint_config,
+    log_config=log_config)
+```
+
+Among them:
+
+- `lr_config` configures the `LrUpdaterHook`
+- `optimizer_config` configures the `OptimizerHook`
+- `checkpoint_config` configures the `CheckpointHook`
+- `log_config` configures the `LoggerHook`
+
+Besides the four hooks above, the MMCV Runner comes with an `IterTimerHook`. MMCV requires instantiating the Runner first and then registering the training hooks, while MMEngine configures the hooks during instantiation.
+
+**Preparing training hooks in MMEngine**
+
+The MMEngine Runner configures the training hooks commonly used in MMCV as default hooks:
+
+- [RuntimeInfoHook](mmengine.hooks.RuntimeInfoHook)
+- [IterTimerHook](mmengine.hooks.IterTimerHook)
+- [DistSamplerSeedHook](mmengine.hooks.DistSamplerSeedHook)
+- [LoggerHook](mmengine.hooks.LoggerHook)
+- [CheckpointHook](mmengine.hooks.CheckpointHook)
+- [ParamSchedulerHook](mmengine.hooks.ParamSchedulerHook)
+
+Compared with the MMCV training hooks configured in the example above:
+
+- The `LrUpdaterHook` corresponds to the `ParamSchedulerHook` in MMEngine; their correspondence is detailed in the [scheduler migration document](./migrate_param_scheduler_from_mmcv.md)
+- MMEngine updates the parameters in the model's [train_step](mmengine.BaseModel.train_step), so the optimizer hook (`OptimizerHook`) is no longer needed
+- MMEngine comes with a `CheckpointHook`, whose default configuration can be used as-is
+- MMEngine comes with a `LoggerHook`, whose default configuration can be used as-is
+
+Therefore, we only need to configure the Runner's [parameter scheduler (param_scheduler)](../tutorials/param_scheduler.md) to achieve the same effect as `lr_config`.
+MMEngine also supports registering custom hooks; see the [Runner tutorial](../tutorials/runner.md#通过配置文件使用执行器) and the [hook migration document](./migrate_hook_from_mmcv.md) for details, and the sketch below for an example.
+
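+As a minimal sketch (`MyHook` is a made-up example, not an MMEngine built-in), registering a self-defined hook only takes a `custom_hooks` entry among the Runner's constructor arguments:
+
+```python
+from mmengine.hooks import Hook
+from mmengine.registry import HOOKS
+
+
+@HOOKS.register_module()
+class MyHook(Hook):
+
+    def before_train_epoch(self, runner):
+        # invoked by the Runner before each training epoch
+        print(f'starting epoch {runner.epoch}')
+
+
+runner = Runner(..., custom_hooks=[dict(type='MyHook')])
+```
+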
+<table class="docutils">
+<thead>
+  <tr>
+    <th>Commonly used training hooks in MMCV</th>
+    <th>Default hooks in MMEngine</th>
+  </tr>
+</thead>
+<tbody>
+<tr>
+  <td>
+
+```python
+# Training hooks are configured separately in MMCV
+# Configure the LrUpdaterHook, the counterpart of MMEngine's parameter scheduler
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8, 11])
+
+# Configure the OptimizerHook, which MMEngine does not need
+optimizer_config = dict(grad_clip=None)
+
+# Configure the LoggerHook
+log_config = dict(  # LoggerHook
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+
+# Configure the CheckpointHook
+checkpoint_config = dict(interval=1)  # CheckpointHook
+```
+
+</td>
+  <td>
+
+```python
+# Configure the parameter scheduler
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# MMEngine configures the default hooks in one place
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', interval=1),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='DetVisualizationHook'))
+```
+
+</td>
+</tr>
+</tbody>
+</table>
+
+```{note}
+MMEngine removes the `OptimizerHook`; the optimization step is performed in the model.
+```
+
+The `param_scheduler` used in this tutorial is:
+
+```python
+param_scheduler = dict(type='MultiStepLR', milestones=[2, 3], gamma=0.1)
+```
+
+### Prepare the validation module
+
+MMCV implements the validation flow with the `EvalHook`, which we do not expand on here due to limited space. MMEngine implements it through the [validation loop (ValLoop)](../tutorials/runner.md#自定义执行流程) and the [evaluator (Evaluator)](../tutorials/metric_and_evaluator.md). To run validation based on a custom metric, we need to define a `Metric` and register it in the `METRICS` registry:
+
+```python
+import torch
+from mmengine.evaluator import BaseMetric
+from mmengine.registry import METRICS
+
+
+@METRICS.register_module(force=True)
+class ToyAccuracyMetric(BaseMetric):
+
+    def process(self, label, pred) -> None:
+        self.results.append((label[1], pred, len(label[1])))
+
+    def compute_metrics(self, results: list) -> dict:
+        num_sample = 0
+        acc = 0
+        for label, pred, batch_size in results:
+            acc += (label == torch.stack(pred)).sum()
+            num_sample += batch_size
+        return dict(Accuracy=acc / num_sample)
+```
+
+After implementing the custom `Metric`, we also need to configure the evaluator and the [validation loop](../tutorials/runner.md#自定义执行流程) in the Runner's constructor arguments. The example configuration used in this tutorial is:
+
+```python
+val_evaluator = dict(type='ToyAccuracyMetric')
+val_cfg = dict(type='ValLoop')
+```
+
+<table class="docutils">
+<thead>
+  <tr>
+    <th>Configuring validation in MMCV</th>
+    <th>Configuring validation in MMEngine</th>
+  </tr>
+</thead>
+<tbody>
+<tr>
+  <td>
+
+```python
+eval_cfg = cfg.get('evaluation', {})
+eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
+eval_hook = DistEvalHook if distributed else EvalHook  # configure the EvalHook
+runner.register_hook(
+    eval_hook(val_dataloader, **eval_cfg), priority='LOW')  # register the EvalHook
+```
+
+</td>
+  <td>
+
+```python
+val_dataloader = val_dataloader  # configure the validation data
+val_evaluator = dict(type='ToyAccuracyMetric')  # configure the evaluator
+val_cfg = dict(type='ValLoop')  # configure the validation loop
+```
+
+</td>
+</tr>
+</tbody>
+</table>
+
+### Build the Runner
+
+**Building the Runner in MMCV**
+
+```python
+runner = EpochBasedRunner(
+    model=model,
+    optimizer=optimizer,
+    work_dir=work_dir,
+    logger=logger,
+    max_epochs=4
+)
+```
+
+**Building the Runner in MMEngine**
+
+The MMEngine Runner has a wider scope than the MMCV one and turns steps such as setting the random seed and launching distributed training into parameters. Besides the arguments mentioned in the previous sections, `EpochBasedRunner`, `max_epochs` and `val_interval`, which appeared in the example above, are now configured via `train_cfg`:
+
+- `by_epoch`: `True` corresponds to MMCV's `EpochBasedRunner`, `False` to the `IterBasedRunner`.
+- `max_epochs`/`max_iters`: same as the configuration of the MMCV Runner
+- `val_interval`: same as the `interval` argument of the `EvalHook`
+
+`train_cfg` is actually the set of constructor arguments of the training loop; when `by_epoch=True`, the `EpochBasedTrainLoop` is used.
+
+```python
+from mmengine.runner import Runner
+
+runner = Runner(
+    model=model,  # the model to be optimized
+    work_dir='./work_dir',  # the working directory
+    randomness=randomness,  # the random seed
+    env_cfg=env_cfg,  # the environment config
+    launcher='none',  # the launch method for distributed training
+    optim_wrapper=optim_wrapper,  # the optimizer wrapper
+    param_scheduler=param_scheduler,  # the learning rate scheduler
+    train_dataloader=train_dataloader,  # the training data
+    train_cfg=dict(by_epoch=True, max_epochs=4, val_interval=1),  # the training loop
+    val_dataloader=val_dataloader,  # the validation data
+    val_evaluator=val_evaluator,  # the evaluator
+    val_cfg=val_cfg)  # the validation loop
+```
+
+### Load checkpoints with the Runner
+
+**Loading checkpoints in MMCV**:
+
+Loading weights or resuming training is performed before training starts.
+
+```python
+if cfg.resume_from:
+    runner.resume(cfg.resume_from)
+elif cfg.load_from:
+    runner.load_checkpoint(cfg.load_from)
+```
+
+**Loading checkpoints in MMEngine**
+
+```python
+runner = Runner(
+    ...
+    load_from='/path/to/checkpoint',
+    resume=True
+)
+```
+
+<table class="docutils">
+<thead>
+  <tr>
+    <th></th>
+    <th>MMCV checkpoint-loading config</th>
+    <th>MMEngine checkpoint-loading config</th>
+  </tr>
+</thead>
+<tbody>
+<tr>
+  <td>Load a checkpoint</td>
+  <td>
+
+```python
+load_from = 'path/to/ckpt'
+```
+
+</td>
+  <td>
+
+```python
+load_from = 'path/to/ckpt'
+resume = False
+```
+
+</td>
+</tr>
+<tr>
+  <td>Resume training</td>
+  <td>
+
+```python
+resume_from = 'path/to/ckpt'
+```
+
+</td>
+  <td>
+
+```python
+load_from = 'path/to/ckpt'
+resume = True
+```
+
+</td>
+</tr>
+</tbody>
+</table>
+
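+Note that if `load_from` is left as `None` while `resume=True`, the MMEngine Runner looks for the latest checkpoint under `work_dir` automatically, which replaces the `auto_resume`/`find_latest_checkpoint` logic in the MMCV launch script shown earlier. A minimal sketch (the other Runner arguments are elided):
+
+```python
+runner = Runner(
+    ...,
+    load_from=None,  # no explicit checkpoint path is given
+    resume=True,  # resume from the latest checkpoint in work_dir, if any
+)
+```
+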
+### Train with the Runner
+
+**Training flow of the MMCV Runner**:
+
+Load weights or resume training before training, then call `runner.run` with the training data.
+
+```python
+if cfg.resume_from:
+    runner.resume(cfg.resume_from)
+elif cfg.load_from:
+    runner.load_checkpoint(cfg.load_from)
+runner.run(data_loaders, cfg.workflow)
+```
+
+**Training flow of the MMEngine Runner**
+
+Loading weights and resuming training are configured when the Runner is built.
+
+Since the MMEngine Runner already receives the training data during construction, `runner.train()` is called without any arguments.
+
+```python
+runner.train()
+```
+
+### Test with the Runner
+
+The MMCV Runner has no test functionality, so a test script has to be implemented separately. For the MMEngine Runner, we only need to configure `test_dataloader`, `test_cfg` and `test_evaluator` when building it, and then call `runner.test()` to run the test flow.
+
+**If `work_dir` is the same as during training, there is no need to load the checkpoint manually:**
+
+```python
+runner = Runner(
+    model=model,
+    work_dir='./work_dir',
+    randomness=randomness,
+    env_cfg=env_cfg,
+    launcher='none',  # distributed training is not enabled
+    optim_wrapper=optim_wrapper,
+    train_dataloader=train_dataloader,
+    train_cfg=dict(by_epoch=True, max_epochs=5, val_interval=1),
+    val_dataloader=val_dataloader,
+    val_evaluator=val_evaluator,
+    val_cfg=val_cfg,
+    test_dataloader=val_dataloader,  # assume testing uses the same data and evaluator as validation
+    test_evaluator=val_evaluator,
+    test_cfg=dict(type='TestLoop'),
+)
+runner.test()
+```
+
+**If `work_dir` differs from the one used during training, `load_from` has to be specified additionally:**
+
+```python
+runner = Runner(
+    model=model,
+    work_dir='./test_work_dir',
+    load_from='./work_dir/epoch_5.pth',  # work_dir differs, so set load_from to load the desired model
+    randomness=randomness,
+    env_cfg=env_cfg,
+    launcher='none',
+    optim_wrapper=optim_wrapper,
+    train_dataloader=train_dataloader,
+    train_cfg=dict(by_epoch=True, max_epochs=5, val_interval=1),
+    val_dataloader=val_dataloader,
+    val_evaluator=val_evaluator,
+    val_cfg=val_cfg,
+    test_dataloader=val_dataloader,
+    test_evaluator=val_evaluator,
+    test_cfg=dict(type='TestLoop'),
+)
+runner.test()
+```
+
+## Migrating a custom execution flow
+
+With the MMCV Runner, we may override `runner.train`/`runner.val` or `runner.run_iter` to implement a custom training or test flow. Taking overriding `runner.train` as an example, suppose we want to train every batch of images twice; we can override the MMCV Runner like this:
+
+```python
+class CustomRunner(EpochBasedRunner):
+
+    def train(self, data_loader, **kwargs):
+        self.model.train()
+        self.mode = 'train'
+        self.data_loader = data_loader
+        self._max_iters = self._max_epochs * len(self.data_loader)
+        self.call_hook('before_train_epoch')
+        time.sleep(2)  # Prevent possible deadlock during epoch transition
+        for i, data_batch in enumerate(self.data_loader):
+            self.data_batch = data_batch
+            self._inner_iter = i
+            for _ in range(2):
+                self.call_hook('before_train_iter')
+                self.run_iter(data_batch, train_mode=True, **kwargs)
+                self.call_hook('after_train_iter')
+            del self.data_batch
+            self._iter += 1
+
+        self.call_hook('after_train_epoch')
+        self._epoch += 1
+```
+
+In MMEngine, to implement the feature above, we need to override a new training loop:
+
+```python
+from mmengine.registry import LOOPS
+from mmengine.runner import EpochBasedTrainLoop
+
+
+@LOOPS.register_module()
+class CustomEpochBasedTrainLoop(EpochBasedTrainLoop):
+
+    def run_iter(self, idx, data_batch) -> None:
+        for _ in range(2):
+            super().run_iter(idx, data_batch)
+```
+
+When building the Runner, set the `type` of `train_cfg` to `CustomEpochBasedTrainLoop`. Note that `by_epoch` and `type` cannot be configured at the same time; when `by_epoch` is configured, the type of the training loop is inferred to be `EpochBasedTrainLoop`.
+
+```python
+runner = Runner(
+    model=model,
+    work_dir='./test_work_dir',
+    randomness=randomness,
+    env_cfg=env_cfg,
+    launcher='none',
+    optim_wrapper=dict(optimizer=dict(type='SGD', lr=0.001, momentum=0.9)),
+    train_dataloader=train_dataloader,
+    train_cfg=dict(
+        type='CustomEpochBasedTrainLoop',
+        max_epochs=5,
+        val_interval=1),
+    val_dataloader=val_dataloader,
+    val_evaluator=val_evaluator,
+    val_cfg=val_cfg,
+    test_dataloader=val_dataloader,
+    test_evaluator=val_evaluator,
+    test_cfg=dict(type='TestLoop'),
+)
+runner.train()
+```
+
+For more complicated Runner migration needs, refer to the [Runner tutorial](../tutorials/runner.md) and the [Runner design document](../design/runner.md).