# Migrating Models from MMCV to MMEngine

## Introduction

The computer vision tasks supported by early MMCV, such as object detection and image classification, all follow a typical parameter optimization flow that can be summarized in four steps:

1. Compute the loss
2. Compute the gradients
3. Update the parameters
4. Zero the gradients

A key characteristic of this flow is that it is invoked at a fixed place (after each training iteration) and executes fixed steps in a fixed order (1 -> 2 -> 3 -> 4). This matches the design principles of [hooks](../design/hook.md) very well, so such tasks usually rely on a `Hook` to optimize the model. MMCV implements a series of such hooks, e.g. `OptimizerHook` (single-precision training), `Fp16OptimizerHook` (mixed-precision training) and `GradientCumulativeFp16OptimizerHook` (mixed-precision training + gradient accumulation), providing various optimization strategies for these tasks.

Algorithms in areas such as generative adversarial networks (GAN) and self-supervised learning usually require more flexible training flows. These flows do not satisfy the "fixed place, fixed steps" principle, which makes it hard to optimize the parameters with a `Hook`. To support such tasks, the MMCV runner passes an additional `optimizer` argument when calling `model.train_step`, so that the model can implement a custom optimization flow inside `train_step`. Although this makes such tasks trainable, it also means the various `OptimizerHook`s can no longer be used, and the algorithm has to implement mixed-precision training, gradient accumulation and other strategies inside `train_step` itself.

To unify the parameter optimization flow across deep learning tasks, MMEngine designed the [optimizer wrapper](mmengine.optim.OptimWrapper), which integrates mixed-precision training, gradient accumulation and other strategies. With MMEngine, all deep learning tasks perform parameter optimization inside `model.train_step`.

## Migrating the Optimization Flow

### The common parameter update flow

Since deep learning tasks such as object detection and image classification share essentially the same parameter optimization flow, the migration can be done by inheriting from the [model base class](../tutorials/model.md).

**Model based on the MMCV runner**

Before introducing how to migrate the model, let us look at a minimal example of training a model with the MMCV runner:

```python
import torch
import torch.nn as nn
from torch.optim import SGD
from torch.utils.data import DataLoader

from mmcv.runner import Runner
from mmcv.utils.logging import get_logger

train_dataset = [(torch.ones(1, 1), torch.ones(1, 1))] * 50
train_dataloader = DataLoader(train_dataset, batch_size=2)


class MMCVToyModel(nn.Module):

    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, img, label, return_loss=False):
        feat = self.linear(img)
        loss1 = (feat - label).pow(2)
        loss2 = (feat - label).abs()
        loss = (loss1 + loss2).sum()
        return dict(loss=loss,
                    num_samples=len(img),
                    log_vars=dict(
                        loss1=loss1.sum().item(),
                        loss2=loss2.sum().item()))

    def train_step(self, data, optimizer=None):
        return self(*data, return_loss=True)

    def val_step(self, data, optimizer=None):
        return self(*data, return_loss=False)


model = MMCVToyModel()
optimizer = SGD(model.parameters(), lr=0.01)
logger = get_logger('demo')

lr_config = dict(policy='step', step=[2, 3])
optimizer_config = dict(grad_clip=None)
log_config = dict(interval=10, hooks=[dict(type='TextLoggerHook')])

runner = Runner(
    model=model,
    work_dir='tmp_dir',
    optimizer=optimizer,
    logger=logger,
    max_epochs=5)
runner.register_training_hooks(
    lr_config=lr_config,
    optimizer_config=optimizer_config,
    log_config=log_config)
runner.run([train_dataloader], [('train', 1)])
```

When training a model with the MMCV runner, we must implement the `train_step` interface and return a dictionary with the following three fields:

- loss: passed to the `OptimizerHook` to compute the gradients
- num_samples: passed to the `LogBuffer` to compute the smoothed loss
- log_vars: passed to the `LogBuffer` to compute the smoothed losses

**Model based on the MMEngine runner**

The same logic implemented with the MMEngine runner:

```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from mmengine.runner import Runner
from mmengine.model import BaseModel

train_dataset = [(torch.ones(1, 1), torch.ones(1, 1))] * 50
train_dataloader = DataLoader(train_dataset, batch_size=2)


class MMEngineToyModel(BaseModel):

    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, img, label, mode):
        feat = self.linear(img)
        # Called by `train_step`; returns the loss dict used to update the parameters
        if mode == 'loss':
            loss1 = (feat - label).pow(2)
            loss2 = (feat - label).abs()
            return dict(loss1=loss1, loss2=loss2)
        # Called by `val_step`; returns the predictions passed to the `evaluator`
        elif mode == 'predict':
            return [_feat for _feat in feat]
        # tensor mode; see the model tutorial for details: tutorials/model.md
        else:
            pass


runner = Runner(
    model=MMEngineToyModel(),
    work_dir='tmp_dir',
    train_dataloader=train_dataloader,
    train_cfg=dict(by_epoch=True, max_epochs=5),
    optim_wrapper=dict(optimizer=dict(type='SGD', lr=0.01)))
runner.train()
```

MMEngine implements a model base class whose `train_step` already performs the optimization flow of `OptimizerHook`. Therefore, in the example above we do not need to implement `train_step`; the base class's `train_step` is called directly at runtime.
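Before comparing the two implementations, it may help to see how the four steps listed in the introduction collapse into a single call on the optimizer wrapper. Below is a minimal, self-contained sketch (a toy linear model with SGD, independent of the runner); `update_params` performs the backward, step and zero-grad steps internally:

```python
import torch
import torch.nn as nn
from torch.optim import SGD

from mmengine.optim import OptimWrapper

model = nn.Linear(1, 1)
optim_wrapper = OptimWrapper(optimizer=SGD(model.parameters(), lr=0.01))

img, label = torch.ones(1, 1), torch.ones(1, 1)
loss = (model(img) - label).pow(2).sum()  # 1. compute the loss
# 2-4. backward, parameter update and gradient zeroing in one call
optim_wrapper.update_params(loss)
```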
The core of the two model implementations is contrasted below.

**MMCV model**
```python
class MMCVToyModel(nn.Module):

    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, img, label, return_loss=False):
        feat = self.linear(img)
        loss1 = (feat - label).pow(2)
        loss2 = (feat - label).abs()
        loss = (loss1 + loss2).sum()
        return dict(loss=loss,
                    num_samples=len(img),
                    log_vars=dict(
                        loss1=loss1.sum().item(),
                        loss2=loss2.sum().item()))

    def train_step(self, data, optimizer=None):
        return self(*data, return_loss=True)

    def val_step(self, data, optimizer=None):
        return self(*data, return_loss=False)
```
**MMEngine model**
```python
class MMEngineToyModel(BaseModel):

    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, img, label, mode):
        # `feat` is needed by both the loss and the predict branches
        feat = self.linear(img)
        if mode == 'loss':
            loss1 = (feat - label).pow(2)
            loss2 = (feat - label).abs()
            return dict(loss1=loss1, loss2=loss2)
        elif mode == 'predict':
            return [_feat for _feat in feat]
        else:
            pass

    # Equivalent code of the base class's `train_step`
    # def train_step(self, data, optim_wrapper):
    #     data = self.data_preprocessor(data)
    #     loss_dict = self(*data, mode='loss')
    #     loss_dict['loss1'] = loss_dict['loss1'].sum()
    #     loss_dict['loss2'] = loss_dict['loss2'].sum()
    #     loss = (loss_dict['loss1'] + loss_dict['loss2']).sum()
    #     # Update the model parameters via the optimizer wrapper
    #     optim_wrapper.update_params(loss)
    #     return loss_dict
```
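Because the optimization strategies now live in the optimizer wrapper rather than in hooks, mixed-precision training and gradient accumulation can be enabled purely through the `optim_wrapper` config. A minimal sketch reusing the toy example above (`AmpOptimWrapper` requires a CUDA device, and `accumulative_counts=2` accumulates gradients over two iterations):

```python
runner = Runner(
    model=MMEngineToyModel(),
    work_dir='tmp_dir',
    train_dataloader=train_dataloader,
    train_cfg=dict(by_epoch=True, max_epochs=5),
    # Counterpart of `Fp16OptimizerHook` + `GradientCumulativeFp16OptimizerHook`
    optim_wrapper=dict(
        type='AmpOptimWrapper',
        optimizer=dict(type='SGD', lr=0.01),
        accumulative_counts=2))
runner.train()
```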
### Custom parameter update flows

For algorithms such as GANs, where the generator and the discriminator are updated in turn, the custom optimization logic written against the raw `optimizer` in MMCV maps onto the optimizer wrapper in MMEngine:

**Training a GAN in MMCV**
```python
def train_discriminator(self, inputs, optimizer):
    real_imgs = inputs['inputs']
    z = torch.randn(
        (real_imgs.shape[0], self.noise_size)).type_as(real_imgs)
    with torch.no_grad():
        fake_imgs = self.generator(z)

    disc_pred_fake = self.discriminator(fake_imgs)
    disc_pred_real = self.discriminator(real_imgs)

    parsed_losses, log_vars = self.disc_loss(disc_pred_fake,
                                             disc_pred_real)
    # Manual optimization: backward, step and zero_grad by hand
    parsed_losses.backward()
    optimizer.step()
    optimizer.zero_grad()
    return log_vars


def train_generator(self, inputs, optimizer):
    real_imgs = inputs['inputs']
    z = torch.randn(
        real_imgs.shape[0], self.noise_size).type_as(real_imgs)

    fake_imgs = self.generator(z)

    disc_pred_fake = self.discriminator(fake_imgs)
    parsed_loss, log_vars = self.gen_loss(disc_pred_fake)

    parsed_loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return log_vars
```
**Training a GAN in MMEngine**
```python
def train_discriminator(self, inputs, optimizer_wrapper):
    real_imgs = inputs['inputs']
    z = torch.randn(
        (real_imgs.shape[0], self.noise_size)).type_as(real_imgs)
    with torch.no_grad():
        fake_imgs = self.generator(z)

    disc_pred_fake = self.discriminator(fake_imgs)
    disc_pred_real = self.discriminator(real_imgs)

    parsed_losses, log_vars = self.disc_loss(disc_pred_fake,
                                             disc_pred_real)
    # The optimizer wrapper performs backward, step and zero_grad internally
    optimizer_wrapper.update_params(parsed_losses)
    return log_vars


def train_generator(self, inputs, optimizer_wrapper):
    real_imgs = inputs['inputs']
    z = torch.randn(real_imgs.shape[0], self.noise_size).type_as(real_imgs)

    fake_imgs = self.generator(z)

    disc_pred_fake = self.discriminator(fake_imgs)
    parsed_loss, log_vars = self.gen_loss(disc_pred_fake)

    optimizer_wrapper.update_params(parsed_loss)
    return log_vars
```
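With both stages expressed against the optimizer wrapper, the remaining question is where the two wrappers come from. In MMEngine, multiple optimizers are typically packed into an `OptimWrapperDict`, and a custom `train_step` hands each sub-wrapper to the corresponding stage. A minimal sketch (the `generator`/`discriminator` modules here are hypothetical stand-ins for a real GAN):

```python
import torch.nn as nn
from torch.optim import SGD

from mmengine.optim import OptimWrapper, OptimWrapperDict

# Hypothetical stand-ins for a real generator/discriminator pair
generator = nn.Linear(4, 4)
discriminator = nn.Linear(4, 1)

optim_wrapper = OptimWrapperDict(
    generator=OptimWrapper(SGD(generator.parameters(), lr=0.01)),
    discriminator=OptimWrapper(SGD(discriminator.parameters(), lr=0.01)))
```

Inside a custom `train_step`, `optim_wrapper['discriminator']` would then be passed to `train_discriminator` and `optim_wrapper['generator']` to `train_generator`.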
### Building the model for distributed training

In MMCV, the model has to be wrapped with `MMDistributedDataParallel` manually before being passed to the runner, whereas the MMEngine runner wraps the model automatically once distributed training is enabled:

**Distributed training in MMCV**
```python
model = MMDistributedDataParallel(
    model,
    device_ids=[int(os.environ['LOCAL_RANK'])],
    broadcast_buffers=False,
    find_unused_parameters=find_unused_parameters)
...
runner = Runner(model=model, ...)
```
**Distributed training in MMEngine**
```python
runner = Runner(
    model=model,
    launcher='pytorch',  # enable distributed training
    ...,  # other arguments
)
```
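For completeness, here is a runnable sketch of the MMEngine side that reuses the toy model from above. Passing DDP arguments such as `find_unused_parameters` is assumed here to go through the `model_wrapper_cfg` entry of the runner's `cfg`, which the runner applies when it wraps the model:

```python
runner = Runner(
    model=MMEngineToyModel(),
    work_dir='tmp_dir',
    train_dataloader=train_dataloader,
    train_cfg=dict(by_epoch=True, max_epochs=5),
    optim_wrapper=dict(optimizer=dict(type='SGD', lr=0.01)),
    launcher='pytorch',  # enable distributed training
    # Assumed pathway for customizing the DDP wrapper
    cfg=dict(
        model_wrapper_cfg=dict(
            type='MMDistributedDataParallel',
            broadcast_buffers=False,
            find_unused_parameters=True)))
runner.train()
```

Launched with e.g. `torchrun --nproc-per-node=2 train.py`, the runner wraps the model with `MMDistributedDataParallel` itself, so no manual wrapping is needed.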