diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py
index 7ab29d8d2..71a7e182a 100644
--- a/ppcls/engine/engine.py
+++ b/ppcls/engine/engine.py
@@ -214,16 +214,19 @@ class Engine(object):
         if self.config["Global"]["pretrained_model"] is not None:
             if self.config["Global"]["pretrained_model"].startswith("http"):
                 load_dygraph_pretrain_from_url(
-                    self.model, self.config["Global"]["pretrained_model"])
+                    [self.model, getattr(self, 'train_loss_func', None)],
+                    self.config["Global"]["pretrained_model"])
             else:
                 load_dygraph_pretrain(
-                    self.model, self.config["Global"]["pretrained_model"])
+                    [self.model, getattr(self, 'train_loss_func', None)],
+                    self.config["Global"]["pretrained_model"])
 
         # build optimizer
         if self.mode == 'train':
             self.optimizer, self.lr_sch = build_optimizer(
-                self.config["Optimizer"], self.config["Global"]["epochs"],
-                len(self.train_dataloader), [self.model])
+                self.config, self.config["Global"]["epochs"],
+                len(self.train_dataloader),
+                [self.model, self.train_loss_func])
 
         # for amp training
         if self.amp:
@@ -241,6 +244,11 @@ class Engine(object):
                 optimizers=self.optimizer,
                 level=amp_level,
                 save_dtype='float32')
+            if len(self.train_loss_func.parameters()) > 0:
+                self.train_loss_func = paddle.amp.decorate(
+                    models=self.train_loss_func,
+                    level=amp_level,
+                    save_dtype='float32')
 
         # for distributed
         world_size = dist.get_world_size()
@@ -251,7 +259,10 @@ class Engine(object):
         if self.config["Global"]["distributed"]:
             dist.init_parallel_env()
             self.model = paddle.DataParallel(self.model)
-
+            if self.mode == 'train' and len(
+                    self.train_loss_func.parameters()) > 0:
+                self.train_loss_func = paddle.DataParallel(
+                    self.train_loss_func)
         # build postprocess for infer
         if self.mode == 'infer':
             self.preprocess_func = create_operators(self.config["Infer"][
@@ -279,9 +290,9 @@ class Engine(object):
         # global iter counter
         self.global_step = 0
 
-        if self.config["Global"]["checkpoints"] is not None:
-            metric_info = init_model(self.config["Global"], self.model,
-                                     self.optimizer)
+        if self.config.Global.checkpoints is not None:
+            metric_info = init_model(self.config.Global, self.model,
+                                     self.optimizer, self.train_loss_func)
             if metric_info is not None:
                 best_metric.update(metric_info)
 
@@ -317,7 +328,8 @@ class Engine(object):
                     best_metric,
                     self.output_dir,
                     model_name=self.config["Arch"]["name"],
-                    prefix="best_model")
+                    prefix="best_model",
+                    loss=self.train_loss_func)
                 logger.info("[Eval][Epoch {}][best metric: {}]".format(
                     epoch_id, best_metric["metric"]))
                 logger.scaler(
@@ -336,7 +348,8 @@ class Engine(object):
                      "epoch": epoch_id},
                     self.output_dir,
                     model_name=self.config["Arch"]["name"],
-                    prefix="epoch_{}".format(epoch_id))
+                    prefix="epoch_{}".format(epoch_id),
+                    loss=self.train_loss_func)
             # save the latest model
             save_load.save_model(
                 self.model,
@@ -344,7 +357,8 @@ class Engine(object):
                  "epoch": epoch_id},
                 self.output_dir,
                 model_name=self.config["Arch"]["name"],
-                prefix="latest")
+                prefix="latest",
+                loss=self.train_loss_func)
 
         if self.vdl_writer is not None:
             self.vdl_writer.close()
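Note how every wrapper above is gated on `len(self.train_loss_func.parameters()) > 0`: only a loss that owns trainable parameters needs the same `paddle.amp.decorate` and `paddle.DataParallel` treatment as the model. A minimal sketch of such a loss, a simplified stand-in for CenterLoss rather than the repo's real implementation:

```python
import paddle
import paddle.nn as nn


class CenterLossSketch(nn.Layer):
    """Toy loss with learnable state, so parameters() is non-empty."""

    def __init__(self, num_classes=10, feat_dim=64):
        super().__init__()
        # the learnable class centers are what the extra optimizer scope trains
        self.centers = self.create_parameter(shape=[num_classes, feat_dim])

    def forward(self, feats, labels):
        centers_batch = paddle.gather(self.centers, labels)
        return {"loss": ((feats - centers_batch)**2).sum(axis=1).mean()}


loss_fn = CenterLossSketch()
print(len(loss_fn.parameters()) > 0)  # True -> engine wraps it like the model
```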
diff --git a/ppcls/engine/train/train.py b/ppcls/engine/train/train.py
index b15c1088a..1e944a609 100644
--- a/ppcls/engine/train/train.py
+++ b/ppcls/engine/train/train.py
@@ -53,16 +53,22 @@ def train_epoch(engine, epoch_id, print_batch_step):
             out = forward(engine, batch)
             loss_dict = engine.train_loss_func(out, batch[1])
 
-        # step opt and lr
+        # step opt
         if engine.amp:
             scaled = engine.scaler.scale(loss_dict["loss"])
             scaled.backward()
-            engine.scaler.minimize(engine.optimizer, scaled)
+            for i in range(len(engine.optimizer)):
+                engine.scaler.minimize(engine.optimizer[i], scaled)
         else:
             loss_dict["loss"].backward()
-            engine.optimizer.step()
-        engine.optimizer.clear_grad()
-        engine.lr_sch.step()
+            for i in range(len(engine.optimizer)):
+                engine.optimizer[i].step()
+        # clear grad
+        for i in range(len(engine.optimizer)):
+            engine.optimizer[i].clear_grad()
+        # step lr
+        for i in range(len(engine.lr_sch)):
+            engine.lr_sch[i].step()
 
         # below code just for logging
         # update metric_for_logger
diff --git a/ppcls/engine/train/utils.py b/ppcls/engine/train/utils.py
index 339591859..7f64104da 100644
--- a/ppcls/engine/train/utils.py
+++ b/ppcls/engine/train/utils.py
@@ -38,7 +38,10 @@ def update_loss(trainer, loss_dict, batch_size):
 
 
 def log_info(trainer, batch_size, epoch_id, iter_id):
-    lr_msg = "lr: {:.5f}".format(trainer.lr_sch.get_lr())
+    lr_msg = ", ".join([
+        "lr_{}: {:.8f}".format(i + 1, lr.get_lr())
+        for i, lr in enumerate(trainer.lr_sch)
+    ])
     metric_msg = ", ".join([
         "{}: {:.5f}".format(key, trainer.output_info[key].avg)
         for key in trainer.output_info
@@ -59,11 +62,12 @@ def log_info(trainer, batch_size, epoch_id, iter_id):
                    len(trainer.train_dataloader), lr_msg, metric_msg,
                    time_msg, ips_msg, eta_msg))
 
-    logger.scaler(
-        name="lr",
-        value=trainer.lr_sch.get_lr(),
-        step=trainer.global_step,
-        writer=trainer.vdl_writer)
+    for i, lr in enumerate(trainer.lr_sch):
+        logger.scaler(
+            name="lr_{}".format(i + 1),
+            value=lr.get_lr(),
+            step=trainer.global_step,
+            writer=trainer.vdl_writer)
     for key in trainer.output_info:
         logger.scaler(
             name="train_{}".format(key),
diff --git a/ppcls/loss/__init__.py b/ppcls/loss/__init__.py
index 8513438e8..c3281b0e5 100644
--- a/ppcls/loss/__init__.py
+++ b/ppcls/loss/__init__.py
@@ -47,6 +47,7 @@ class CombinedLoss(nn.Layer):
                 param.keys())
             self.loss_weight.append(param.pop("weight"))
             self.loss_func.append(eval(name)(**param))
+        self.loss_func = nn.LayerList(self.loss_func)
 
     def __call__(self, input, batch):
         loss_dict = {}
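The one-line change to `CombinedLoss` is load-bearing: wrapping the losses in `nn.LayerList` registers them as sublayers, so their parameters become visible to `parameters()`, `state_dict()` and `sublayers()` — which the scoped optimizer matching and the checkpoint merging below depend on. A quick illustration with a hypothetical container (not from the patch):

```python
import paddle.nn as nn


class Holder(nn.Layer):
    def __init__(self, register):
        super().__init__()
        funcs = [nn.Linear(4, 4)]
        # nn.LayerList registers sublayers; a plain Python list hides them
        self.loss_func = nn.LayerList(funcs) if register else funcs


print(len(Holder(register=False).parameters()))  # 0 -- invisible to paddle
print(len(Holder(register=True).parameters()))   # 2 -- weight and bias tracked
```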
diff --git a/ppcls/optimizer/__init__.py b/ppcls/optimizer/__init__.py
index 61db39f89..a440eac46 100644
--- a/ppcls/optimizer/__init__.py
+++ b/ppcls/optimizer/__init__.py
@@ -18,6 +18,7 @@ from __future__ import print_function
 
 import copy
 import paddle
+from typing import Dict, List
 
 from ppcls.utils import logger
 
@@ -44,29 +45,78 @@ def build_lr_scheduler(lr_config, epochs, step_each_epoch):
 # model_list is None in static graph
 def build_optimizer(config, epochs, step_each_epoch, model_list=None):
     config = copy.deepcopy(config)
-    # step1 build lr
-    lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch)
-    logger.debug("build lr ({}) success..".format(lr))
-    # step2 build regularization
-    if 'regularizer' in config and config['regularizer'] is not None:
-        if 'weight_decay' in config:
-            logger.warning(
-                "ConfigError: Only one of regularizer and weight_decay can be set in Optimizer Config. \"weight_decay\" has been ignored."
-            )
-        reg_config = config.pop('regularizer')
-        reg_name = reg_config.pop('name') + 'Decay'
-        reg = getattr(paddle.regularizer, reg_name)(**reg_config)
-        config["weight_decay"] = reg
-        logger.debug("build regularizer ({}) success..".format(reg))
-    # step3 build optimizer
-    optim_name = config.pop('name')
-    if 'clip_norm' in config:
-        clip_norm = config.pop('clip_norm')
-        grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
-    else:
-        grad_clip = None
-    optim = getattr(optimizer, optim_name)(learning_rate=lr,
-                                           grad_clip=grad_clip,
-                                           **config)(model_list=model_list)
-    logger.debug("build optimizer ({}) success..".format(optim))
-    return optim, lr
+    optim_config = config["Optimizer"]
+    if isinstance(optim_config, dict):
+        # convert {'name': xxx, **optim_cfg} to [{name: {scope: xxx, **optim_cfg}}]
+        optim_name = optim_config.pop("name")
+        optim_config: List[Dict[str, Dict]] = [{
+            optim_name: {
+                'scope': "all",
+                **optim_config
+            }
+        }]
+    optim_list = []
+    lr_list = []
+    """NOTE:
+    Currently only the optimizer scopes below are supported:
+    1. a single optimizer config applied to all parameters;
+    2. a sub-module one level under Arch, such as Arch.backbone, Arch.neck, Arch.head;
+    3. a loss which has trainable parameters, such as CenterLoss.
+    """
+    for optim_item in optim_config:
+        # optim_cfg = {optim_name: {scope: xxx, **optim_cfg}}
+        # step1 build lr
+        optim_name = list(optim_item.keys())[0]  # get optim_name
+        optim_scope = optim_item[optim_name].pop('scope')  # get optim_scope
+        optim_cfg = optim_item[optim_name]  # get optim_cfg
+
+        lr = build_lr_scheduler(optim_cfg.pop('lr'), epochs, step_each_epoch)
+        logger.debug("build lr ({}) for scope ({}) success..".format(
+            lr, optim_scope))
+        # step2 build regularization
+        if 'regularizer' in optim_cfg and optim_cfg['regularizer'] is not None:
+            if 'weight_decay' in optim_cfg:
+                logger.warning(
+                    "ConfigError: Only one of regularizer and weight_decay can be set in Optimizer Config. \"weight_decay\" has been ignored."
+                )
+            reg_config = optim_cfg.pop('regularizer')
+            reg_name = reg_config.pop('name') + 'Decay'
+            reg = getattr(paddle.regularizer, reg_name)(**reg_config)
+            optim_cfg["weight_decay"] = reg
+            logger.debug("build regularizer ({}) for scope ({}) success..".
+                         format(reg, optim_scope))
+        # step3 build optimizer
+        if 'clip_norm' in optim_cfg:
+            clip_norm = optim_cfg.pop('clip_norm')
+            grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
+        else:
+            grad_clip = None
+        optim_model = []
+        for i in range(len(model_list)):
+            if len(model_list[i].parameters()) == 0:
+                continue
+            if optim_scope == "all":
+                # optimizer for all parameters
+                optim_model.append(model_list[i])
+            else:
+                if optim_scope.endswith("Loss"):
+                    # optimizer for a loss layer, matched by class name
+                    for m in model_list[i].sublayers(True):
+                        if m.__class__.__name__ == optim_scope:
+                            optim_model.append(m)
+                else:
+                    # optimizer for a module of the model, such as backbone, neck, head...
+                    if hasattr(model_list[i], optim_scope):
+                        optim_model.append(getattr(model_list[i], optim_scope))
+
+        assert len(optim_model) == 1, \
+            "Invalid optim model for optim scope({}), number of optim_model={}".format(optim_scope, len(optim_model))
+        optim = getattr(optimizer, optim_name)(
+            learning_rate=lr, grad_clip=grad_clip,
+            **optim_cfg)(model_list=optim_model)
+        logger.debug("build optimizer ({}) for scope ({}) success..".format(
+            optim, optim_scope))
+        optim_list.append(optim)
+        lr_list.append(lr)
+    return optim_list, lr_list
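For reference, the list form that `build_optimizer` now consumes holds one entry per scope; in a YAML config this corresponds to a list under `Optimizer`. A hypothetical sketch of the converted structure (optimizer names, scopes and lr settings are illustrative, not taken from a shipped config):

```python
# what build_optimizer receives after the dict -> list conversion above
optim_config = [
    {
        "Momentum": {
            "scope": "head",  # a sub-module one level under Arch
            "lr": {"name": "Cosine", "learning_rate": 0.04},
            "momentum": 0.9,
        }
    },
    {
        "SGD": {
            "scope": "CenterLoss",  # a loss layer with trainable parameters
            "lr": {"name": "Constant", "learning_rate": 0.5},
        }
    },
]
```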
diff --git a/ppcls/utils/save_load.py b/ppcls/utils/save_load.py
index 625a28483..093255379 100644
--- a/ppcls/utils/save_load.py
+++ b/ppcls/utils/save_load.py
@@ -18,9 +18,6 @@ from __future__ import print_function
 
 import errno
 import os
-import re
-import shutil
-import tempfile
 
 import paddle
 from ppcls.utils import logger
@@ -47,10 +44,15 @@ def _mkdir_if_not_exist(path):
 
 def load_dygraph_pretrain(model, path=None):
     if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')):
-        raise ValueError("Model pretrain path {} does not "
-                         "exists.".format(path))
+        raise ValueError("Model pretrain path {}.pdparams does not "
+                         "exist.".format(path))
     param_state_dict = paddle.load(path + ".pdparams")
-    model.set_dict(param_state_dict)
+    if isinstance(model, list):
+        for m in model:
+            if hasattr(m, 'set_dict'):
+                m.set_dict(param_state_dict)
+    else:
+        model.set_dict(param_state_dict)
     return
 
 
@@ -85,7 +87,7 @@ def load_distillation_model(model, pretrained_model):
                                                            pretrained_model))
 
 
-def init_model(config, net, optimizer=None):
+def init_model(config, net, optimizer=None, loss: paddle.nn.Layer=None):
     """
     load model from checkpoint or pretrained_model
     """
@@ -95,11 +97,15 @@ def init_model(config, net, optimizer=None):
             "Given dir {}.pdparams not exist.".format(checkpoints)
         assert os.path.exists(checkpoints + ".pdopt"), \
             "Given dir {}.pdopt not exist.".format(checkpoints)
-        para_dict = paddle.load(checkpoints + ".pdparams")
+        # load state dict
        opti_dict = paddle.load(checkpoints + ".pdopt")
+        para_dict = paddle.load(checkpoints + ".pdparams")
         metric_dict = paddle.load(checkpoints + ".pdstates")
-        net.set_dict(para_dict)
-        optimizer.set_state_dict(opti_dict)
+        # set state dict
+        net.set_state_dict(para_dict)
+        loss.set_state_dict(para_dict)
+        for i in range(len(optimizer)):
+            optimizer[i].set_state_dict(opti_dict)
         logger.info("Finish load checkpoints from {}".format(checkpoints))
         return metric_dict
 
@@ -120,7 +126,8 @@ def save_model(net,
                metric_info,
                model_path,
                model_name="",
-               prefix='ppcls'):
+               prefix='ppcls',
+               loss: paddle.nn.Layer=None):
     """
     save model to the target path
     """
@@ -130,7 +137,14 @@ def save_model(net,
     _mkdir_if_not_exist(model_path)
     model_path = os.path.join(model_path, prefix)
 
-    paddle.save(net.state_dict(), model_path + ".pdparams")
-    paddle.save(optimizer.state_dict(), model_path + ".pdopt")
+    params_state_dict = net.state_dict()
+    loss_state_dict = loss.state_dict()
+    keys_inter = set(params_state_dict.keys()) & set(loss_state_dict.keys())
+    assert len(keys_inter) == 0, \
+        f"keys in model and loss state_dict must be unique, but got intersection {keys_inter}"
+    params_state_dict.update(loss_state_dict)
+
+    paddle.save(params_state_dict, model_path + ".pdparams")
+    paddle.save([opt.state_dict() for opt in optimizer], model_path + ".pdopt")
     paddle.save(metric_info, model_path + ".pdstates")
     logger.info("Already save model in {}".format(model_path))
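After this change a checkpoint prefix maps to three files: `.pdparams` holds a single merged dict (model plus loss parameters, kept disjoint by the assert above), `.pdopt` holds a list of optimizer state dicts in build order, and `.pdstates` holds the metric info. A minimal round-trip sketch with hypothetical layers; it leans on `set_state_dict` matching keys by name and only warning about entries a layer does not own, which is what `init_model` relies on when it feeds the same merged dict to both `net` and `loss`:

```python
import paddle
import paddle.nn as nn


class NetSketch(nn.Layer):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(4, 2)  # keys: backbone.weight, backbone.bias


class LossSketch(nn.Layer):
    def __init__(self):
        super().__init__()
        self.centers = self.create_parameter(shape=[10, 2])  # key: centers


net, loss = NetSketch(), LossSketch()

# mirror save_model: merge the two disjoint state dicts into one payload
merged = net.state_dict()
assert not set(merged) & set(loss.state_dict())
merged.update(loss.state_dict())
paddle.save(merged, "demo.pdparams")

# mirror init_model: both layers restore from the same merged dict,
# each picking out its own keys
restored = paddle.load("demo.pdparams")
net.set_state_dict(restored)
loss.set_state_dict(restored)
```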