fix adamwdl bug (#3097)

Authored by sky on 2024-03-04 14:18:29 +08:00; committed by GitHub.
parent 1ded6d1cbd
commit 54767fdda4


@@ -21,6 +21,7 @@ import inspect
 if not hasattr(inspect, 'getargspec'):
     inspect.getargspec = inspect.getfullargspec
+import paddle
 from paddle import optimizer as optim
 from ppcls.utils import logger
 from functools import partial
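
This first hunk does two things: the pre-existing shim keeps inspect.getargspec usable on Python 3.11+, where it was removed, and paddle is now imported at module level because the code added further down calls paddle.assign, paddle.static.name_scope, paddle.static.device_guard, and paddle.float16 directly rather than only through paddle.optimizer. A standalone sketch of what the shim buys (plain Python, no Paddle needed):

    import inspect

    # getargspec was removed in Python 3.11; alias it to getfullargspec so
    # legacy callers keep working. The two return types differ slightly, so
    # this is a best-effort shim, same as in the module above.
    if not hasattr(inspect, 'getargspec'):
        inspect.getargspec = inspect.getfullargspec

    def f(a, b=1, *args, **kwargs):
        return a

    print(inspect.getargspec(f))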
@@ -380,6 +381,8 @@ class AdamWDL(object):
         self.layerwise_decay = layerwise_decay
         self.name_dict = name_dict
         self.n_layers = n_layers
+        self._coeff = weight_decay
+        self._lr_to_coeff = dict()
         self.set_param_lr_func = partial(
             self._layerwise_lr_decay, layerwise_decay, name_dict, n_layers)
         super().__init__(
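
The two new attributes back the reimplementation added below: _coeff stores the weight-decay coefficient, and _lr_to_coeff caches the factor 1 - lr * coeff per learning-rate value so it is built once rather than once per parameter. A minimal sketch of that caching pattern (plain Python with float keys; in the diff the key can be a learning-rate tensor):

    # Cache 1 - lr * coeff per distinct learning rate and reuse it.
    _lr_to_coeff = {}

    def decay_coeff_for(lr, coeff):
        c = _lr_to_coeff.get(lr)
        if c is None:
            c = 1.0 - lr * coeff
            _lr_to_coeff[lr] = c
        return c

    print(decay_coeff_for(0.001, 0.05))  # 0.99995, computed
    print(decay_coeff_for(0.001, 0.05))  # 0.99995, from the cache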
@@ -393,8 +396,7 @@ class AdamWDL(object):
             apply_decay_param_fun=apply_decay_param_fun,
             weight_decay=weight_decay,
             lazy_mode=lazy_mode,
-            multi_precision=multi_precision,
-            lr_ratio=self.set_param_lr_func)
+            multi_precision=multi_precision,)

     # Layerwise decay
     def _layerwise_lr_decay(self, decay_rate, name_dict, n_layers, param):
@@ -422,6 +424,68 @@ class AdamWDL(object):
             ratio = decay_rate**(n_layers + 1)
         # param.optimize_attr["learning_rate"] *= ratio
         return ratio

+    def _append_decoupled_weight_decay(self, block, param_and_grad):
+        """
+        Add the decoupled weight decay op.
+            parameter = parameter - parameter * coeff * lr
+        Args:
+            block: the block in which the variable is to be created.
+            param_and_grad: (parameter, gradient) pairs of the
+                parameters to decay.
+        Raises:
+            Exception: the types of coeff and parameter are inconsistent.
+        """
+        if isinstance(param_and_grad, dict):
+            param_and_grad = self._update_param_group(param_and_grad)
+        param, grad = param_and_grad
+
+        if self._apply_decay_param_fun is not None \
+                and not self._apply_decay_param_fun(param.name):
+            return
+
+        if isinstance(self._learning_rate, float):
+            learning_rate = self._learning_rate
+        else:
+            # NOTE: this runs inside _append_optimize_op() because
+            # _create_param_lr() must be called after
+            # optimizer._create_global_learning_rate().
+            learning_rate = self._create_param_lr(param_and_grad)
+
+        with block.program._optimized_guard(
+            [param, grad]), paddle.static.name_scope("weight decay"):
+            self._params_name.add(param.name)
+
+            # If the coefficient has already been computed, reuse it.
+            # NOTE(wangxi): in dygraph mode, apply_gradient runs every
+            # step, so _lr_to_coeff must be cleared every step; this is
+            # done in _create_optimization_pass.
+            decay_coeff = self._lr_to_coeff.get(learning_rate, None)
+            if decay_coeff is None:
+                # NOTE(wangxi): for pipeline to set device:all
+                with paddle.static.device_guard(None):
+                    decay_coeff = 1.0 - learning_rate * self._coeff
+                self._lr_to_coeff[learning_rate] = decay_coeff
+
+            find_master = self._multi_precision and param.dtype == paddle.float16
+            if find_master:
+                master_weight = self._master_weights[param.name]
+                scaled_param = master_weight * decay_coeff
+                paddle.assign(scaled_param, output=master_weight)
+            else:
+                scaled_param = param * decay_coeff
+                paddle.assign(scaled_param, output=param)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if self.set_param_lr_func is None:
+            return super()._append_optimize_op(block, param_and_grad)
+
+        self._append_decoupled_weight_decay(block, param_and_grad)
+        prev_lr = param_and_grad[0].optimize_attr["learning_rate"]
+        ratio = self.set_param_lr_func(param_and_grad[0])
+        param_and_grad[0].optimize_attr["learning_rate"] *= ratio
+        # execute the Adam op
+        res = super()._append_optimize_op(block, param_and_grad)
+        param_and_grad[0].optimize_attr["learning_rate"] = prev_lr
+        return res

     def __call__(self, model_list):
         model = model_list[0]
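
Together, the two new methods replace what the lr_ratio argument used to do. _append_decoupled_weight_decay applies param <- param * (1 - lr * coeff) before the Adam update (operating on the float32 master weights when float16 parameters carry one), and _append_optimize_op temporarily multiplies the parameter's learning rate by the layerwise ratio, delegates to the parent Adam op, and then restores the previous rate. A NumPy sketch of the resulting update order (illustrative values; the depth-dependent exponent is an assumption consistent with the decay_rate**(n_layers + 1) branch visible above):

    import numpy as np

    lr, coeff, decay_rate, n_layers = 0.001, 0.05, 0.9, 12
    layer_idx = 4                       # depth assumed parsed from the name

    # 1) decoupled weight decay, applied before the Adam step
    param = np.array([1.0, -2.0])
    param *= 1.0 - lr * coeff

    # 2) Adam step taken with a layerwise-scaled learning rate;
    #    layers nearer the head get a larger ratio
    ratio = decay_rate ** (n_layers + 1 - layer_idx)
    adam_direction = np.array([0.1, 0.3])   # stand-in for Adam's output
    param -= lr * ratio * adam_direction
    print(param)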
@@ -442,7 +506,6 @@ class AdamWDL(object):
                 weight_decay = 0.
         else:
             parameters = model.parameters()
-
         opt_args = dict(
             learning_rate=self.learning_rate, weight_decay=self.weight_decay)
         opt_args['parameters'] = parameters
@@ -458,7 +521,6 @@ class AdamWDL(object):
                 name_dict[p.name] = n
             opt_args['name_dict'] = name_dict
             opt_args['n_layers'] = model.get_num_layers()
-
         optimizer = self.AdamWDLImpl(**opt_args)

         return optimizer
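
For reference, the name_dict built in __call__ maps each parameter's runtime name to its dotted structural name, which is what _layerwise_lr_decay needs to recover a layer index. A minimal sketch of that mapping (requires Paddle; the toy model stands in for a real one exposing get_num_layers()):

    import paddle.nn as nn

    model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
    name_dict = {}
    for n, p in model.named_parameters():
        name_dict[p.name] = n   # e.g. {'linear_0.w_0': '0.weight', ...}
    print(name_dict)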