Mirror of https://github.com/PaddlePaddle/PaddleClas.git (synced 2025-06-03 21:55:06 +08:00)
fix adamwdl bug (#3097)

commit 54767fdda4 (parent 1ded6d1cbd)
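Reading the hunks below, the fix apparently removes the `lr_ratio` keyword that the nested `AdamWDLImpl.__init__` forwarded to `optim.AdamW`, and instead re-implements the behavior inside the subclass: `_append_decoupled_weight_decay` applies `param <- param * (1 - lr * coeff)`, and an overridden `_append_optimize_op` temporarily scales the parameter's learning rate by the layerwise ratio before running the Adam op. As a minimal, framework-free sketch of those two updates (not the patch itself): the names `layerwise_ratio`, `step_with_decoupled_decay`, and the `depth` argument are hypothetical, plain SGD stands in for the Adam op, and the exponent loosely follows the `decay_rate**(n_layers + 1)` term visible in the hunk.

# Minimal sketch under the assumptions above; not PaddleClas code.
def layerwise_ratio(decay_rate: float, n_layers: int, depth: int) -> float:
    # Deeper parameters (larger depth) keep more of the base learning rate;
    # depth == n_layers + 1 yields ratio == 1.0.
    return decay_rate ** (n_layers + 1 - depth)


def step_with_decoupled_decay(param: float, grad: float, lr: float,
                              coeff: float, ratio: float) -> float:
    # 1) decoupled weight decay, as in _append_decoupled_weight_decay:
    #    param <- param * (1 - lr * coeff)
    param *= 1.0 - lr * coeff
    # 2) gradient step with the layerwise-scaled learning rate; plain SGD
    #    stands in here for the Adam op that _append_optimize_op re-runs.
    return param - lr * ratio * grad


if __name__ == "__main__":
    r = layerwise_ratio(decay_rate=0.65, n_layers=12, depth=3)
    print(step_with_decoupled_decay(param=1.0, grad=0.1, lr=1e-3,
                                    coeff=0.05, ratio=r))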
@@ -21,6 +21,7 @@ import inspect
 if not hasattr(inspect, 'getargspec'):
     inspect.getargspec = inspect.getfullargspec
 
+import paddle
 from paddle import optimizer as optim
 from ppcls.utils import logger
 from functools import partial
@@ -380,6 +381,8 @@ class AdamWDL(object):
             self.layerwise_decay = layerwise_decay
             self.name_dict = name_dict
             self.n_layers = n_layers
+            self._coeff = weight_decay
+            self._lr_to_coeff = dict()
             self.set_param_lr_func = partial(
                 self._layerwise_lr_decay, layerwise_decay, name_dict, n_layers)
             super().__init__(
@@ -393,8 +396,7 @@ class AdamWDL(object):
                 apply_decay_param_fun=apply_decay_param_fun,
                 weight_decay=weight_decay,
                 lazy_mode=lazy_mode,
-                multi_precision=multi_precision,
-                lr_ratio=self.set_param_lr_func)
+                multi_precision=multi_precision,)
 
         # Layerwise decay
         def _layerwise_lr_decay(self, decay_rate, name_dict, n_layers, param):
@@ -422,6 +424,68 @@ class AdamWDL(object):
                 ratio = decay_rate**(n_layers + 1)
             # param.optimize_attr["learning_rate"] *= ratio
             return ratio
+
+        def _append_decoupled_weight_decay(self, block, param_and_grad):
+            """
+            Add decoupled weight decay op.
+                parameter = parameter - parameter * coeff * lr
+            Args:
+                block: block in which variable is to be created
+                param_and_grad: (parameters, gradients) pairs,
+                    the parameters need to decay.
+            Raises:
+                Exception: The type of coeff and parameter is not consistent.
+            """
+            if isinstance(param_and_grad, dict):
+                param_and_grad = self._update_param_group(param_and_grad)
+            param, grad = param_and_grad
+
+            if self._apply_decay_param_fun is not None and not self._apply_decay_param_fun(param.name):
+                return
+
+            if isinstance(self._learning_rate, float):
+                learning_rate = self._learning_rate
+            else:
+                # NOTE. We add this function to the _append_optimize_op(),
+                # for we must make sure _create_param_lr() be called after
+                # optimizer._create_global_learning_rate().
+                learning_rate = self._create_param_lr(param_and_grad)
+
+            with block.program._optimized_guard([param, grad]), paddle.static.name_scope("weight decay"):
+                self._params_name.add(param.name)
+
+                # If it has been calculated, the result will be reused.
+                # NOTE(wangxi): In dygraph mode, apply_gradient will be executed
+                # every step, so need clear _lr_to_coeff every step,
+                # we do this in _create_optimization_pass
+                decay_coeff = self._lr_to_coeff.get(learning_rate, None)
+                if decay_coeff is None:
+                    # NOTE(wangxi): for pipeline to set device:all
+                    with paddle.static.device_guard(None):
+                        decay_coeff = 1.0 - learning_rate * self._coeff
+                    self._lr_to_coeff[learning_rate] = decay_coeff
+
+                find_master = self._multi_precision and param.dtype == paddle.float16
+                if find_master:
+                    master_weight = self._master_weights[param.name]
+                    scaled_param = master_weight * decay_coeff
+                    paddle.assign(scaled_param, output=master_weight)
+                else:
+                    scaled_param = param * decay_coeff
+                    paddle.assign(scaled_param, output=param)
+
+        def _append_optimize_op(self, block, param_and_grad):
+            if self.set_param_lr_func is None:
+                return super()._append_optimize_op(block, param_and_grad)
+
+            self._append_decoupled_weight_decay(block, param_and_grad)
+            prev_lr = param_and_grad[0].optimize_attr["learning_rate"]
+            ratio = self.set_param_lr_func(param_and_grad[0])
+            param_and_grad[0].optimize_attr["learning_rate"] *= ratio
+
+            # excute Adam op
+            res = super()._append_optimize_op(block, param_and_grad)
+            param_and_grad[0].optimize_attr["learning_rate"] = prev_lr
+            return res
 
     def __call__(self, model_list):
         model = model_list[0]
@@ -442,7 +506,6 @@ class AdamWDL(object):
             weight_decay = 0.
         else:
             parameters = model.parameters()
-
         opt_args = dict(
             learning_rate=self.learning_rate, weight_decay=self.weight_decay)
         opt_args['parameters'] = parameters
@@ -458,7 +521,6 @@ class AdamWDL(object):
                 name_dict[p.name] = n
             opt_args['name_dict'] = name_dict
             opt_args['n_layers'] = model.get_num_layers()
-
         optimizer = self.AdamWDLImpl(**opt_args)
 
         return optimizer