diff --git a/timm/optim/_optim_factory.py b/timm/optim/_optim_factory.py
index b3759a37..97cbfd22 100644
--- a/timm/optim/_optim_factory.py
+++ b/timm/optim/_optim_factory.py
@@ -432,17 +432,17 @@ def _register_adam_variants(registry: OptimizerRegistry) -> None:
         OptimInfo(
             name='adafactorbv',
             opt_class=AdafactorBigVision,
-            description='Big Vision variant of Adafactor with factored gradients, half precision momentum.',
+            description='Big Vision variant of Adafactor with factored gradients, half precision momentum',
         ),
         OptimInfo(
             name='adopt',
             opt_class=Adopt,
-            description='Memory-efficient implementation of Adam with factored gradients',
+            description='Modified Adam that can converge with any β2 with the optimal rate',
         ),
         OptimInfo(
             name='adoptw',
             opt_class=Adopt,
-            description='Memory-efficient implementation of Adam with factored gradients',
+            description='Modified AdamW (decoupled decay) that can converge with any β2 with the optimal rate',
             defaults={'decoupled': True}
         ),
     ]
diff --git a/timm/optim/adopt.py b/timm/optim/adopt.py
index 648d9b6a..486cb626 100644
--- a/timm/optim/adopt.py
+++ b/timm/optim/adopt.py
@@ -51,6 +51,10 @@ def _get_value(x):


 class Adopt(Optimizer):
+    """
+    ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate: https://arxiv.org/abs/2411.02853
+
+    """
     def __init__(
             self,
             params,
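
With these registry entries in place, the `adopt` and `adoptw` names should resolve through timm's standard optimizer factory. Below is a minimal usage sketch (not part of the diff), assuming a timm build that includes this change and the existing `create_optimizer_v2` entry point; the toy model and hyperparameters are illustrative only.

```python
import torch
import torch.nn as nn

from timm.optim import create_optimizer_v2

# Toy model just to have parameters to optimize.
model = nn.Linear(10, 2)

# 'adopt' selects Adopt with coupled weight decay;
# 'adoptw' selects the decoupled (AdamW-style) variant via defaults={'decoupled': True}.
optimizer = create_optimizer_v2(model, opt='adoptw', lr=1e-3, weight_decay=0.01)

# One illustrative training step.
x = torch.randn(4, 10)
loss = model(x).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```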