Remove an indent level in init_group for adopt, update optim tests, adopt failing rosenbrock

This commit is contained in:
Ross Wightman 2024-11-08 10:38:42 -08:00
parent 6db271015d
commit d73e8e7531
2 changed files with 102 additions and 66 deletions

View File

@ -175,7 +175,7 @@ def _test_basic_cases(constructor, scheduler_constructors=None):
)
def _test_model(optimizer, params, device=torch.device('cpu')):
def _test_model(optimizer, params, device=torch.device('cpu'), after_step=0):
weight = torch.tensor(
[[-0.2109, -0.4976], [-0.1413, -0.3420], [-0.2524, 0.6976]],
device=device, requires_grad=True)
@ -206,7 +206,8 @@ def _test_model(optimizer, params, device=torch.device('cpu')):
loss = output.sum()
loss.backward()
loss = loss.item()
assert loss < prev_loss
if i > after_step:
assert loss < prev_loss
prev_loss = loss
optimizer.step()
@ -235,31 +236,44 @@ def _test_rosenbrock(constructor, scheduler_constructors=None):
solution = torch.tensor([1, 1])
initial_dist = params.clone().detach().dist(solution)
def eval(params, w):
def get_grad(_param, _sparse_grad, _w):
grad = drosenbrock(params.clone().detach())
# Depending on w, provide only the x or y gradient
if _sparse_grad:
if _w:
i = torch.tensor([[0, 0]], dtype=torch.int64)
x = grad[0]
v = torch.tensor([x / 4.0, x - x / 4.0])
else:
i = torch.tensor([[1, 1]], dtype=torch.int64)
y = grad[1]
v = torch.tensor([y - y / 4.0, y / 4.0])
grad_out = torch.sparse_coo_tensor(i, v, (2,), dtype=v.dtype)
else:
if _w:
grad_out = torch.tensor([grad[0], 0], dtype=_param.dtype)
else:
grad_out = torch.tensor([0, grad[1]], dtype=_param.dtype)
return grad_out
def eval(_param, _sparse_grad, _w):
# Depending on w, provide only the x or y gradient
optimizer.zero_grad()
loss = rosenbrock(params)
loss = rosenbrock(_param)
loss.backward()
grad = drosenbrock(params.clone().detach())
# NB: We torture test the optimizer by returning an
# uncoalesced sparse tensor
if w:
i = torch.LongTensor([[0, 0]])
x = grad[0]
v = torch.tensor([x / 4., x - x / 4.])
else:
i = torch.LongTensor([[1, 1]])
y = grad[1]
v = torch.tensor([y - y / 4., y / 4.])
x = torch.sparse.DoubleTensor(i, v, torch.Size([2])).to(dtype=v.dtype)
grad_out = get_grad(_param, _sparse_grad, _w)
with torch.no_grad():
params.grad = x.to_dense()
_param.grad = grad_out.to_dense()
return loss
for i in range(2000):
# Do cyclic coordinate descent
w = i % 2
optimizer.step(functools.partial(eval, params, w))
optimizer.step(functools.partial(eval, params, True, w))
for scheduler in schedulers:
if isinstance(scheduler, PlateauLRScheduler):
scheduler.step(rosenbrock(params))
@ -340,7 +354,7 @@ def test_sgd(optimizer):
_test_model(optimizer, dict(lr=1e-3))
@pytest.mark.parametrize('optimizer', ['adamw', 'adam', 'nadam', 'adamax'])
@pytest.mark.parametrize('optimizer', ['adamw', 'adam', 'nadam', 'adamax', 'nadamw'])
def test_adam(optimizer):
_test_basic_cases(
lambda weight, bias: create_optimizer_v2([weight, bias], optimizer, lr=1e-3)
@ -363,6 +377,30 @@ def test_adam(optimizer):
_test_model(optimizer, dict(lr=5e-2))
@pytest.mark.parametrize('optimizer', ['adopt', 'adoptw'])
def test_adopt(optimizer):
_test_basic_cases(
lambda weight, bias: create_optimizer_v2([weight, bias], optimizer, lr=1e-3)
)
_test_basic_cases(
lambda weight, bias: create_optimizer_v2(
_build_params_dict(weight, bias, lr=3e-3),
optimizer,
lr=1e-3)
)
_test_basic_cases(
lambda weight, bias: create_optimizer_v2(
_build_params_dict_single(weight, bias, lr=3e-3),
optimizer,
lr=1e-3)
)
# FIXME rosenbrock is not passing for ADOPT
# _test_rosenbrock(
# lambda params: create_optimizer_v2(params, optimizer, lr=1e-3)
# )
_test_model(optimizer, dict(lr=5e-2), after_step=1) # note no convergence in first step for ADOPT
@pytest.mark.parametrize('optimizer', ['adabelief'])
def test_adabelief(optimizer):
_test_basic_cases(
@ -446,7 +484,7 @@ def test_adaother(optimizer):
_test_model(optimizer, dict(lr=5e-2))
@pytest.mark.parametrize('optimizer', ['adafactor'])
@pytest.mark.parametrize('optimizer', ['adafactor', 'adafactorbv'])
def test_adafactor(optimizer):
_test_basic_cases(
lambda weight, bias: create_optimizer_v2([weight, bias], optimizer, lr=1e-3)

View File

@ -129,58 +129,55 @@ class Adopt(Optimizer):
):
has_complex = False
for p in group["params"]:
if p.grad is not None:
has_complex |= torch.is_complex(p)
params_with_grad.append(p)
if p.grad.is_sparse:
raise RuntimeError(
"ADOPT does not support sparse gradients"
)
grads.append(p.grad)
if p.grad is None:
continue
has_complex |= torch.is_complex(p)
params_with_grad.append(p)
if p.grad.is_sparse:
raise RuntimeError(
"ADOPT does not support sparse gradients"
)
grads.append(p.grad)
state = self.state[p]
# Lazy state initialization
if len(state) == 0:
# note(crcrpar): [special device hosting for step]
# Deliberately host `step` on CPU if both capturable and fused are off.
# This is because kernel launches are costly on CUDA and XLA.
state["step"] = (
torch.zeros(
(),
dtype=_get_scalar_dtype(),
device=p.device,
)
if group["capturable"]
else torch.tensor(0.0, dtype=_get_scalar_dtype())
)
# Exponential moving average of gradient values
state["exp_avg"] = torch.zeros_like(
p, memory_format=torch.preserve_format
)
# Exponential moving average of squared gradient values
state["exp_avg_sq"] = torch.zeros_like(
p, memory_format=torch.preserve_format
state = self.state[p]
# Lazy state initialization
if len(state) == 0:
# note(crcrpar): [special device hosting for step]
# Deliberately host `step` on CPU if both capturable and fused are off.
# This is because kernel launches are costly on CUDA and XLA.
state["step"] = (
torch.zeros(
(),
dtype=_get_scalar_dtype(),
device=p.grad.device,
)
if group["capturable"]
else torch.tensor(0.0, dtype=_get_scalar_dtype())
)
# Exponential moving average of gradient values
state["exp_avg"] = torch.zeros_like(
p.grad, memory_format=torch.preserve_format
)
# Exponential moving average of squared gradient values
state["exp_avg_sq"] = torch.zeros_like(
p.grad, memory_format=torch.preserve_format
)
exp_avgs.append(state["exp_avg"])
exp_avg_sqs.append(state["exp_avg_sq"])
exp_avgs.append(state["exp_avg"])
exp_avg_sqs.append(state["exp_avg_sq"])
if group["differentiable"] and state["step"].requires_grad:
raise RuntimeError(
"`requires_grad` is not supported for `step` in differentiable mode"
)
if group["differentiable"] and state["step"].requires_grad:
raise RuntimeError(
"`requires_grad` is not supported for `step` in differentiable mode"
)
# Foreach without capturable does not support a tensor lr
if (
group["foreach"]
and torch.is_tensor(group["lr"])
and not group["capturable"]
):
raise RuntimeError(
"lr as a Tensor is not supported for capturable=False and foreach=True"
)
# Foreach without capturable does not support a tensor lr
if group["foreach"] and torch.is_tensor(group["lr"]) and not group["capturable"]:
raise RuntimeError(
"lr as a Tensor is not supported for capturable=False and foreach=True"
)
state_steps.append(state["step"])
state_steps.append(state["step"])
return has_complex
#@_use_grad_for_differentiable # FIXME internal context mgr, can't use
@ -312,6 +309,7 @@ def _single_tensor_adopt(
exp_avg.mul_(beta1).addcdiv_(grad, denom, value=1 - beta1)
param.add_(exp_avg, alpha=-lr)
exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)