# Copyright 2019 Alibaba Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ArcFaceLinear module tests."""
import os
import random
import unittest

import numpy as np
import torch

from easycv.core import sailfish


def mp_vs_ddp_main(gpu, gpus_per_worker):
r"""Model parallel vs. DDP"""
torch.cuda.set_device(gpu)
torch.distributed.init_process_group(
'nccl', rank=gpu, world_size=gpus_per_worker)
try:
num_steps = 5
freeze_num_steps = 5
learning_rate = 0.1
batch_size = 2
image_size = 5
emb_size = 3
num_classes = 8
margin_m = 0.5
margin_s = 64
momentum = 0.9
model_parallel = sailfish.ModelParallel(gpu, gpus_per_worker)
zeros_init = sailfish.ZerosInitializer()
# baseline
torch.manual_seed(42)
random.seed(42)
baseline_fe = torch.nn.Linear(image_size, emb_size).cuda()
baseline_fe = torch.nn.parallel.DistributedDataParallel(
baseline_fe, device_ids=[gpu])
baseline_fe_params = list(baseline_fe.parameters())
baseline_fc = sailfish.ArcFaceLinear(
emb_size,
num_classes,
margin=margin_m,
scale=margin_s,
weight_initializer=zeros_init).cuda()
baseline_fc = torch.nn.parallel.DistributedDataParallel(
baseline_fc, device_ids=[gpu])
baseline_fc_params = list(baseline_fc.parameters())
baseline_criterion = torch.nn.CrossEntropyLoss().cuda()
baseline_optimizer = torch.optim.SGD(
[{
'params': baseline_fe.parameters()
}, {
'params': baseline_fc.parameters()
}],
lr=learning_rate,
momentum=momentum)
baseline_fe.train()
baseline_fc.train()
baseline_criterion.train()
# hybrid parallelism
torch.manual_seed(42)
random.seed(42)
fe = torch.nn.Linear(image_size, emb_size).cuda()
fe = torch.nn.parallel.DistributedDataParallel(fe, device_ids=[gpu])
fe_params = list(fe.parameters())
fc = sailfish.ArcFaceLinear(
emb_size,
num_classes,
margin=margin_m,
scale=margin_s,
weight_initializer=zeros_init,
parallel=model_parallel).cuda()
fc_params = list(fc.parameters())
criterion = sailfish.CrossEntropyLoss(parallel=model_parallel).cuda()
optimizer = torch.optim.SGD([{
'params': fe.parameters()
}, {
'params': fc.parameters()
}],
lr=learning_rate,
momentum=momentum)
fe.train()
fc.train()
criterion.train()
        for step in range(num_steps):
            # baseline
            torch.manual_seed(42 * step + gpu)
            random.seed(42 * step + gpu)
            baseline_data = torch.randn([batch_size, image_size]).cuda()
            baseline_label = torch.as_tensor([
                random.randint(0, num_classes - 1) for _ in range(batch_size)
            ]).cuda()
            baseline_features = baseline_fe(baseline_data)
            baseline_logits = baseline_fc(baseline_features, baseline_label)
            baseline_loss = baseline_criterion(baseline_logits, baseline_label)
            baseline_loss = model_parallel.reduce_sum(baseline_loss)
            baseline_loss = baseline_loss / gpus_per_worker
            baseline_optimizer.zero_grad()
            baseline_loss.backward()
            baseline_optimizer.step()
            # hybrid parallelism
            torch.manual_seed(42 * step + gpu)
            random.seed(42 * step + gpu)
            data = torch.randn([batch_size, image_size]).cuda()
            label = torch.as_tensor([
                random.randint(0, num_classes - 1) for _ in range(batch_size)
            ]).cuda()
            features = fe(data)
            all_features = model_parallel.gather(features)
            all_label = model_parallel.gather_target(label)
            shard_logits = fc(all_features, all_label)
            loss = criterion(shard_logits, all_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # eval
            torch.manual_seed(42 * step + gpu)
            random.seed(42 * step + gpu)
            with torch.no_grad():
                gathered_logits = model_parallel.gather(shard_logits, dim=1)
                gathered_baseline_logits = model_parallel.gather(
                    baseline_logits, dim=0)
                logits_norm_val = torch.norm(gathered_logits).item()
                baseline_logits_norm_val = torch.norm(
                    gathered_baseline_logits).item()
                np.testing.assert_allclose(
                    logits_norm_val,
                    baseline_logits_norm_val,
                    rtol=1e-5,
                    atol=1e-4,
                    err_msg='logits at gpu {} step {}'.format(gpu, step))
                loss_val = loss.cpu().detach().numpy()
                baseline_loss_val = baseline_loss.cpu().detach().numpy()
                np.testing.assert_allclose(
                    loss_val,
                    baseline_loss_val,
                    rtol=1e-5,
                    atol=1e-4,
                    err_msg='loss at gpu {} step {}'.format(gpu, step))
                fc_grad = model_parallel.gather(fc_params[0].grad)
                baseline_fc_grad = baseline_fc_params[0].grad
                np.testing.assert_allclose(
                    fc_grad.cpu().detach().numpy(),
                    baseline_fc_grad.cpu().detach().numpy(),
                    rtol=1e-5,
                    atol=1e-4,
                    err_msg='fc grad at gpu {} step {}'.format(gpu, step))
                fe_weight = fe_params[0]
                baseline_fe_weight = baseline_fe_params[0]
                np.testing.assert_allclose(
                    fe_weight.cpu().detach().numpy(),
                    baseline_fe_weight.cpu().detach().numpy(),
                    rtol=1e-5,
                    atol=1e-4,
                    err_msg='fe weight at gpu {} step {}'.format(gpu, step))
        # freeze the feature extractors; keep training the ArcFace heads
        for p in baseline_fe.parameters():
            p.requires_grad = False
        for p in fe.parameters():
            p.requires_grad = False
        for step in range(freeze_num_steps):
            # baseline
            torch.manual_seed(100 * step + gpu)
            random.seed(100 * step + gpu)
            baseline_data = torch.randn([batch_size, image_size]).cuda()
            baseline_label = torch.as_tensor([
                random.randint(0, num_classes - 1) for _ in range(batch_size)
            ]).cuda()
            baseline_features = baseline_fe(baseline_data)
            baseline_logits = baseline_fc(baseline_features, baseline_label)
            baseline_loss = baseline_criterion(baseline_logits, baseline_label)
            baseline_loss = model_parallel.reduce_sum(baseline_loss)
            baseline_loss = baseline_loss / gpus_per_worker
            baseline_optimizer.zero_grad()
            baseline_loss.backward()
            baseline_optimizer.step()
            # hybrid parallelism
            torch.manual_seed(100 * step + gpu)
            random.seed(100 * step + gpu)
            data = torch.randn([batch_size, image_size]).cuda()
            label = torch.as_tensor([
                random.randint(0, num_classes - 1) for _ in range(batch_size)
            ]).cuda()
            features = fe(data)
            all_features = model_parallel.gather(features)
            all_label = model_parallel.gather_target(label)
            shard_logits = fc(all_features, all_label)
            loss = criterion(shard_logits, all_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # eval
            torch.manual_seed(100 * step + gpu)
            random.seed(100 * step + gpu)
            with torch.no_grad():
                gathered_logits = model_parallel.gather(shard_logits, dim=1)
                gathered_baseline_logits = model_parallel.gather(
                    baseline_logits, dim=0)
                logits_norm_val = torch.norm(gathered_logits).item()
                baseline_logits_norm_val = torch.norm(
                    gathered_baseline_logits).item()
                np.testing.assert_allclose(
                    logits_norm_val,
                    baseline_logits_norm_val,
                    rtol=1e-5,
                    atol=1e-4,
                    err_msg='freeze logits at gpu {} step {}'.format(
                        gpu, step))
                loss_val = loss.cpu().detach().numpy()
                baseline_loss_val = baseline_loss.cpu().detach().numpy()
                np.testing.assert_allclose(
                    loss_val,
                    baseline_loss_val,
                    rtol=1e-5,
                    atol=1e-4,
                    err_msg='freeze loss at gpu {} step {}'.format(gpu, step))
                fc_grad = model_parallel.gather(fc_params[0].grad)
                baseline_fc_grad = baseline_fc_params[0].grad
                np.testing.assert_allclose(
                    fc_grad.cpu().detach().numpy(),
                    baseline_fc_grad.cpu().detach().numpy(),
                    rtol=1e-5,
                    atol=1e-4,
                    err_msg='freeze fc grad at gpu {} step {}'.format(
                        gpu, step))
                fe_weight = fe_params[0]
                baseline_fe_weight = baseline_fe_params[0]
                np.testing.assert_allclose(
                    fe_weight.cpu().detach().numpy(),
                    baseline_fe_weight.cpu().detach().numpy(),
                    rtol=1e-5,
                    atol=1e-4,
                    err_msg='freeze fe weight at gpu {} step {}'.format(
                        gpu, step))
    finally:
        torch.distributed.destroy_process_group()


def mp_main(gpu,
            gpus_per_worker,
            results,
            num_steps=1,
            batch_size=1,
            num_classes=8):
r"""Model parallel"""
torch.cuda.set_device(gpu)
torch.distributed.init_process_group(
'nccl', rank=gpu, world_size=gpus_per_worker)
zeros_init = sailfish.ZerosInitializer()
try:
emb_size = 3
learning_rate = 0.1
margin_m = 0.5
margin_s = 64
momentum = 0.9
image_size = 6
model_parallel = sailfish.ModelParallel(gpu, gpus_per_worker)
# hybrid parallelism
torch.manual_seed(42)
random.seed(42)
fe = torch.nn.Linear(image_size, emb_size).cuda()
fc = sailfish.ArcFaceLinear(
emb_size,
num_classes,
margin=margin_m,
scale=margin_s,
weight_initializer=zeros_init,
parallel=model_parallel).cuda()
fc_params = list(fc.parameters())
criterion = sailfish.CrossEntropyLoss(parallel=model_parallel).cuda()
optimizer = torch.optim.SGD(
fc.parameters(), lr=learning_rate, momentum=momentum)
fc.train()
criterion.train()
for step in range(num_steps):
baseline = results[step]
torch.manual_seed(42 * step + gpu)
random.seed(42 * step + gpu)
data = torch.randn([batch_size, image_size]).cuda()
features = fe(data)
label = torch.as_tensor([
random.randint(0, num_classes - 1) for _ in range(batch_size)
]).cuda()
all_features = model_parallel.gather(features)
all_label = model_parallel.gather_target(label)
torch.manual_seed(42 * step)
random.seed(42 * step)
np.testing.assert_equal(
list(all_features.size()),
baseline['features/size'],
err_msg='Wrong features size at gpu {} step {}'.format(
gpu, step))
np.testing.assert_allclose(
torch.norm(all_features).item(),
baseline['features/norm'],
rtol=1e-5,
err_msg='Wrong features norm at gpu {} step {}'.format(
gpu, step))
shard_logits = fc(all_features, all_label)
loss = criterion(shard_logits, all_label)
np.testing.assert_allclose(
loss.item(),
baseline['loss'],
rtol=1e-5,
err_msg='Wrong loss at gpu {} step {}'.format(gpu, step))
optimizer.zero_grad()
loss.backward()
optimizer.step()
fc_grad = model_parallel.gather(fc_params[0].grad)
np.testing.assert_allclose(
torch.norm(fc_grad).item(),
baseline['logits/grad/norm'],
rtol=1e-5,
err_msg='Wrong logits grad at gpu {} step {}'.format(
gpu, step))
finally:
torch.distributed.destroy_process_group()
def baseline_main(gpus_per_worker, num_steps=1, batch_size=1, num_classes=8):
r"""run on 1 GPU"""
emb_size = 3
learning_rate = 0.1
momentum = 0.9
image_size = 6
zeros_init = sailfish.ZerosInitializer()
# hybrid parallelism
torch.manual_seed(42)
random.seed(42)
fe = torch.nn.Linear(image_size, emb_size).cuda()
fc = sailfish.ArcFaceLinear(
emb_size, num_classes, weight_initializer=zeros_init).cuda()
fc_params = list(fc.parameters())
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(
fc.parameters(), lr=learning_rate, momentum=momentum)
fc.train()
criterion.train()
results = []
for step in range(num_steps):
result_item = {}
features_list = []
label_list = []
for gpu in range(gpus_per_worker):
torch.manual_seed(42 * step + gpu)
random.seed(42 * step + gpu)
features_list.append(
fe(torch.randn([batch_size, image_size]).cuda()))
label_list.append(
torch.as_tensor([
random.randint(0, num_classes - 1)
for _ in range(batch_size)
]).cuda())
all_features = torch.cat(features_list)
all_label = torch.cat(label_list)
torch.manual_seed(42 * step)
random.seed(42 * step)
result_item['features/size'] = list(all_features.size())
result_item['features/norm'] = torch.norm(all_features).item()
logits = fc(all_features, all_label)
loss = criterion(logits, all_label)
result_item['loss'] = loss.item()
optimizer.zero_grad()
loss.backward()
optimizer.step()
result_item['logits/grad/norm'] = torch.norm(fc_params[0].grad).item()
results.append(result_item)
return results
class TestArcFaceLinear(unittest.TestCase):
r"""Test sailfish.ArcFaceLinear."""
def test_no_parallel(self):
r"""Test sailfish.ArcFaceLinear without parallel."""
in_features = 1
out_features = 2
margin_m = random.random()
margin_s = random.random()
features = torch.randn([1, in_features])
label = torch.as_tensor([random.randint(0, out_features - 1)])
torch.manual_seed(42)
random.seed(42)
baseline = sailfish.ArcFaceLinear(
in_features, out_features, margin=margin_m, scale=margin_s)
baseline_optimizer = torch.optim.SGD(baseline.parameters(), lr=1.)
baseline.train()
baseline_logits = baseline(features, label)
baseline_loss = torch.sum(baseline_logits)
baseline_optimizer.zero_grad()
baseline_loss.backward()
baseline_optimizer.step()
torch.manual_seed(42)
random.seed(42)
fc = sailfish.ArcFaceLinear(
in_features, out_features, margin=margin_m, scale=margin_s)
optimizer = torch.optim.SGD(fc.parameters(), lr=1.)
fc.train()
logits = fc(features, label)
loss = torch.sum(logits)
optimizer.zero_grad()
loss.backward()
optimizer.step()
np.testing.assert_allclose(
logits.detach().numpy(),
baseline_logits.detach().numpy(),
err_msg='logits not equal to baseline')
np.testing.assert_allclose(
[p.detach().numpy() for p in baseline.parameters()],
[p.detach().numpy() for p in fc.parameters()],
err_msg='parameters not equal to baseline')
def test_mp(self):
r"""Test sailfish.ArcFaceLinear on 1 GPU."""
in_features = 1
out_features = 2
features = torch.randn([1, in_features])
label = torch.as_tensor([random.randint(0, out_features - 1)])
torch.manual_seed(42)
random.seed(42)
baseline = sailfish.ArcFaceLinear(in_features, out_features)
baseline_optimizer = torch.optim.SGD(baseline.parameters(), lr=1.)
baseline.train()
baseline_logits = baseline(features, label)
baseline_loss = torch.sum(baseline_logits)
baseline_optimizer.zero_grad()
baseline_loss.backward()
baseline_optimizer.step()
torch.manual_seed(42)
random.seed(42)
model_parallel = sailfish.ModelParallel(0, 1)
fc = sailfish.ArcFaceLinear(
in_features, out_features, parallel=model_parallel)
optimizer = torch.optim.SGD(fc.parameters(), lr=1.)
fc.train()
logits = fc(features, label)
loss = torch.sum(logits)
optimizer.zero_grad()
loss.backward()
optimizer.step()
np.testing.assert_allclose(
logits.detach().numpy(),
baseline_logits.detach().numpy(),
err_msg='logits not equal to baseline')
np.testing.assert_allclose(
[p.detach().numpy() for p in baseline.parameters()],
[p.detach().numpy() for p in fc.parameters()],
err_msg='parameters not equal to baseline')
def cant_test_mp_vs_ddp(self):
r"""Test sailfish.ArcFaceLinear with model parallel."""
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '24601'
os.environ['WORLD_SIZE'] = '1'
os.environ['RANK'] = '0'
gpus_per_worker = torch.cuda.device_count()
torch.multiprocessing.spawn(
mp_vs_ddp_main,
args=(gpus_per_worker, ),
nprocs=gpus_per_worker,
join=True)
def test_mp_vs_1gpu(self):
r"""Test sailfish.ArcFaceLinear with model parallel."""
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '24601'
os.environ['WORLD_SIZE'] = '1'
os.environ['RANK'] = '0'
gpus_per_worker = torch.cuda.device_count()
num_steps = 5
batch_size = 1
num_classes = gpus_per_worker
results = baseline_main(gpus_per_worker, num_steps, batch_size,
num_classes)
torch.multiprocessing.spawn(
mp_main,
args=(gpus_per_worker, results, num_steps, batch_size,
num_classes),
nprocs=gpus_per_worker,
join=True)
if __name__ == '__main__':
unittest.main()