# Copyright (c) Alibaba, Inc. and its affiliates.
"""Contains functions which are convenient for unit testing."""
import copy
import logging
import os
import pickle
import shutil
import socket
import subprocess
import sys
import tempfile
import timeit
import unittest
import uuid
from multiprocessing import Process

import numpy as np
import torch

from easycv.file import io
from easycv.framework.errors import RuntimeError

TEST_DIR = '/tmp/ev_pytorch_test'


def get_tmp_dir():
    if os.environ.get('TEST_DIR', '') != '':
        global TEST_DIR
        TEST_DIR = os.environ['TEST_DIR']
    dir_name = os.path.join(TEST_DIR, uuid.uuid4().hex)
    if os.path.exists(dir_name):
        shutil.rmtree(dir_name)
    os.makedirs(dir_name)
    return dir_name


def clear_all_tmp_dirs():
    shutil.rmtree(TEST_DIR)


def replace_data_for_test(cfg):
    """
    Replace real data with test data.

    Args:
        cfg: Config object
    """
    pass


# Decorator to run a function in a subprocess, e.g. when the function starts
# a tf session, because TensorFlow GPU memory is not released until the
# process exits.
def RunAsSubprocess(f):

    def wrapped_f(*args, **kw):
        p = Process(target=f, args=args, kwargs=kw)
        p.start()
        p.join(timeout=600)
        assert p.exitcode == 0, 'subprocess run failed: %s' % f.__name__

    return wrapped_f
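

# Example usage of RunAsSubprocess (illustrative sketch; ``_my_gpu_test`` is a
# hypothetical function, not part of this module):
#
#     @RunAsSubprocess
#     def _my_gpu_test():
#         ...  # code that allocates GPU memory
#
#     _my_gpu_test()  # runs in a child process, so GPU memory is freed on exit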


def clean_up(test_dir):
    if test_dir is not None:
        shutil.rmtree(test_dir)
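

# Example usage of the temporary-directory helpers (illustrative sketch):
#
#     tmp_dir = get_tmp_dir()   # unique directory under TEST_DIR (or $TEST_DIR)
#     ...                       # write test artifacts into tmp_dir
#     clean_up(tmp_dir)         # remove it when the test is done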


def run_in_subprocess(cmd):
    try:
        with subprocess.Popen(
                cmd, shell=True, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE) as return_info:
            while True:
                next_line = return_info.stdout.readline()
                return_line = next_line.decode('utf-8', 'ignore').strip()
                if return_line == '' and return_info.poll() is not None:
                    break
                if return_line != '':
                    logging.info(return_line)

            err_lines = ''
            while True:
                next_line = return_info.stderr.readline()
                return_line = next_line.decode('utf-8', 'ignore').strip()
                if return_line == '' and return_info.poll() is not None:
                    break
                if return_line != '':
                    logging.info(return_line)
                    err_lines += return_line + '\n'

            return_code = return_info.wait()
            if return_code:
                raise RuntimeError(err_lines)
    except Exception as e:
        raise e
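

# Example usage of run_in_subprocess (illustrative sketch; the command is
# hypothetical). The helper streams stdout/stderr to logging and raises
# RuntimeError if the command exits with a non-zero code.
#
#     run_in_subprocess('python -c "print(1 + 1)"')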


def dist_exec_wrapper(cmd,
                      nproc_per_node,
                      node_rank=0,
                      nnodes=1,
                      port='29527',
                      addr='127.0.0.1',
                      python_path=None):
    """
    Do not forget to init dist in the function or script run by ``cmd``:
    ```python
    from mmcv.runner import init_dist
    init_dist(launcher='pytorch')
    ```
    """
    dist_world_size = nproc_per_node * nnodes
    cur_env = os.environ.copy()
    cur_env['WORLD_SIZE'] = str(dist_world_size)
    cur_env['MASTER_ADDR'] = addr

    if is_port_used(port):
        port = str(get_random_port())
        logging.warning('Given port is used, change to port %s' % port)

    cur_env['MASTER_PORT'] = port
    processes = []

    for local_rank in range(0, nproc_per_node):
        # rank of each process
        dist_rank = nproc_per_node * node_rank + local_rank
        cur_env['RANK'] = str(dist_rank)
        cur_env['LOCAL_RANK'] = str(local_rank)
        if python_path:
            # use .get to avoid a KeyError when PYTHONPATH is not already set
            cur_env['PYTHONPATH'] = ':'.join(
                (cur_env.get('PYTHONPATH', ''), python_path)).lstrip(':')

        process = subprocess.Popen(cmd, env=cur_env, shell=True)
        processes.append(process)

    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(
                returncode=process.returncode, cmd=cmd)
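

# Example usage of dist_exec_wrapper (illustrative sketch; the module path is
# hypothetical, and the script it launches must call init_dist itself):
#
#     dist_exec_wrapper(
#         'python -m my_pkg.my_dist_entry', nproc_per_node=2, port='29527')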


def is_port_used(port, host='127.0.0.1'):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((host, int(port)))
        return True
    except Exception:
        return False
    finally:
        s.close()


def get_random_port():
    while True:
        port = np.random.randint(low=5000, high=10000)
        if is_port_used(port):
            continue
        else:
            break

    logging.info('Random port: %s' % port)

    return port


def pseudo_dist_init():
    os.environ['RANK'] = '0'
    os.environ['WORLD_SIZE'] = '1'
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = str(get_random_port())
    torch.cuda.set_device(0)
    from torch import distributed as dist
    dist.init_process_group(backend='nccl')
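

# Example usage of pseudo_dist_init (illustrative sketch): call it at the start
# of a single-GPU test that exercises code requiring an initialized
# torch.distributed process group.
#
#     pseudo_dist_init()
#     # ... run code that calls dist.get_rank() / dist.all_reduce() ...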


def computeStats(backend, timings, batch_size=1, model_name='default'):
    """
    Compute statistical metrics of time and speed.
    """
    times = np.array(timings)
    steps = len(times)
    speeds = batch_size / times
    time_mean = np.mean(times)
    time_med = np.median(times)
    time_99th = np.percentile(times, 99)
    time_std = np.std(times, ddof=0)
    speed_mean = np.mean(speeds)
    speed_med = np.median(speeds)

    msg = ('\n%s =================================\n'
           'batch size=%d, num iterations=%d\n'
           ' Median FPS: %.1f, mean: %.1f\n'
           ' Median latency: %.6f, mean: %.6f, 99th_p: %.6f, std_dev: %.6f\n'
           ) % (
               backend,
               batch_size,
               steps,
               speed_med,
               speed_mean,
               time_med,
               time_mean,
               time_99th,
               time_std,
           )

    meas = {
        'Name': model_name,
        'Backend': backend,
        'Median(FPS)': speed_med,
        'Mean(FPS)': speed_mean,
        'Median(ms)': time_med,
        'Mean(ms)': time_mean,
        '99th_p': time_99th,
        'std_dev': time_std,
    }

    return meas


@torch.no_grad()
def benchmark(predictor,
              input_data_list,
              backend='BACKEND',
              batch_size=1,
              model_name='default',
              num=200):
    """
    Evaluate the time and speed of different models.
    """
    timings = []
    for i in range(num):
        start_time = timeit.default_timer()
        output = predictor.predict(input_data_list)[0]
        torch.cuda.synchronize()
        end_time = timeit.default_timer()
        meas_time = end_time - start_time
        timings.append(meas_time)

    return computeStats(backend, timings, batch_size, model_name)
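

# Example usage of benchmark (illustrative sketch; ``my_predictor`` stands in
# for any object with a ``predict`` method, and the input path is hypothetical):
#
#     stats = benchmark(
#         my_predictor, ['data/demo.jpg'], backend='pytorch',
#         batch_size=1, model_name='resnet50', num=100)
#     print(stats['Median(FPS)'], stats['Median(ms)'])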


_DIST_SCRIPT_TEMPLATE = """
import ast
import argparse
import pickle
import torch
from torch import distributed as dist
from easycv.utils.dist_utils import is_master
import {}

parser = argparse.ArgumentParser()
parser.add_argument('--save_all_ranks', type=ast.literal_eval, help='save all ranks results')
parser.add_argument('--save_file', type=str, help='save file')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()


def main():
    results = {}.{}({})  # module.func(params)

    if args.save_all_ranks:
        save_file = args.save_file + str(dist.get_rank())
        with open(save_file, 'wb') as f:
            pickle.dump(results, f)
    else:
        if is_master():
            with open(args.save_file, 'wb') as f:
                pickle.dump(results, f)


if __name__ == '__main__':
    main()
"""


class DistributedTestCase(unittest.TestCase):
    """Distributed TestCase for running test functions in distributed mode.

    Examples:
        import torch
        from mmcv.runner import init_dist
        from torch import distributed as dist

        def _test_func(*args, **kwargs):
            init_dist(launcher='pytorch')
            rank = dist.get_rank()
            if rank == 0:
                value = torch.tensor(1.0).cuda()
            else:
                value = torch.tensor(2.0).cuda()
            dist.all_reduce(value)
            return value.cpu().numpy()

        class DistTest(DistributedTestCase):
            def test_function_dist(self):
                args = ()  # args should be python builtin type
                kwargs = {}  # kwargs should be python builtin type
                self.start_with_torch(
                    _test_func,
                    num_gpus=2,
                    assert_callback=lambda x: self.assertEqual(x, 3.0),
                    *args,
                    **kwargs,
                )
    """

    def _start(self,
               dist_start_cmd,
               func,
               num_gpus,
               assert_callback=None,
               save_all_ranks=False,
               *args,
               **kwargs):
        script_path = func.__code__.co_filename
        script_dir, script_name = os.path.split(script_path)
        script_name = os.path.splitext(script_name)[0]
        func_name = func.__qualname__

        func_params = []
        for arg in args:
            if isinstance(arg, str):
                arg = ('\'{}\''.format(arg))
            func_params.append(str(arg))

        for k, v in kwargs.items():
            if isinstance(v, str):
                v = ('\'{}\''.format(v))
            func_params.append('{}={}'.format(k, v))

        func_params = ','.join(func_params).strip(',')

        tmp_run_file = tempfile.NamedTemporaryFile(suffix='.py').name
        tmp_res_file = tempfile.NamedTemporaryFile(suffix='.pkl').name

        with open(tmp_run_file, 'w') as f:
            print('save temporary run file to : {}'.format(tmp_run_file))
            print('save results to : {}'.format(tmp_res_file))
            run_file_content = _DIST_SCRIPT_TEMPLATE.format(
                script_name, script_name, func_name, func_params)
            f.write(run_file_content)

        tmp_res_files = []
        if save_all_ranks:
            for i in range(num_gpus):
                tmp_res_files.append(tmp_res_file + str(i))
        else:
            tmp_res_files = [tmp_res_file]
        self.addCleanup(self.clean_tmp, [tmp_run_file] + tmp_res_files)

        tmp_env = copy.deepcopy(os.environ)
        tmp_env['PYTHONPATH'] = ':'.join(
            (tmp_env.get('PYTHONPATH', ''), script_dir)).lstrip(':')
        script_params = '--save_all_ranks=%s --save_file=%s' % (save_all_ranks,
                                                                tmp_res_file)
        script_cmd = '%s %s %s' % (dist_start_cmd, tmp_run_file, script_params)
        print('script command: %s' % script_cmd)
        res = subprocess.call(script_cmd, shell=True, env=tmp_env)

        script_res = []
        for res_file in tmp_res_files:
            with open(res_file, 'rb') as f:
                script_res.append(pickle.load(f))
        if not save_all_ranks:
            script_res = script_res[0]

        if assert_callback:
            assert_callback(script_res)

        self.assertEqual(
            res,
            0,
            msg='The test function ``{}`` in ``{}`` failed to run!'.format(
                func_name, script_name))

        return script_res

    def start_with_torch(self,
                         func,
                         num_gpus,
                         assert_callback=None,
                         save_all_ranks=False,
                         *args,
                         **kwargs):
        ip = socket.gethostbyname(socket.gethostname())
        dist_start_cmd = '%s -m torch.distributed.launch --nproc_per_node=%d --master_addr=\'%s\' --master_port=%s' % (
            sys.executable, num_gpus, ip, get_random_port())

        return self._start(
            dist_start_cmd=dist_start_cmd,
            func=func,
            num_gpus=num_gpus,
            assert_callback=assert_callback,
            save_all_ranks=save_all_ranks,
            *args,
            **kwargs)

    def start_with_torchacc(self,
                            func,
                            num_gpus,
                            assert_callback=None,
                            save_all_ranks=False,
                            *args,
                            **kwargs):
        ip = socket.gethostbyname(socket.gethostname())
        dist_start_cmd = 'xlarun --nproc_per_node=%d --master_addr=\'%s\' --master_port=%s' % (
            num_gpus, ip, get_random_port())

        return self._start(
            dist_start_cmd=dist_start_cmd,
            func=func,
            num_gpus=num_gpus,
            assert_callback=assert_callback,
            save_all_ranks=save_all_ranks,
            *args,
            **kwargs)

    def clean_tmp(self, tmp_file_list):
        for file in tmp_file_list:
            if io.exists(file):
                if io.isdir(file):
                    io.rmtree(file)
                else:
                    io.remove(file)