[Bug fix] Fix efficient test for multi-node (#707)

* [Bug fix] Fix efficient test for multi-node

* fixed CI

* add efficient test dir

* remove unused args
parent 55085a85c3
commit 5184c6a8db
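In short: np2tmp gains a tmpdir argument, and both single_gpu_test and multi_gpu_test now create a shared '.efficient_test' directory up front, so the temporary .npy result files land in a predictable path (typically the working directory on a shared filesystem) rather than in each node's local default temp dir. A minimal, self-contained sketch of the patched helper; the demo lines at the end are illustrative and not part of the commit:

import os
import tempfile

import numpy as np


def np2tmp(array, temp_file_name=None, tmpdir=None):
    """Save ndarray to a local .npy file and return the file name."""
    if temp_file_name is None:
        # dir=tmpdir pins the file to a known directory; the old default
        # (no dir argument) used the platform temp dir, which is node-local
        # and invisible to ranks running on other machines.
        temp_file_name = tempfile.NamedTemporaryFile(
            suffix='.npy', delete=False, dir=tmpdir).name
    np.save(temp_file_name, array)
    return temp_file_name


# Illustrative usage mirroring the patched test loops (the real code calls
# mmcv.mkdir_or_exist('.efficient_test') before the loop):
os.makedirs('.efficient_test', exist_ok=True)
print(np2tmp(np.zeros((2, 2)), tmpdir='.efficient_test'))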
@@ -1,17 +1,15 @@
 import os.path as osp
-import pickle
-import shutil
 import tempfile
 
 import mmcv
 import numpy as np
 import torch
-import torch.distributed as dist
+from mmcv.engine import collect_results_cpu, collect_results_gpu
 from mmcv.image import tensor2imgs
 from mmcv.runner import get_dist_info
 
 
-def np2tmp(array, temp_file_name=None):
+def np2tmp(array, temp_file_name=None, tmpdir=None):
     """Save ndarray to local numpy file.
 
     Args:
@@ -19,6 +17,7 @@ def np2tmp(array, temp_file_name=None):
         temp_file_name (str): Numpy file name. If 'temp_file_name=None', this
             function will generate a file name with tempfile.NamedTemporaryFile
             to save ndarray. Default: None.
+        tmpdir (str): Temporary directory to save Ndarray files. Default: None.
 
     Returns:
         str: The numpy file name.
@@ -26,7 +25,7 @@ def np2tmp(array, temp_file_name=None):
 
     if temp_file_name is None:
         temp_file_name = tempfile.NamedTemporaryFile(
-            suffix='.npy', delete=False).name
+            suffix='.npy', delete=False, dir=tmpdir).name
     np.save(temp_file_name, array)
     return temp_file_name
 
@@ -58,6 +57,8 @@ def single_gpu_test(model,
     results = []
     dataset = data_loader.dataset
     prog_bar = mmcv.ProgressBar(len(dataset))
+    if efficient_test:
+        mmcv.mkdir_or_exist('.efficient_test')
     for i, data in enumerate(data_loader):
         with torch.no_grad():
             result = model(return_loss=False, **data)
@@ -90,11 +91,11 @@
 
         if isinstance(result, list):
             if efficient_test:
-                result = [np2tmp(_) for _ in result]
+                result = [np2tmp(_, tmpdir='.efficient_test') for _ in result]
             results.extend(result)
         else:
             if efficient_test:
-                result = np2tmp(result)
+                result = np2tmp(result, tmpdir='.efficient_test')
             results.append(result)
 
         batch_size = len(result)
@@ -120,7 +121,8 @@ def multi_gpu_test(model,
         model (nn.Module): Model to be tested.
         data_loader (utils.data.Dataloader): Pytorch data loader.
         tmpdir (str): Path of directory to save the temporary results from
-            different gpus under cpu mode.
+            different gpus under cpu mode. The same path is used for efficient
+            test.
         gpu_collect (bool): Option to use either gpu or cpu to collect results.
         efficient_test (bool): Whether save the results as local numpy files to
             save CPU memory during evaluation. Default: False.
@@ -135,17 +137,19 @@ def multi_gpu_test(model,
     rank, world_size = get_dist_info()
     if rank == 0:
         prog_bar = mmcv.ProgressBar(len(dataset))
+    if efficient_test:
+        mmcv.mkdir_or_exist('.efficient_test')
     for i, data in enumerate(data_loader):
         with torch.no_grad():
             result = model(return_loss=False, rescale=True, **data)
 
         if isinstance(result, list):
             if efficient_test:
-                result = [np2tmp(_) for _ in result]
+                result = [np2tmp(_, tmpdir='.efficient_test') for _ in result]
             results.extend(result)
         else:
             if efficient_test:
-                result = np2tmp(result)
+                result = np2tmp(result, tmpdir='.efficient_test')
             results.append(result)
 
         if rank == 0:
@@ -159,80 +163,3 @@ def multi_gpu_test(model,
     else:
         results = collect_results_cpu(results, len(dataset), tmpdir)
     return results
-
-
-def collect_results_cpu(result_part, size, tmpdir=None):
-    """Collect results with CPU."""
-    rank, world_size = get_dist_info()
-    # create a tmp dir if it is not specified
-    if tmpdir is None:
-        MAX_LEN = 512
-        # 32 is whitespace
-        dir_tensor = torch.full((MAX_LEN, ),
-                                32,
-                                dtype=torch.uint8,
-                                device='cuda')
-        if rank == 0:
-            tmpdir = tempfile.mkdtemp()
-            tmpdir = torch.tensor(
-                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
-            dir_tensor[:len(tmpdir)] = tmpdir
-        dist.broadcast(dir_tensor, 0)
-        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
-    else:
-        mmcv.mkdir_or_exist(tmpdir)
-    # dump the part result to the dir
-    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
-    dist.barrier()
-    # collect all parts
-    if rank != 0:
-        return None
-    else:
-        # load results of all parts from tmp dir
-        part_list = []
-        for i in range(world_size):
-            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
-            part_list.append(mmcv.load(part_file))
-        # sort the results
-        ordered_results = []
-        for res in zip(*part_list):
-            ordered_results.extend(list(res))
-        # the dataloader may pad some samples
-        ordered_results = ordered_results[:size]
-        # remove tmp dir
-        shutil.rmtree(tmpdir)
-        return ordered_results
-
-
-def collect_results_gpu(result_part, size):
-    """Collect results with GPU."""
-    rank, world_size = get_dist_info()
-    # dump result part to tensor with pickle
-    part_tensor = torch.tensor(
-        bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
-    # gather all result part tensor shape
-    shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
-    shape_list = [shape_tensor.clone() for _ in range(world_size)]
-    dist.all_gather(shape_list, shape_tensor)
-    # padding result part tensor to max length
-    shape_max = torch.tensor(shape_list).max()
-    part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
-    part_send[:shape_tensor[0]] = part_tensor
-    part_recv_list = [
-        part_tensor.new_zeros(shape_max) for _ in range(world_size)
-    ]
-    # gather all result part
-    dist.all_gather(part_recv_list, part_send)
-
-    if rank == 0:
-        part_list = []
-        for recv, shape in zip(part_recv_list, shape_list):
-            part_list.append(
-                pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()))
-        # sort the results
-        ordered_results = []
-        for res in zip(*part_list):
-            ordered_results.extend(list(res))
-        # the dataloader may pad some samples
-        ordered_results = ordered_results[:size]
-        return ordered_results
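For context on the collection path this commit switches to: each rank now returns small .npy file names instead of full arrays, collect_results_cpu/collect_results_gpu imported from mmcv.engine gather those strings across ranks (replacing the duplicated local copies deleted above), and evaluation reloads the arrays on demand. A hedged, single-process sketch of that reload step; load_result is an illustrative helper and an assumption, not mmseg API:

import os
import tempfile

import numpy as np


def load_result(res):
    # Hypothetical helper: with efficient_test=True each entry is a file
    # name; otherwise it is already an ndarray.
    return np.load(res) if isinstance(res, str) else res


os.makedirs('.efficient_test', exist_ok=True)
# Simulate what one rank's loop produces under efficient_test=True.
results = []
for _ in range(3):
    name = tempfile.NamedTemporaryFile(
        suffix='.npy', delete=False, dir='.efficient_test').name
    np.save(name, np.random.randint(0, 19, size=(4, 4)))
    results.append(name)

print([load_result(r).shape for r in results])  # [(4, 4), (4, 4), (4, 4)]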