mirror of https://github.com/JDAI-CV/fast-reid.git
support multi-node training
parent 68c190b53c
commit f57c5764e3
@@ -32,6 +32,26 @@ If you want to train a model with 4 GPUs, you can run:
 python3 tools/train_net.py --config-file ./configs/Market1501/bagtricks_R50.yml --num-gpus 4
 ```
 
+If you want to train a model with multiple machines, you can run:
+
+```
+# machine 1
+export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=eth0
+
+python3 tools/train_net.py --config-file configs/Market1501/bagtricks_R50.yml \
+--num-gpus 4 --num-machines 2 --machine-rank 0 --dist-url tcp://ip:port
+
+# machine 2
+export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=eth0
+
+python3 tools/train_net.py --config-file configs/Market1501/bagtricks_R50.yml \
+--num-gpus 4 --num-machines 2 --machine-rank 1 --dist-url tcp://ip:port
+```
+
+Make sure the dataset path and code are the same on all machines, and that the machines can communicate with each other.
+
 To evaluate a model's performance, use
 
 ```bash
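For orientation, the flags above map onto PyTorch's distributed setup in roughly the following way. This is a standalone sketch, not fastreid's actual launcher; the worker function name and argument plumbing are assumptions.

```python
# Standalone sketch of a multi-machine launch with PyTorch distributed
# primitives. This is NOT fastreid's launcher; `train_worker` and the
# argument plumbing are illustrative assumptions.
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def train_worker(local_rank, num_gpus_per_machine, num_machines, machine_rank, dist_url):
    # The global rank is derived from the machine rank and the local GPU index.
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    world_size = num_machines * num_gpus_per_machine
    dist.init_process_group(
        backend="nccl",        # NCCL for GPU collectives; GLOO covers CPU tensors
        init_method=dist_url,  # e.g. "tcp://ip:port", identical on every machine
        world_size=world_size,
        rank=global_rank,
    )
    torch.cuda.set_device(local_rank)
    # ... build the model and data loaders, then run the training loop ...


if __name__ == "__main__":
    # Machine 1 from the docs above would pass machine_rank=0,
    # machine 2 would pass machine_rank=1, with the same dist_url.
    mp.spawn(train_worker, args=(4, 2, 0, "tcp://ip:port"), nprocs=4)
```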
@@ -467,15 +467,18 @@ class DefaultTrainer(TrainerBase):
             results_i = inference_on_dataset(model, data_loader, evaluator, flip_test=cfg.TEST.FLIP_ENABLED)
             results[dataset_name] = results_i
 
-        if comm.is_main_process():
-            assert isinstance(
-                results, dict
-            ), "Evaluator must return a dict on the main process. Got {} instead.".format(
-                results
-            )
-            print_csv_format(results)
+            if comm.is_main_process():
+                assert isinstance(
+                    results, dict
+                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+                    results
+                )
+                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+                results_i['dataset'] = dataset_name
+                print_csv_format(results_i)
 
-        if len(results) == 1: results = list(results.values())[0]
+        if len(results) == 1:
+            results = list(results.values())[0]
 
         return results
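The reshuffled printing in `test()` relies on each per-dataset result dict being tagged with its dataset name. A minimal sketch of that data flow follows; the metric names are placeholders, not the evaluator's exact keys.

```python
from collections import OrderedDict

# Hypothetical per-dataset result as returned by inference_on_dataset();
# the metric names are placeholders rather than fastreid's exact keys.
results_i = OrderedDict([("Rank-1", 94.2), ("mAP", 85.7)])
dataset_name = "Market1501"

results = OrderedDict()
results[dataset_name] = results_i

# The patched test() tags each per-dataset dict with its dataset name so that
# print_csv_format() can pop it back out and use it as the first column.
results_i["dataset"] = dataset_name

# With a single test set, the outer dict is unwrapped so callers receive the
# metrics dict directly instead of {dataset_name: metrics}.
if len(results) == 1:
    results = list(results.values())[0]
```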
@@ -360,19 +360,20 @@ class EvalHook(HookBase):
             )
             self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
 
         # Remove extra memory cache of main process due to evaluation
         torch.cuda.empty_cache()
 
-    def after_epoch(self):
-        next_epoch = self.trainer.epoch + 1
-        is_final = next_epoch == self.trainer.max_epoch
-        if is_final or (self._period > 0 and next_epoch % self._period == 0):
-            self._do_eval()
-        # Evaluation may take different time among workers.
-        # A barrier make them start the next iteration together.
-        comm.synchronize()
+    def after_epoch(self):
+        next_epoch = self.trainer.epoch + 1
+        if self._period > 0 and next_epoch % self._period == 0:
+            self._do_eval()
 
     def after_train(self):
+        next_epoch = self.trainer.epoch + 1
+        # This condition is to prevent the eval from running after a failed training
+        if next_epoch % self._period != 0 and next_epoch >= self.trainer.max_epoch:
+            self._do_eval()
         # func is likely a closure that holds reference to the trainer
         # therefore we clean it to avoid circular reference in the end
         del self._func
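The split between `after_epoch` and `after_train` assumes the trainer calls hooks in this order, so periodic evaluations come from `after_epoch` and the final epoch is evaluated exactly once. A minimal sketch of such a loop follows; the real TrainerBase wiring differs.

```python
# Sketch of the hook ordering assumed by the new EvalHook: periodic
# evaluations fire from after_epoch, and after_train covers the final epoch
# only if it was not already evaluated, so the last epoch never runs twice.
# The loop below is illustrative, not fastreid's TrainerBase.
class TinyTrainer:
    def __init__(self, max_epoch, hooks):
        self.max_epoch = max_epoch
        self.epoch = 0
        self.hooks = hooks

    def train(self):
        for self.epoch in range(self.max_epoch):
            # ... one epoch of training ...
            for h in self.hooks:
                h.after_epoch()   # EvalHook: eval only every self._period epochs
        for h in self.hooks:
            h.after_train()       # EvalHook: eval the final epoch if it was skipped above
```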
@@ -6,6 +6,7 @@ from contextlib import contextmanager
 
 import torch
 
+from fastreid.utils import comm
 from fastreid.utils.logger import log_every_n_seconds
 
 
@@ -96,6 +97,7 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
     Returns:
         The return value of `evaluator.evaluate()`
     """
+    num_devices = comm.get_world_size()
     logger = logging.getLogger(__name__)
     logger.info("Start inference on {} images".format(len(data_loader.dataset)))
 
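`comm.get_world_size()` counts all participating processes. Assuming one worker process per GPU, as in the launch example above, the value used in the log lines below corresponds to:

```python
# Assuming one worker process per GPU, as in the multi-machine example above,
# the world size reported by comm.get_world_size() would be:
num_machines = 2
num_gpus_per_machine = 4
num_devices = num_machines * num_gpus_per_machine  # 8 processes share the test set
```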
@@ -118,10 +120,11 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
                inputs["images"] = inputs["images"].flip(dims=[3])
                flip_outputs = model(inputs)
                outputs = (outputs + flip_outputs) / 2
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            idx += 1
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_batch = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_batch > 30:
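In isolation, the flip_test branch is plain horizontal-flip test-time augmentation. Here is a self-contained sketch with a dummy model standing in for the reid network; it only mirrors the tensor shapes.

```python
import torch


# Dummy stand-in for the reid model: it only mirrors the input/output shapes;
# the real meta-architecture returns appearance embeddings.
def dummy_model(inputs):
    return inputs["images"].flatten(start_dim=1)


inputs = {"images": torch.randn(2, 3, 256, 128)}  # NCHW batch
outputs = dummy_model(inputs)

# Horizontal flip TTA: flip along the width dimension (dim 3) and average
# the embeddings of the original and flipped images.
inputs["images"] = inputs["images"].flip(dims=[3])
flip_outputs = dummy_model(inputs)
outputs = (outputs + flip_outputs) / 2
```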
@@ -140,17 +143,18 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
     total_time_str = str(datetime.timedelta(seconds=total_time))
     # NOTE this format is parsed by grep
     logger.info(
-        "Total inference time: {} ({:.6f} s / batch per device)".format(
-            total_time_str, total_time / (total - num_warmup)
+        "Total inference time: {} ({:.6f} s / batch per device, on {} devices)".format(
+            total_time_str, total_time / (total - num_warmup), num_devices
         )
     )
     total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
     logger.info(
-        "Total inference pure compute time: {} ({:.6f} s / batch per device)".format(
-            total_compute_time_str, total_compute_time / (total - num_warmup)
+        "Total inference pure compute time: {} ({:.6f} s / batch per device, on {} devices)".format(
+            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
         )
     )
     results = evaluator.evaluate()
 
     # An evaluator may return None when not in main process.
     # Replace it by an empty dict instead to make it easier for downstream code to handle
     if results is None:
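As a sanity check on what the two log lines divide by, here is a small numeric example with made-up timings:

```python
import datetime

# Made-up timings, only to show what the log lines above divide by.
total = 500           # batches seen by this worker
num_warmup = 5        # warm-up batches excluded from the average
num_devices = 8       # comm.get_world_size()
total_time = 120.0    # wall-clock seconds after warm-up

total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print(
    "Total inference time: {} ({:.6f} s / batch per device, on {} devices)".format(
        total_time_str, total_time / (total - num_warmup), num_devices
    )
)
# Total inference time: 0:02:00 (0.242424 s / batch per device, on 8 devices)
```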
@@ -8,23 +8,21 @@ import numpy as np
 from tabulate import tabulate
 from termcolor import colored
 
+logger = logging.getLogger(__name__)
+
 
 def print_csv_format(results):
     """
-    Print main metrics in a format similar to Detectron,
+    Print main metrics in a format similar to Detectron2,
     so that they are easy to copypaste into a spreadsheet.
     Args:
-        results (OrderedDict[dict]): task_name -> {metric -> score}
+        results (OrderedDict): {metric -> score}
     """
-    assert isinstance(results, OrderedDict), results  # unordered results cannot be properly printed
-    task = list(results.keys())[0]
-    metrics = ["Datasets"] + [k for k in results[task]]
-    logger = logging.getLogger(__name__)
+    # unordered results cannot be properly printed
+    assert isinstance(results, OrderedDict) or not len(results), results
 
-    csv_results = []
-    for task, res in results.items():
-        csv_results.append((task, *list(res.values())))
+    dataset_name = results.pop('dataset')
+    metrics = ["Dataset"] + [k for k in results]
+    csv_results = [(dataset_name, *list(results.values()))]
 
     # tabulate it
     table = tabulate(
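A usage sketch of the reworked `print_csv_format`, with placeholder metric values; the tabulate options here are illustrative, not necessarily the ones used further down in this file.

```python
from collections import OrderedDict

from tabulate import tabulate

# Placeholder results in the shape DefaultTrainer.test() now passes in:
# one flat OrderedDict per dataset, tagged with a 'dataset' entry.
results = OrderedDict([("Rank-1", 94.2), ("mAP", 85.7), ("dataset", "Market1501")])

dataset_name = results.pop("dataset")
metrics = ["Dataset"] + [k for k in results]
csv_results = [(dataset_name, *list(results.values()))]

# Tabulate options are illustrative; they render one row per dataset with the
# dataset name as the first column and the metrics as the remaining columns.
table = tabulate(csv_results, headers=metrics, tablefmt="pipe", floatfmt=".2f")
print(table)  # -> a pipe-format table with columns Dataset | Rank-1 | mAP
```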