mirror of https://github.com/JDAI-CV/fast-reid.git
support multi-node training
parent 68c190b53c
commit f57c5764e3
@@ -32,6 +32,26 @@ If you want to train a model with 4 GPUs, you can run:
 python3 tools/train_net.py --config-file ./configs/Market1501/bagtricks_R50.yml --num-gpus 4
 ```
 
+If you want to train a model with multiple machines, you can run:
+
+```
+# machine 1
+export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=eth0
+
+python3 tools/train_net.py --config-file configs/Market1501/bagtricks_R50.yml \
+--num-gpus 4 --num-machines 2 --machine-rank 0 --dist-url tcp://ip:port
+
+# machine 2
+export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=eth0
+
+python3 tools/train_net.py --config-file configs/Market1501/bagtricks_R50.yml \
+--num-gpus 4 --num-machines 2 --machine-rank 1 --dist-url tcp://ip:port
+```
+
+Make sure the dataset path and code are the same on all machines, and that the machines can communicate with each other.
+
 To evaluate a model's performance, use
 
 ```bash
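For orientation, the flags above map onto PyTorch's distributed setup in roughly the following way. This is a standalone sketch, not fastreid's actual launcher; the worker function name and argument plumbing are assumptions.

```python
# Standalone sketch of a multi-machine launch with PyTorch distributed
# primitives. This is NOT fastreid's launcher; `train_worker` and the
# argument plumbing are illustrative assumptions.
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def train_worker(local_rank, num_gpus_per_machine, num_machines, machine_rank, dist_url):
    # The global rank is derived from the machine rank and the local GPU index.
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    world_size = num_machines * num_gpus_per_machine
    dist.init_process_group(
        backend="nccl",        # NCCL for GPU collectives; GLOO covers CPU tensors
        init_method=dist_url,  # e.g. "tcp://ip:port", identical on every machine
        world_size=world_size,
        rank=global_rank,
    )
    torch.cuda.set_device(local_rank)
    # ... build the model and data loaders, then run the training loop ...


if __name__ == "__main__":
    # Machine 1 from the docs above would pass machine_rank=0,
    # machine 2 would pass machine_rank=1, with the same dist_url.
    mp.spawn(train_worker, args=(4, 2, 0, "tcp://ip:port"), nprocs=4)
```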
@@ -467,15 +467,18 @@ class DefaultTrainer(TrainerBase):
             results_i = inference_on_dataset(model, data_loader, evaluator, flip_test=cfg.TEST.FLIP_ENABLED)
             results[dataset_name] = results_i
 
-        if comm.is_main_process():
-            assert isinstance(
-                results, dict
-            ), "Evaluator must return a dict on the main process. Got {} instead.".format(
-                results
-            )
-            print_csv_format(results)
+            if comm.is_main_process():
+                assert isinstance(
+                    results, dict
+                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+                    results
+                )
+                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+                results_i['dataset'] = dataset_name
+                print_csv_format(results_i)
 
-        if len(results) == 1: results = list(results.values())[0]
+        if len(results) == 1:
+            results = list(results.values())[0]
 
         return results
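The reshuffled printing in `test()` relies on each per-dataset result dict being tagged with its dataset name. A minimal sketch of that data flow follows; the metric names are placeholders, not the evaluator's exact keys.

```python
from collections import OrderedDict

# Hypothetical per-dataset result as returned by inference_on_dataset();
# the metric names are placeholders rather than fastreid's exact keys.
results_i = OrderedDict([("Rank-1", 94.2), ("mAP", 85.7)])
dataset_name = "Market1501"

results = OrderedDict()
results[dataset_name] = results_i

# The patched test() tags each per-dataset dict with its dataset name so that
# print_csv_format() can pop it back out and use it as the first column.
results_i["dataset"] = dataset_name

# With a single test set, the outer dict is unwrapped so callers receive the
# metrics dict directly instead of {dataset_name: metrics}.
if len(results) == 1:
    results = list(results.values())[0]
```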
@@ -360,19 +360,20 @@ class EvalHook(HookBase):
             )
             self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
 
         # Remove extra memory cache of main process due to evaluation
         torch.cuda.empty_cache()
 
-    def after_epoch(self):
-        next_epoch = self.trainer.epoch + 1
-        is_final = next_epoch == self.trainer.max_epoch
-        if is_final or (self._period > 0 and next_epoch % self._period == 0):
-            self._do_eval()
-        # Evaluation may take different time among workers.
-        # A barrier make them start the next iteration together.
-        comm.synchronize()
+    def after_epoch(self):
+        next_epoch = self.trainer.epoch + 1
+        if self._period > 0 and next_epoch % self._period == 0:
+            self._do_eval()
 
     def after_train(self):
+        next_epoch = self.trainer.epoch + 1
+        # This condition is to prevent the eval from running after a failed training
+        if next_epoch % self._period != 0 and next_epoch >= self.trainer.max_epoch:
+            self._do_eval()
         # func is likely a closure that holds reference to the trainer
         # therefore we clean it to avoid circular reference in the end
         del self._func
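The split between `after_epoch` and `after_train` assumes the trainer calls hooks in this order, so periodic evaluations come from `after_epoch` and the final epoch is evaluated exactly once. A minimal sketch of such a loop follows; the real TrainerBase wiring differs.

```python
# Sketch of the hook ordering assumed by the new EvalHook: periodic
# evaluations fire from after_epoch, and after_train covers the final epoch
# only if it was not already evaluated, so the last epoch never runs twice.
# The loop below is illustrative, not fastreid's TrainerBase.
class TinyTrainer:
    def __init__(self, max_epoch, hooks):
        self.max_epoch = max_epoch
        self.epoch = 0
        self.hooks = hooks

    def train(self):
        for self.epoch in range(self.max_epoch):
            # ... one epoch of training ...
            for h in self.hooks:
                h.after_epoch()   # EvalHook: eval only every self._period epochs
        for h in self.hooks:
            h.after_train()       # EvalHook: eval the final epoch if it was skipped above
```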
@@ -6,6 +6,7 @@ from contextlib import contextmanager
 
 import torch
 
+from fastreid.utils import comm
 from fastreid.utils.logger import log_every_n_seconds
 
 
@@ -96,6 +97,7 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
     Returns:
         The return value of `evaluator.evaluate()`
     """
+    num_devices = comm.get_world_size()
     logger = logging.getLogger(__name__)
     logger.info("Start inference on {} images".format(len(data_loader.dataset)))
 
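`comm.get_world_size()` counts all participating processes. Assuming one worker process per GPU, as in the launch example above, the value used in the log lines below corresponds to:

```python
# Assuming one worker process per GPU, as in the multi-machine example above,
# the world size reported by comm.get_world_size() would be:
num_machines = 2
num_gpus_per_machine = 4
num_devices = num_machines * num_gpus_per_machine  # 8 processes share the test set
```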
@@ -118,10 +120,11 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
                inputs["images"] = inputs["images"].flip(dims=[3])
                flip_outputs = model(inputs)
                outputs = (outputs + flip_outputs) / 2
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            idx += 1
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_batch = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_batch > 30:
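In isolation, the flip_test branch is plain horizontal-flip test-time augmentation. Here is a self-contained sketch with a dummy model standing in for the reid network; it only mirrors the tensor shapes.

```python
import torch


# Dummy stand-in for the reid model: it only mirrors the input/output shapes;
# the real meta-architecture returns appearance embeddings.
def dummy_model(inputs):
    return inputs["images"].flatten(start_dim=1)


inputs = {"images": torch.randn(2, 3, 256, 128)}  # NCHW batch
outputs = dummy_model(inputs)

# Horizontal flip TTA: flip along the width dimension (dim 3) and average
# the embeddings of the original and flipped images.
inputs["images"] = inputs["images"].flip(dims=[3])
flip_outputs = dummy_model(inputs)
outputs = (outputs + flip_outputs) / 2
```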
@@ -140,17 +143,18 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
     total_time_str = str(datetime.timedelta(seconds=total_time))
     # NOTE this format is parsed by grep
     logger.info(
-        "Total inference time: {} ({:.6f} s / batch per device)".format(
-            total_time_str, total_time / (total - num_warmup)
+        "Total inference time: {} ({:.6f} s / batch per device, on {} devices)".format(
+            total_time_str, total_time / (total - num_warmup), num_devices
         )
     )
     total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
     logger.info(
-        "Total inference pure compute time: {} ({:.6f} s / batch per device)".format(
-            total_compute_time_str, total_compute_time / (total - num_warmup)
+        "Total inference pure compute time: {} ({:.6f} s / batch per device, on {} devices)".format(
+            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
         )
     )
     results = evaluator.evaluate()
 
     # An evaluator may return None when not in main process.
     # Replace it by an empty dict instead to make it easier for downstream code to handle
     if results is None:
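As a sanity check on what the two log lines divide by, here is a small numeric example with made-up timings:

```python
import datetime

# Made-up timings, only to show what the log lines above divide by.
total = 500           # batches seen by this worker
num_warmup = 5        # warm-up batches excluded from the average
num_devices = 8       # comm.get_world_size()
total_time = 120.0    # wall-clock seconds after warm-up

total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print(
    "Total inference time: {} ({:.6f} s / batch per device, on {} devices)".format(
        total_time_str, total_time / (total - num_warmup), num_devices
    )
)
# Total inference time: 0:02:00 (0.242424 s / batch per device, on 8 devices)
```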
@@ -8,23 +8,21 @@ import numpy as np
 from tabulate import tabulate
 from termcolor import colored
 
+logger = logging.getLogger(__name__)
+
 
 def print_csv_format(results):
     """
-    Print main metrics in a format similar to Detectron,
+    Print main metrics in a format similar to Detectron2,
     so that they are easy to copypaste into a spreadsheet.
     Args:
-        results (OrderedDict[dict]): task_name -> {metric -> score}
+        results (OrderedDict): {metric -> score}
     """
-    assert isinstance(results, OrderedDict), results  # unordered results cannot be properly printed
-    task = list(results.keys())[0]
-    metrics = ["Datasets"] + [k for k in results[task]]
-    logger = logging.getLogger(__name__)
+    # unordered results cannot be properly printed
+    assert isinstance(results, OrderedDict) or not len(results), results
 
-    csv_results = []
-    for task, res in results.items():
-        csv_results.append((task, *list(res.values())))
+    dataset_name = results.pop('dataset')
+    metrics = ["Dataset"] + [k for k in results]
+    csv_results = [(dataset_name, *list(results.values()))]
 
     # tabulate it
     table = tabulate(
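A usage sketch of the reworked `print_csv_format`, with placeholder metric values; the tabulate options here are illustrative, not necessarily the ones used further down in this file.

```python
from collections import OrderedDict

from tabulate import tabulate

# Placeholder results in the shape DefaultTrainer.test() now passes in:
# one flat OrderedDict per dataset, tagged with a 'dataset' entry.
results = OrderedDict([("Rank-1", 94.2), ("mAP", 85.7), ("dataset", "Market1501")])

dataset_name = results.pop("dataset")
metrics = ["Dataset"] + [k for k in results]
csv_results = [(dataset_name, *list(results.values()))]

# Tabulate options are illustrative; they render one row per dataset with the
# dataset name as the first column and the metrics as the remaining columns.
table = tabulate(csv_results, headers=metrics, tablefmt="pipe", floatfmt=".2f")
print(table)  # -> a pipe-format table with columns Dataset | Rank-1 | mAP
```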