Merge pull request #425 from JDAI-CV/multi-node

Summary: Add multiple machine training getting started docs. Change multiple dataset evaluation logging mode, which will show the testing result of each dataset immediately. Reviewed by: l1aoxingyu
2025-06-03 14:50:47 +08:00 · 2021-03-09 20:13:29 +08:00 · 2021-03-09 20:13:29 +08:00 · 0cc9fb95a6
commit 0cc9fb95a6
parent 44ad4b83b1 f57c5764e3
6 changed files with 88 additions and 57 deletions
--- a/GETTING_STARTED.md
+++ b/GETTING_STARTED.md
@ -32,6 +32,26 @@ If you want to train model with 4 GPUs, you can run:
 python3 tools/train_net.py --config-file ./configs/Market1501/bagtricks_R50.yml --num-gpus 4
 ```

+If you want to train model with multiple machines, you can run:
+
+```
+# machine 1
+export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=eth0
+
+python3 tools/train_net.py --config-file configs/Market1501/bagtricks_R50.yml \
+--num-gpus 4 --num-machines 2 --machine-rank 0 --dist-url tcp://ip:port 
+
+# machine 2
+export GLOO_SOCKET_IFNAME=eth0
+export NCCL_SOCKET_IFNAME=eth0
+
+python3 tools/train_net.py --config-file configs/Market1501/bagtricks_R50.yml \
+--num-gpus 4 --num-machines 2 --machine-rank 1 --dist-url tcp://ip:port 
+```
+
+Make sure the dataset path and code are the same in different machines, and machines can communicate with each other. 
+
 To evaluate a model's performance, use

 ```bash
--- a/fastreid/engine/defaults.py
+++ b/fastreid/engine/defaults.py
@ -467,15 +467,18 @@ class DefaultTrainer(TrainerBase):
            results_i = inference_on_dataset(model, data_loader, evaluator, flip_test=cfg.TEST.FLIP_ENABLED)
            results[dataset_name] = results_i

-        if comm.is_main_process():
-            assert isinstance(
-                results, dict
-            ), "Evaluator must return a dict on the main process. Got {} instead.".format(
-                results
-            )
-            print_csv_format(results)
+            if comm.is_main_process():
+                assert isinstance(
+                    results, dict
+                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+                    results
+                )
+                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+                results_i['dataset'] = dataset_name
+                print_csv_format(results_i)

-        if len(results) == 1: results = list(results.values())[0]
+        if len(results) == 1:
+            results = list(results.values())[0]

        return results

--- a/fastreid/engine/hooks.py
+++ b/fastreid/engine/hooks.py
@ -360,19 +360,20 @@ class EvalHook(HookBase):
                    )
            self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)

-        # Remove extra memory cache of main process due to evaluation
-        torch.cuda.empty_cache()
-
-    def after_epoch(self):
-        next_epoch = self.trainer.epoch + 1
-        is_final = next_epoch == self.trainer.max_epoch
-        if is_final or (self._period > 0 and next_epoch % self._period == 0):
-            self._do_eval()
        # Evaluation may take different time among workers.
        # A barrier make them start the next iteration together.
        comm.synchronize()

+    def after_epoch(self):
+        next_epoch = self.trainer.epoch + 1
+        if self._period > 0 and next_epoch % self._period == 0:
+            self._do_eval()
+
    def after_train(self):
+        next_epoch = self.trainer.epoch + 1
+        # This condition is to prevent the eval from running after a failed training
+        if next_epoch % self._period != 0 and next_epoch >= self.trainer.max_epoch:
+            self._do_eval()
        # func is likely a closure that holds reference to the trainer
        # therefore we clean it to avoid circular reference in the end
        del self._func
--- a/fastreid/evaluation/evaluator.py
+++ b/fastreid/evaluation/evaluator.py
@ -6,6 +6,7 @@ from contextlib import contextmanager

 import torch

+from fastreid.utils import comm
 from fastreid.utils.logger import log_every_n_seconds


@ -96,6 +97,7 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
    Returns:
        The return value of `evaluator.evaluate()`
    """
+    num_devices = comm.get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader.dataset)))

@ -118,10 +120,11 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
                inputs["images"] = inputs["images"].flip(dims=[3])
                flip_outputs = model(inputs)
                outputs = (outputs + flip_outputs) / 2
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

-            idx += 1
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_batch = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_batch > 30:
@ -140,17 +143,18 @@ def inference_on_dataset(model, data_loader, evaluator, flip_test=False):
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
-        "Total inference time: {} ({:.6f} s / batch per device)".format(
-            total_time_str, total_time / (total - num_warmup)
+        "Total inference time: {} ({:.6f} s / batch per device, on {} devices)".format(
+            total_time_str, total_time / (total - num_warmup), num_devices
        )
    )
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
-        "Total inference pure compute time: {} ({:.6f} s / batch per device)".format(
-            total_compute_time_str, total_compute_time / (total - num_warmup)
+        "Total inference pure compute time: {} ({:.6f} s / batch per device, on {} devices)".format(
+            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
        )
    )
    results = evaluator.evaluate()
+
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
--- a/fastreid/evaluation/reid_evaluation.py
+++ b/fastreid/evaluation/reid_evaluation.py
@ -5,6 +5,7 @@
 """
 import copy
 import logging
+import itertools
 from collections import OrderedDict

 import numpy as np
@ -28,50 +29,54 @@ class ReidEvaluator(DatasetEvaluator):
        self._num_query = num_query
        self._output_dir = output_dir

-        self.features = []
-        self.pids = []
-        self.camids = []
+        self._cpu_device = torch.device('cpu')
+
+        self._predictions = []

    def reset(self):
-        self.features = []
-        self.pids = []
-        self.camids = []
+        self._predictions = []

    def process(self, inputs, outputs):
-        self.pids.extend(inputs["targets"])
-        self.camids.extend(inputs["camids"])
-        self.features.append(outputs.cpu())
+        prediction = {
+            'feats': outputs.to(self._cpu_device, torch.float32),
+            'pids': inputs['targets'].to(self._cpu_device),
+            'camids': inputs['camids'].to(self._cpu_device)
+
+        }
+        self._predictions.append(prediction)

    def evaluate(self):
        if comm.get_world_size() > 1:
            comm.synchronize()
-            features = comm.gather(self.features)
-            features = sum(features, [])
+            predictions = comm.gather(self._predictions, dst=0)
+            predictions = list(itertools.chain(*predictions))

-            pids = comm.gather(self.pids)
-            pids = sum(pids, [])
+            if not comm.is_main_process():
+                return {}

-            camids = comm.gather(self.camids)
-            camids = sum(camids, [])
-
-            # fmt: off
-            if not comm.is_main_process(): return {}
-            # fmt: on
        else:
-            features = self.features
-            pids = self.pids
-            camids = self.camids
+            predictions = self._predictions
+
+        features = []
+        pids = []
+        camids = []
+        for prediction in predictions:
+            features.append(prediction['feats'])
+            pids.append(prediction['pids'])
+            camids.append(prediction['camids'])

        features = torch.cat(features, dim=0)
+        pids = torch.cat(pids, dim=0).numpy()
+        camids = torch.cat(camids, dim=0).numpy()
        # query feature, person ids and camera ids
        query_features = features[:self._num_query]
-        query_pids = np.asarray(pids[:self._num_query])
-        query_camids = np.asarray(camids[:self._num_query])
+        query_pids = pids[:self._num_query]
+        query_camids = camids[:self._num_query]

        # gallery features, person ids and camera ids
        gallery_features = features[self._num_query:]
-        gallery_pids = np.asarray(pids[self._num_query:])
-        gallery_camids = np.asarray(camids[self._num_query:])
+        gallery_pids = pids[self._num_query:]
+        gallery_camids = camids[self._num_query:]

        self._results = OrderedDict()

--- a/fastreid/evaluation/testing.py
+++ b/fastreid/evaluation/testing.py
@ -8,23 +8,21 @@ import numpy as np
 from tabulate import tabulate
 from termcolor import colored

-logger = logging.getLogger(__name__)
-

 def print_csv_format(results):
    """
-    Print main metrics in a format similar to Detectron,
+    Print main metrics in a format similar to Detectron2,
    so that they are easy to copypaste into a spreadsheet.
    Args:
-        results (OrderedDict[dict]): task_name -> {metric -> score}
+        results (OrderedDict): {metric -> score}
    """
-    assert isinstance(results, OrderedDict), results  # unordered results cannot be properly printed
-    task = list(results.keys())[0]
-    metrics = ["Datasets"] + [k for k in results[task]]
+    # unordered results cannot be properly printed
+    assert isinstance(results, OrderedDict) or not len(results), results
+    logger = logging.getLogger(__name__)

-    csv_results = []
-    for task, res in results.items():
-        csv_results.append((task, *list(res.values())))
+    dataset_name = results.pop('dataset')
+    metrics = ["Dataset"] + [k for k in results]
+    csv_results = [(dataset_name, *list(results.values()))]

    # tabulate it
    table = tabulate(