Support multi-node distributed training with MLU backend (#1266)

Author: 黄启元 (2023-07-26 10:32:53 +08:00); committed by GitHub
parent 68360e7ce8
commit 78205c3254
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -101,7 +101,8 @@ def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None:
rank = int(os.environ['RANK'])
if is_mlu_available():
import torch_mlu # noqa: F401
torch.mlu.set_device(rank)
local_rank = int(os.environ['LOCAL_RANK'])
torch.mlu.set_device(local_rank)
torch_dist.init_process_group(
backend='cncl',
rank=rank,