From 78205c3254fdd1a5926d46931eba51712e22bf9c Mon Sep 17 00:00:00 2001
From: 黄启元
Date: Wed, 26 Jul 2023 10:32:53 +0800
Subject: [PATCH] Support multi-node distributed training with MLU backend
 (#1266)

---
 mmengine/dist/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mmengine/dist/utils.py b/mmengine/dist/utils.py
index 06734c01..777bec35 100644
--- a/mmengine/dist/utils.py
+++ b/mmengine/dist/utils.py
@@ -101,7 +101,8 @@ def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None:
     rank = int(os.environ['RANK'])
     if is_mlu_available():
         import torch_mlu  # noqa: F401
-        torch.mlu.set_device(rank)
+        local_rank = int(os.environ['LOCAL_RANK'])
+        torch.mlu.set_device(local_rank)
     torch_dist.init_process_group(
         backend='cncl',
         rank=rank,
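
Why LOCAL_RANK: launchers such as torchrun (or torch.distributed.launch) export
RANK as the global rank across all nodes and LOCAL_RANK as the rank within the
current node. On a multi-node job the global rank can exceed the device count
of any single node, so binding the device by RANK fails on every node but the
first. Below is a minimal standalone sketch of the distinction, assuming a
hypothetical 2-node x 4-MLU job; the environment variable names are the ones
the launcher sets, and the defaults are only there so the snippet runs outside
a launcher.

import os

# torchrun exports both variables to every worker process:
#   RANK       - global rank across all nodes (0 .. world_size - 1)
#   LOCAL_RANK - rank within the current node (0 .. nproc_per_node - 1)
rank = int(os.environ.get('RANK', '0'))
local_rank = int(os.environ.get('LOCAL_RANK', '0'))

# Hypothetical 2-node x 4-MLU job: the worker with RANK=5 runs on node 1
# and has LOCAL_RANK=1. torch.mlu.set_device(5) on node 1 would index a
# device that does not exist there; set_device(local_rank) binds the
# process to the correct device 1, which is what the patch changes.
print(f'global rank {rank} -> local device index {local_rank}')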