mirror of
https://github.com/open-mmlab/mmengine.git
synced 2025-06-03 21:54:44 +08:00
[Fix] Support multi-node distributed training with NPU backend (#1459)
This commit is contained in:
parent
671f3bcdf4
commit
8e6fb12b1f
7
mmengine/dist/utils.py
vendored
7
mmengine/dist/utils.py
vendored
@ -99,9 +99,10 @@ def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None:
|
|||||||
**kwargs: keyword arguments are passed to ``init_process_group``.
|
**kwargs: keyword arguments are passed to ``init_process_group``.
|
||||||
"""
|
"""
|
||||||
rank = int(os.environ['RANK'])
|
rank = int(os.environ['RANK'])
|
||||||
|
# LOCAL_RANK is set by `torch.distributed.launch` since PyTorch 1.1
|
||||||
|
local_rank = int(os.environ['LOCAL_RANK'])
|
||||||
if is_mlu_available():
|
if is_mlu_available():
|
||||||
import torch_mlu # noqa: F401
|
import torch_mlu # noqa: F401
|
||||||
local_rank = int(os.environ['LOCAL_RANK'])
|
|
||||||
torch.mlu.set_device(local_rank)
|
torch.mlu.set_device(local_rank)
|
||||||
torch_dist.init_process_group(
|
torch_dist.init_process_group(
|
||||||
backend='cncl',
|
backend='cncl',
|
||||||
@ -110,15 +111,13 @@ def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None:
|
|||||||
**kwargs)
|
**kwargs)
|
||||||
elif is_npu_available():
|
elif is_npu_available():
|
||||||
import torch_npu # noqa: F401
|
import torch_npu # noqa: F401
|
||||||
torch.npu.set_device(rank)
|
torch.npu.set_device(local_rank)
|
||||||
torch_dist.init_process_group(
|
torch_dist.init_process_group(
|
||||||
backend='hccl',
|
backend='hccl',
|
||||||
rank=rank,
|
rank=rank,
|
||||||
world_size=int(os.environ['WORLD_SIZE']),
|
world_size=int(os.environ['WORLD_SIZE']),
|
||||||
**kwargs)
|
**kwargs)
|
||||||
else:
|
else:
|
||||||
# LOCAL_RANK is set by `torch.distributed.launch` since PyTorch 1.1
|
|
||||||
local_rank = int(os.environ['LOCAL_RANK'])
|
|
||||||
torch.cuda.set_device(local_rank)
|
torch.cuda.set_device(local_rank)
|
||||||
|
|
||||||
if init_backend == 'torch':
|
if init_backend == 'torch':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user