mirror of
https://github.com/open-mmlab/mmengine.git
synced 2025-06-03 21:54:44 +08:00
Support multi-node distributed training with MLU backend (#1266)
This commit is contained in:
parent
68360e7ce8
commit
78205c3254
3
mmengine/dist/utils.py
vendored
3
mmengine/dist/utils.py
vendored
@@ -101,7 +101,8 @@ def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None:
     rank = int(os.environ['RANK'])
     if is_mlu_available():
         import torch_mlu  # noqa: F401
-        torch.mlu.set_device(rank)
+        local_rank = int(os.environ['LOCAL_RANK'])
+        torch.mlu.set_device(local_rank)
         torch_dist.init_process_group(
             backend='cncl',
             rank=rank,
Loading…
x
Reference in New Issue
Block a user