Support multi-node distributed training with MLU backend (#1266)

Author: 黄启元 (2023-07-26 10:32:53 +08:00); committed by GitHub
parent 68360e7ce8
commit 78205c3254
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -101,7 +101,8 @@ def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None:
rank = int(os.environ['RANK'])
if is_mlu_available():
import torch_mlu # noqa: F401
torch.mlu.set_device(rank)
local_rank = int(os.environ['LOCAL_RANK'])
torch.mlu.set_device(local_rank)
torch_dist.init_process_group(
backend='cncl',
rank=rank,