Increase NCCL timeout to 3 hours (#12345)
* Increase NCCL timeout to 3 hours

When training on a large dataset using DDP, the scanning process can take a very long time and raises an NCCL timeout error. Change the default timeout from 30 minutes to 3 hours, the same as Ultralytics YOLOv8 (https://github.com/ultralytics/ultralytics/pull/3343).

Signed-off-by: Troy <wudashuo@vip.qq.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci

---------

Signed-off-by: Troy <wudashuo@vip.qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>

pull/12420/head
parent
3d8f004559
commit
cc232e3e35
5
train.py
5
train.py
|
@ -23,7 +23,7 @@ import subprocess
|
|||
import sys
|
||||
import time
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
|
@ -529,7 +529,8 @@ def main(opt, callbacks=Callbacks()):
|
|||
assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
|
||||
torch.cuda.set_device(LOCAL_RANK)
|
||||
device = torch.device('cuda', LOCAL_RANK)
|
||||
dist.init_process_group(backend='nccl' if dist.is_nccl_available() else 'gloo')
|
||||
dist.init_process_group(backend='nccl' if dist.is_nccl_available() else 'gloo',
|
||||
timeout=timedelta(seconds=10800))
|
||||
|
||||
# Train
|
||||
if not opt.evolve:
|
||||
|
|
Loading…
Reference in New Issue