# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py
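# Note: scans the current working directory recursively for last.pt checkpoints
# and relaunches each unfinished training in the background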
import os
import sys
from pathlib import Path

import torch
import yaml

FILE = Path(__file__).resolve()
ROOT = FILE.parents[2] # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT)) # add ROOT to PATH

port = 0 # --master_port
path = Path('').resolve()
for last in path.rglob('*/**/last.pt'):
    ckpt = torch.load(last)
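    # checkpoints from finished runs have their optimizer state stripped; only resume interrupted ones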
    if ckpt['optimizer'] is None:
        continue

    # Load opt.yaml
    with open(last.parent.parent / 'opt.yaml') as f:
        opt = yaml.safe_load(f)

    # Get device count
    d = opt['device'].split(',') # devices
    nd = len(d) # number of devices
    ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1) # distributed data parallel

    if ddp: # multi-GPU
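        # use a different --master_port for each launch so concurrent DDP jobs don't collide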
        port += 1
        cmd = f'python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
    else: # single-GPU
        cmd = f'python train.py --resume {last}'

    cmd += ' > /dev/null 2>&1 &' # redirect output to /dev/null and run in background
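    # e.g. 'python train.py --resume runs/train/exp/weights/last.pt > /dev/null 2>&1 &'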
    print(cmd)
    os.system(cmd)