# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py
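# Note: scans the current working directory recursively for last.pt checkpoints
# and relaunches each unfinished training in the background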
import os
import sys
from pathlib import Path

import torch
import yaml

FILE = Path(__file__).resolve()
ROOT = FILE.parents[2] # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT)) # add ROOT to PATH

port = 0 # --master_port
path = Path('').resolve()
for last in path.rglob('*/**/last.pt'):
    ckpt = torch.load(last)
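    # checkpoints from finished runs have their optimizer state stripped; only resume interrupted ones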
    if ckpt['optimizer'] is None:
        continue

    # Load opt.yaml
    with open(last.parent.parent / 'opt.yaml') as f:
        opt = yaml.safe_load(f)

    # Get device count
    d = opt['device'].split(',') # devices
    nd = len(d) # number of devices
    ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1) # distributed data parallel

    if ddp: # multi-GPU
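        # use a different --master_port for each launch so concurrent DDP jobs don't collide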
        port += 1
        cmd = f'python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
    else: # single-GPU
        cmd = f'python train.py --resume {last}'

    cmd += ' > /dev/null 2>&1 &' # redirect output to /dev/null and run in background
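    # e.g. 'python train.py --resume runs/train/exp/weights/last.pt > /dev/null 2>&1 &'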
    print(cmd)
    os.system(cmd)