Implementation of Early Stopping for DDP training (#8345)
* Implementation of Early Stopping for DDP training

  This edit uses the broadcast_object_list() function to send the worker (non-zero rank) processes a boolean so that the training phase ends when the variable is True, allowing the master process (rank 0) to destroy the process group and terminate.

* Update train.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  For more information, see https://pre-commit.ci

* Further cleanup

  This cleans up the definition of broadcast_list and removes the need to call clear() afterwards.

Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent f76a78e707
commit 6935a54e60

train.py (24 changed lines)
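For reference, the core pattern the commit relies on is torch.distributed.broadcast_object_list(): rank 0 computes the stop decision, and every rank receives it before deciding whether to break out of the epoch loop. Below is a minimal standalone sketch of that pattern, assuming a torchrun launch and a placeholder stop criterion; it is an illustration, not YOLOv5 code.

```python
# Minimal sketch: rank 0 decides whether to stop and broadcasts the flag
# to all other ranks so every process leaves the training loop together.
# Assumes launch via torchrun (e.g. `torchrun --nproc_per_node=2 sketch.py`);
# the loop body and stop criterion are placeholders, not YOLOv5 logic.
import torch.distributed as dist


def main():
    dist.init_process_group(backend='gloo')  # 'nccl' for multi-GPU training
    rank = dist.get_rank()

    for epoch in range(300):
        # ... training step, and validation on rank 0, would happen here ...
        stop = False
        if rank == 0:
            stop = epoch >= 5  # placeholder for a real early-stopping check

        broadcast_list = [stop if rank == 0 else None]
        dist.broadcast_object_list(broadcast_list, src=0)  # rank 0 -> all ranks
        stop = broadcast_list[0]

        if stop:
            break  # every rank breaks on the same epoch

    dist.destroy_process_group()


if __name__ == '__main__':
    main()
```

Because broadcast_object_list() is a collective call, every rank (including rank 0) must enter it each epoch; this is why the new code below guards it with `if RANK != -1` rather than `if RANK == 0`.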
@@ -294,7 +294,7 @@ def train(hyp, opt, device, callbacks):  # hyp is path/to/hyp.yaml or hyp dictionary
     results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
     scheduler.last_epoch = start_epoch - 1  # do not move
     scaler = torch.cuda.amp.GradScaler(enabled=amp)
-    stopper = EarlyStopping(patience=opt.patience)
+    stopper, stop = EarlyStopping(patience=opt.patience), False
     compute_loss = ComputeLoss(model)  # init loss class
     callbacks.run('on_train_start')
     LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
@@ -402,6 +402,7 @@ def train(hyp, opt, device, callbacks):  # hyp is path/to/hyp.yaml or hyp dictionary
 
             # Update best mAP
             fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
+            stop = stopper(epoch=epoch, fitness=fi)  # early stop check
             if fi > best_fitness:
                 best_fitness = fi
             log_vals = list(mloss) + list(results) + lr
@@ -428,19 +429,14 @@ def train(hyp, opt, device, callbacks):  # hyp is path/to/hyp.yaml or hyp dictionary
                 del ckpt
                 callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi)
 
-            # Stop Single-GPU
-            if RANK == -1 and stopper(epoch=epoch, fitness=fi):
-                break
-
-            # Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576
-            # stop = stopper(epoch=epoch, fitness=fi)
-            # if RANK == 0:
-            #    dist.broadcast_object_list([stop], 0)  # broadcast 'stop' to all ranks
-
-            # Stop DPP
-            # with torch_distributed_zero_first(RANK):
-            # if stop:
-            #    break  # must break all DDP ranks
+        # EarlyStopping
+        if RANK != -1:  # if DDP training
+            broadcast_list = [stop if RANK == 0 else None]
+            dist.broadcast_object_list(broadcast_list, 0)  # broadcast 'stop' to all ranks
+            if RANK != 0:
+                stop = broadcast_list[0]
+        if stop:
+            break  # must break all DDP ranks
 
         # end epoch ----------------------------------------------------------------------------------------------------
     # end training -----------------------------------------------------------------------------------------------------
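The diff constructs the stopper as EarlyStopping(patience=opt.patience) and calls stopper(epoch=epoch, fitness=fi) once per epoch where fitness is computed (single-GPU or rank 0, since only those ranks run validation); the other ranks only receive the resulting boolean via the broadcast. A rough sketch of a patience-based stopper with that interface is shown below; it is an assumption about the behavior, not the actual class from YOLOv5's utils.

```python
# Sketch of a patience-based early stopper matching the interface used above:
#   stopper = EarlyStopping(patience=...); stop = stopper(epoch=epoch, fitness=fi)
# This is an illustrative re-implementation, not the YOLOv5 source.
class EarlyStopping:
    def __init__(self, patience=30):
        self.best_fitness = 0.0  # best weighted mAP combination seen so far
        self.best_epoch = 0
        self.patience = patience or float('inf')  # patience=0 disables early stopping

    def __call__(self, epoch, fitness):
        if fitness >= self.best_fitness:  # improvement (or tie) resets the counter
            self.best_epoch = epoch
            self.best_fitness = fitness
        stop = (epoch - self.best_epoch) >= self.patience  # no improvement for `patience` epochs
        return stop
```

Since rank 0 is the only rank with a meaningful `stop` value, broadcasting it each epoch keeps all DDP ranks breaking on the same iteration, which lets the process group be destroyed cleanly at the end of training.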