Auto-format by https://ultralytics.com/actions

parent f20e9969d4, commit 4736f445d7
@@ -224,7 +224,6 @@ def train(opt, device):
         for i, (images, labels) in pbar:  # progress bar
             images, labels = images.to(device, non_blocking=True), labels.to(device)
 
-
             amp_autocast = None
             if check_version(torch.__version__, "2.4.0"):
                 amp_autocast = torch.amp.autocast("cuda", enabled=device.type != "cpu")
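Note: this hunk, and the matching ones in train.py and utils/autobatch.py below, gate the unified `torch.amp.autocast` API behind a torch >= 2.4 check and fall back to the deprecated `torch.cuda.amp.autocast` otherwise. A minimal sketch of that selection as a standalone helper; the helper name and the `nullcontext` CPU fallback are illustrative additions, not part of this commit:

```python
import contextlib

import torch

from utils.general import check_version  # YOLOv5 helper used throughout this diff


def get_autocast(device_type="cuda", enabled=True):
    """Return an autocast context manager compatible with both old and new torch APIs."""
    if check_version(torch.__version__, "2.4.0"):
        return torch.amp.autocast(device_type, enabled=enabled)  # unified API, torch >= 2.4
    if device_type == "cuda":
        return torch.cuda.amp.autocast(enabled)  # deprecated pre-2.4 CUDA-only spelling
    return contextlib.nullcontext()  # assumed CPU fallback when autocast is unavailable


# usage mirroring the loop above:
# with get_autocast("cuda", enabled=device.type != "cpu"):
#     pred = model(images)
```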
train.py
@@ -95,9 +95,7 @@ from utils.torch_utils import (
     torch_distributed_zero_first,
 )
 
-LOCAL_RANK = int(
-    os.getenv("LOCAL_RANK", -1)
-)  # https://pytorch.org/docs/stable/elastic/run.html
+LOCAL_RANK = int(os.getenv("LOCAL_RANK", -1))  # https://pytorch.org/docs/stable/elastic/run.html
 RANK = int(os.getenv("RANK", -1))
 WORLD_SIZE = int(os.getenv("WORLD_SIZE", 1))
 GIT_INFO = check_git_info()
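Note: `LOCAL_RANK`, `RANK` and `WORLD_SIZE` are environment variables injected by the `torchrun` elastic launcher (see the linked docs); the `-1` defaults mark a plain single-process run. A short sketch of how these globals are typically used downstream:

```python
import os

LOCAL_RANK = int(os.getenv("LOCAL_RANK", -1))  # GPU index of this process on its node; -1 = not DDP
RANK = int(os.getenv("RANK", -1))  # global process index across all nodes; -1 = not DDP
WORLD_SIZE = int(os.getenv("WORLD_SIZE", 1))  # total number of processes

if RANK in {-1, 0}:
    print("main process: logging, checkpointing and validation happen here")
if RANK != -1:
    print(f"DDP worker {RANK}/{WORLD_SIZE} pinned to local GPU {LOCAL_RANK}")
```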
@@ -177,9 +175,7 @@ def train(hyp, opt, device, callbacks):
     if isinstance(hyp, str):
         with open(hyp, errors="ignore") as f:
             hyp = yaml.safe_load(f)  # load hyps dict
-    LOGGER.info(
-        colorstr("hyperparameters: ") + ", ".join(f"{k}={v}" for k, v in hyp.items())
-    )
+    LOGGER.info(colorstr("hyperparameters: ") + ", ".join(f"{k}={v}" for k, v in hyp.items()))
     opt.hyp = hyp.copy()  # for saving hyps to checkpoints
 
     # Save run settings
@@ -227,14 +223,8 @@ def train(hyp, opt, device, callbacks):
         data_dict = data_dict or check_dataset(data)  # check if None
     train_path, val_path = data_dict["train"], data_dict["val"]
     nc = 1 if single_cls else int(data_dict["nc"])  # number of classes
-    names = (
-        {0: "item"}
-        if single_cls and len(data_dict["names"]) != 1
-        else data_dict["names"]
-    )  # class names
-    is_coco = isinstance(val_path, str) and val_path.endswith(
-        "coco/val2017.txt"
-    )  # COCO dataset
+    names = {0: "item"} if single_cls and len(data_dict["names"]) != 1 else data_dict["names"]  # class names
+    is_coco = isinstance(val_path, str) and val_path.endswith("coco/val2017.txt")  # COCO dataset
 
     # Model
     check_suffix(weights, ".pt")  # check weights
@@ -242,31 +232,19 @@ def train(hyp, opt, device, callbacks):
     if pretrained:
         with torch_distributed_zero_first(LOCAL_RANK):
             weights = attempt_download(weights)  # download if not found locally
-        ckpt = torch.load(
-            weights, map_location="cpu"
-        )  # load checkpoint to CPU to avoid CUDA memory leak
-        model = Model(
-            cfg or ckpt["model"].yaml, ch=3, nc=nc, anchors=hyp.get("anchors")
-        ).to(
-            device
-        )  # create
-        exclude = (
-            ["anchor"] if (cfg or hyp.get("anchors")) and not resume else []
-        )  # exclude keys
+        ckpt = torch.load(weights, map_location="cpu")  # load checkpoint to CPU to avoid CUDA memory leak
+        model = Model(cfg or ckpt["model"].yaml, ch=3, nc=nc, anchors=hyp.get("anchors")).to(device)  # create
+        exclude = ["anchor"] if (cfg or hyp.get("anchors")) and not resume else []  # exclude keys
         csd = ckpt["model"].float().state_dict()  # checkpoint state_dict as FP32
         csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # intersect
         model.load_state_dict(csd, strict=False)  # load
-        LOGGER.info(
-            f"Transferred {len(csd)}/{len(model.state_dict())} items from {weights}"
-        )  # report
+        LOGGER.info(f"Transferred {len(csd)}/{len(model.state_dict())} items from {weights}")  # report
     else:
         model = Model(cfg, ch=3, nc=nc, anchors=hyp.get("anchors")).to(device)  # create
     amp = check_amp(model)  # check AMP
 
     # Freeze
-    freeze = [
-        f"model.{x}." for x in (freeze if len(freeze) > 1 else range(freeze[0]))
-    ]  # layers to freeze
+    freeze = [f"model.{x}." for x in (freeze if len(freeze) > 1 else range(freeze[0]))]  # layers to freeze
     for k, v in model.named_parameters():
         v.requires_grad = True  # train all layers
         # v.register_hook(lambda x: torch.nan_to_num(x))  # NaN to 0 (commented for erratic training results)
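Note: the transfer-learning load above works by intersecting the checkpoint state_dict with the freshly built model: only tensors whose key and shape match (and that are not excluded, e.g. anchors when a new `--cfg` or `anchors` hyp is given) are kept, and `strict=False` tolerates the remainder. YOLOv5 ships `intersect_dicts` in `utils.torch_utils`; a sketch of the matching logic for reference:

```python
def intersect_dicts(da, db, exclude=()):
    """Keep entries of da whose key exists in db with the same tensor shape, skipping excluded substrings."""
    return {
        k: v
        for k, v in da.items()
        if k in db and v.shape == db[k].shape and not any(x in k for x in exclude)
    }


# csd = intersect_dicts(ckpt_state, model.state_dict(), exclude=["anchor"])
# model.load_state_dict(csd, strict=False)
```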
@@ -287,9 +265,7 @@ def train(hyp, opt, device, callbacks):
     nbs = 64  # nominal batch size
     accumulate = max(round(nbs / batch_size), 1)  # accumulate loss before optimizing
     hyp["weight_decay"] *= batch_size * accumulate / nbs  # scale weight_decay
-    optimizer = smart_optimizer(
-        model, opt.optimizer, hyp["lr0"], hyp["momentum"], hyp["weight_decay"]
-    )
+    optimizer = smart_optimizer(model, opt.optimizer, hyp["lr0"], hyp["momentum"], hyp["weight_decay"])
 
     # Scheduler
     if opt.cos_lr:
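Note: because the optimizer only steps once every `accumulate` mini-batches, the effective batch size is `batch_size * accumulate`, and weight decay is rescaled so the decay per optimizer step stays consistent with the nominal batch of 64. A worked example with illustrative numbers:

```python
nbs = 64  # nominal batch size
batch_size = 16
accumulate = max(round(nbs / batch_size), 1)  # -> 4: step the optimizer every 4 batches
weight_decay = 0.0005 * batch_size * accumulate / nbs  # -> 0.0005, unchanged (16 * 4 == 64)

batch_size = 128  # larger batch: accumulate -> 1, effective batch 128
accumulate = max(round(nbs / batch_size), 1)
weight_decay = 0.0005 * batch_size * accumulate / nbs  # -> 0.001, doubled to match
```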
@@ -300,9 +276,7 @@ def train(hyp, opt, device, callbacks):
         """Linear learning rate scheduler function with decay calculated by epoch proportion."""
         return (1 - x / epochs) * (1.0 - hyp["lrf"]) + hyp["lrf"]  # linear
 
-    scheduler = lr_scheduler.LambdaLR(
-        optimizer, lr_lambda=lf
-    )  # plot_lr_scheduler(optimizer, scheduler, epochs)
+    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)
 
     # EMA
     ema = ModelEMA(model) if RANK in {-1, 0} else None
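Note: `lf` maps the epoch index to a learning-rate multiplier that decays linearly from 1.0 to `hyp["lrf"]`, and `LambdaLR` applies it on top of the base `lr0`. A self-contained sketch with hypothetical hyperparameter values:

```python
import torch
from torch.optim import SGD, lr_scheduler

epochs, lr0, lrf = 100, 0.01, 0.01  # illustrative stand-ins for epochs, hyp["lr0"], hyp["lrf"]
optimizer = SGD([torch.zeros(1, requires_grad=True)], lr=lr0)


def lf(x):
    """Linear multiplier: 1.0 at epoch 0 down to lrf at the final epoch."""
    return (1 - x / epochs) * (1.0 - lrf) + lrf


scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
for epoch in range(3):
    optimizer.step()
    scheduler.step()
    print(epoch, optimizer.param_groups[0]["lr"])  # 0.009901, 0.009802, ... -> 0.0001 at epoch 100
```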
@@ -311,9 +285,7 @@ def train(hyp, opt, device, callbacks):
     best_fitness, start_epoch = 0.0, 0
     if pretrained:
         if resume:
-            best_fitness, start_epoch, epochs = smart_resume(
-                ckpt, optimizer, ema, weights, epochs, resume
-            )
+            best_fitness, start_epoch, epochs = smart_resume(ckpt, optimizer, ema, weights, epochs, resume)
         del ckpt, csd
 
     # DP mode
@@ -350,9 +322,7 @@ def train(hyp, opt, device, callbacks):
     )
     labels = np.concatenate(dataset.labels, 0)
     mlc = int(labels[:, 0].max())  # max label class
-    assert (
-        mlc < nc
-    ), f"Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}"
+    assert mlc < nc, f"Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}"
 
     # Process 0
     if RANK in {-1, 0}:
@@ -373,9 +343,7 @@ def train(hyp, opt, device, callbacks):
 
         if not resume:
             if not opt.noautoanchor:
-                check_anchors(
-                    dataset, model=model, thr=hyp["anchor_t"], imgsz=imgsz
-                )  # run AutoAnchor
+                check_anchors(dataset, model=model, thr=hyp["anchor_t"], imgsz=imgsz)  # run AutoAnchor
             model.half().float()  # pre-reduce anchor precision
 
         callbacks.run("on_pretrain_routine_end", labels, names)
@@ -392,17 +360,13 @@ def train(hyp, opt, device, callbacks):
     hyp["label_smoothing"] = opt.label_smoothing
     model.nc = nc  # attach number of classes to model
     model.hyp = hyp  # attach hyperparameters to model
-    model.class_weights = (
-        labels_to_class_weights(dataset.labels, nc).to(device) * nc
-    )  # attach class weights
+    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
     model.names = names
 
     # Start training
     t0 = time.time()
     nb = len(train_loader)  # number of batches
-    nw = max(
-        round(hyp["warmup_epochs"] * nb), 100
-    )  # number of warmup iterations, max(3 epochs, 100 iterations)
+    nw = max(round(hyp["warmup_epochs"] * nb), 100)  # number of warmup iterations, max(3 epochs, 100 iterations)
     # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
     last_opt_step = -1
     maps = np.zeros(nc)  # mAP per class
@@ -424,23 +388,15 @@ def train(hyp, opt, device, callbacks):
         f"Logging results to {colorstr('bold', save_dir)}\n"
         f"Starting training for {epochs} epochs..."
     )
-    for epoch in range(
-        start_epoch, epochs
-    ):  # epoch ------------------------------------------------------------------
+    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
         callbacks.run("on_train_epoch_start")
         model.train()
 
         # Update image weights (optional, single-GPU only)
         if opt.image_weights:
-            cw = (
-                model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc
-            )  # class weights
-            iw = labels_to_image_weights(
-                dataset.labels, nc=nc, class_weights=cw
-            )  # image weights
-            dataset.indices = random.choices(
-                range(dataset.n), weights=iw, k=dataset.n
-            )  # rand weighted idx
+            cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
+            iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
+            dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx
 
         # Update mosaic border (optional)
         # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
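Note: `--image-weights` biases next-epoch sampling toward images containing classes that are rare or currently score a low mAP: class weights scale with (1 - mAP)^2 over class frequency, each image's weight sums the weights of its classes, and `random.choices` resamples the dataset indices. A numpy sketch of that weighting; the data is made up and the exact normalization inside `labels_to_class_weights` / `labels_to_image_weights` differs slightly:

```python
import random

import numpy as np

nc = 3
maps = np.array([0.9, 0.5, 0.2])  # per-class mAP from the last validation (fabricated)
counts = np.array([100, 30, 5])  # class label frequency in the dataset (fabricated)

cw = (1 / (counts + 1e-16)) * (1 - maps) ** 2 / nc  # rare, poorly-learned classes weigh more
images = [[0], [0, 1], [2], [1, 2]]  # classes present in each of 4 images
iw = np.array([cw[c].sum() for c in images])  # one sampling weight per image

indices = random.choices(range(len(images)), weights=iw, k=len(images))  # rand weighted idx
print(iw.round(5), indices)
```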
@@ -470,14 +426,10 @@ def train(hyp, opt, device, callbacks):
             targets,
             paths,
             _,
-        ) in (
-            pbar
-        ):  # batch -------------------------------------------------------------
+        ) in pbar:  # batch -------------------------------------------------------------
             callbacks.run("on_train_batch_start")
             ni = i + nb * epoch  # number integrated batches (since train start)
-            imgs = (
-                imgs.to(device, non_blocking=True).float() / 255
-            )  # uint8 to float32, 0-255 to 0.0-1.0
+            imgs = imgs.to(device, non_blocking=True).float() / 255  # uint8 to float32, 0-255 to 0.0-1.0
 
             # Warmup
             if ni <= nw:
@@ -495,23 +447,15 @@ def train(hyp, opt, device, callbacks):
                     ],
                 )
                 if "momentum" in x:
-                    x["momentum"] = np.interp(
-                        ni, xi, [hyp["warmup_momentum"], hyp["momentum"]]
-                    )
+                    x["momentum"] = np.interp(ni, xi, [hyp["warmup_momentum"], hyp["momentum"]])
 
             # Multi-scale
             if opt.multi_scale:
-                sz = (
-                    random.randrange(int(imgsz * 0.5), int(imgsz * 1.5) + gs) // gs * gs
-                )  # size
+                sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5) + gs) // gs * gs  # size
                 sf = sz / max(imgs.shape[2:])  # scale factor
                 if sf != 1:
-                    ns = [
-                        math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
-                    ]  # new shape (stretched to gs-multiple)
-                    imgs = nn.functional.interpolate(
-                        imgs, size=ns, mode="bilinear", align_corners=False
-                    )
+                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
+                    imgs = nn.functional.interpolate(imgs, size=ns, mode="bilinear", align_corners=False)
 
             amp_autocast = None
             if check_version(torch.__version__, "2.4.0"):
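Note: warmup linearly interpolates lr and momentum over the first `nw` iterations via `np.interp`, and multi-scale picks a random size in [0.5, 1.5] x imgsz snapped to a multiple of the grid stride `gs`, then resizes the batch in flight. A self-contained sketch of the multi-scale resize, lifted from the hunk above with illustrative values:

```python
import math
import random

import torch
import torch.nn as nn

imgs = torch.rand(2, 3, 640, 640)  # a batch already normalized to [0, 1]
imgsz, gs = 640, 32  # train size and max grid stride

sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5) + gs) // gs * gs  # e.g. 416, 448, ..., 960
sf = sz / max(imgs.shape[2:])  # scale factor
if sf != 1:
    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # shape rounded up to gs multiples
    imgs = nn.functional.interpolate(imgs, size=ns, mode="bilinear", align_corners=False)
print(imgs.shape)  # torch.Size([2, 3, sz, sz])
```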
@@ -522,9 +466,7 @@ def train(hyp, opt, device, callbacks):
             # Forward
             with amp_autocast:
                 pred = model(imgs)  # forward
-                loss, loss_items = compute_loss(
-                    pred, targets.to(device)
-                )  # loss scaled by batch_size
+                loss, loss_items = compute_loss(pred, targets.to(device))  # loss scaled by batch_size
                 if RANK != -1:
                     loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
                 if opt.quad:
@@ -536,9 +478,7 @@ def train(hyp, opt, device, callbacks):
             # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
             if ni - last_opt_step >= accumulate:
                 scaler.unscale_(optimizer)  # unscale gradients
-                torch.nn.utils.clip_grad_norm_(
-                    model.parameters(), max_norm=10.0
-                )  # clip gradients
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)  # clip gradients
                 scaler.step(optimizer)  # optimizer.step
                 scaler.update()
                 optimizer.zero_grad()
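Note: this block follows the canonical PyTorch AMP recipe linked in the comment: unscale before clipping so the norm is computed on true gradient magnitudes, step through the scaler (which skips the step on overflow), update the scale, then zero. A condensed, self-contained sketch using the pre-2.4 `torch.cuda.amp.GradScaler` spelling, since the rest of this code still supports older torch:

```python
import torch

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

x, y = torch.randn(4, 10), torch.randn(4, 1)
loss = torch.nn.functional.mse_loss(model(x), y)

scaler.scale(loss).backward()  # backward on the scaled loss
scaler.unscale_(optimizer)  # unscale so clipping sees real gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)  # clip gradients
scaler.step(optimizer)  # skips the step if gradients overflowed
scaler.update()  # adjust the scale factor for the next iteration
optimizer.zero_grad()
```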
@@ -560,9 +500,7 @@ def train(hyp, opt, device, callbacks):
                         imgs.shape[-1],
                     )
                 )
-                callbacks.run(
-                    "on_train_batch_end", model, ni, imgs, targets, paths, list(mloss)
-                )
+                callbacks.run("on_train_batch_end", model, ni, imgs, targets, paths, list(mloss))
                 if callbacks.stop_training:
                     return
             # end batch ------------------------------------------------------------------------------------------------
@@ -574,9 +512,7 @@ def train(hyp, opt, device, callbacks):
         if RANK in {-1, 0}:
             # mAP
             callbacks.run("on_train_epoch_end", epoch=epoch)
-            ema.update_attr(
-                model, include=["yaml", "nc", "hyp", "names", "stride", "class_weights"]
-            )
+            ema.update_attr(model, include=["yaml", "nc", "hyp", "names", "stride", "class_weights"])
             final_epoch = (epoch + 1 == epochs) or stopper.possible_stop
             if not noval or final_epoch:  # Calculate mAP
                 results, maps, _ = validate.run(
@@ -594,9 +530,7 @@ def train(hyp, opt, device, callbacks):
                 )
 
             # Update best mAP
-            fi = fitness(
-                np.array(results).reshape(1, -1)
-            )  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
+            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
             stop = stopper(epoch=epoch, fitness=fi)  # early stop check
             if fi > best_fitness:
                 best_fitness = fi
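Note: `fitness` collapses the validation tuple into one scalar used for checkpoint selection and early stopping; to my recollection YOLOv5 weights mAP@0.5 at 10% and mAP@0.5:0.95 at 90% with zero weight on P and R, but treat the weights below as an assumption rather than part of this diff:

```python
import numpy as np


def fitness(x):
    """Weighted combination of [P, R, mAP@.5, mAP@.5-.95] -> scalar model fitness."""
    w = [0.0, 0.0, 0.1, 0.9]  # assumed weights; P and R do not contribute directly
    return (x[:, :4] * w).sum(1)


results = (0.7, 0.6, 0.55, 0.35)  # illustrative (P, R, mAP@.5, mAP@.5-.95)
fi = fitness(np.array(results).reshape(1, -1))
print(fi)  # [0.37] = 0.55 * 0.1 + 0.35 * 0.9
```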
@@ -624,16 +558,12 @@ def train(hyp, opt, device, callbacks):
                 if opt.save_period > 0 and epoch % opt.save_period == 0:
                     torch.save(ckpt, w / f"epoch{epoch}.pt")
                 del ckpt
-                callbacks.run(
-                    "on_model_save", last, epoch, final_epoch, best_fitness, fi
-                )
+                callbacks.run("on_model_save", last, epoch, final_epoch, best_fitness, fi)
 
         # EarlyStopping
         if RANK != -1:  # if DDP training
             broadcast_list = [stop if RANK == 0 else None]
-            dist.broadcast_object_list(
-                broadcast_list, 0
-            )  # broadcast 'stop' to all ranks
+            dist.broadcast_object_list(broadcast_list, 0)  # broadcast 'stop' to all ranks
             if RANK != 0:
                 stop = broadcast_list[0]
             if stop:
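Note: only rank 0 runs validation, so its early-stop decision must be broadcast to the other ranks or they would deadlock at the next collective op. `dist.broadcast_object_list` pickles arbitrary Python objects from the source rank. A runnable single-process sketch of the handshake (with `torchrun` this would span `WORLD_SIZE` workers, and the backend would typically be nccl):

```python
import os

import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)  # demo-sized process group

RANK = dist.get_rank()
stop = True if RANK == 0 else None  # rank 0's early-stop verdict; others start with None

broadcast_list = [stop if RANK == 0 else None]
dist.broadcast_object_list(broadcast_list, src=0)  # every rank receives rank 0's value
if RANK != 0:
    stop = broadcast_list[0]
print(f"rank {RANK}: stop={stop}")  # all ranks now agree and can break together
dist.destroy_process_group()
```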
@@ -642,9 +572,7 @@ def train(hyp, opt, device, callbacks):
         # end epoch ----------------------------------------------------------------------------------------------------
     # end training -----------------------------------------------------------------------------------------------------
     if RANK in {-1, 0}:
-        LOGGER.info(
-            f"\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours."
-        )
+        LOGGER.info(f"\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.")
         for f in last, best:
             if f.exists():
                 strip_optimizer(f)  # strip optimizers
@@ -655,9 +583,7 @@ def train(hyp, opt, device, callbacks):
                     batch_size=batch_size // WORLD_SIZE * 2,
                     imgsz=imgsz,
                     model=attempt_load(f, device).half(),
-                    iou_thres=(
-                        0.65 if is_coco else 0.60
-                    ),  # best pycocotools at iou 0.65
+                    iou_thres=(0.65 if is_coco else 0.60),  # best pycocotools at iou 0.65
                     single_cls=single_cls,
                     dataloader=val_loader,
                     save_dir=save_dir,
@@ -705,13 +631,9 @@ def parse_opt(known=False):
     - Tutorial: https://docs.ultralytics.com/yolov5/tutorials/train_custom_data
     """
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--weights", type=str, default=ROOT / "yolov5s.pt", help="initial weights path"
-    )
+    parser.add_argument("--weights", type=str, default=ROOT / "yolov5s.pt", help="initial weights path")
     parser.add_argument("--cfg", type=str, default="", help="model.yaml path")
-    parser.add_argument(
-        "--data", type=str, default=ROOT / "data/coco128.yaml", help="dataset.yaml path"
-    )
+    parser.add_argument("--data", type=str, default=ROOT / "data/coco128.yaml", help="dataset.yaml path")
     parser.add_argument(
         "--hyp",
         type=str,
@@ -741,15 +663,9 @@ def parse_opt(known=False):
         default=False,
         help="resume most recent training",
     )
-    parser.add_argument(
-        "--nosave", action="store_true", help="only save final checkpoint"
-    )
-    parser.add_argument(
-        "--noval", action="store_true", help="only validate final epoch"
-    )
-    parser.add_argument(
-        "--noautoanchor", action="store_true", help="disable AutoAnchor"
-    )
+    parser.add_argument("--nosave", action="store_true", help="only save final checkpoint")
+    parser.add_argument("--noval", action="store_true", help="only validate final epoch")
+    parser.add_argument("--noautoanchor", action="store_true", help="disable AutoAnchor")
     parser.add_argument("--noplots", action="store_true", help="save no plot files")
     parser.add_argument(
         "--evolve",
@@ -771,20 +687,14 @@ def parse_opt(known=False):
         help="resume evolve from last generation",
     )
     parser.add_argument("--bucket", type=str, default="", help="gsutil bucket")
-    parser.add_argument(
-        "--cache", type=str, nargs="?", const="ram", help="image --cache ram/disk"
-    )
+    parser.add_argument("--cache", type=str, nargs="?", const="ram", help="image --cache ram/disk")
     parser.add_argument(
         "--image-weights",
         action="store_true",
         help="use weighted image selection for training",
     )
-    parser.add_argument(
-        "--device", default="", help="cuda device, i.e. 0 or 0,1,2,3 or cpu"
-    )
-    parser.add_argument(
-        "--multi-scale", action="store_true", help="vary img-size +/- 50%%"
-    )
+    parser.add_argument("--device", default="", help="cuda device, i.e. 0 or 0,1,2,3 or cpu")
+    parser.add_argument("--multi-scale", action="store_true", help="vary img-size +/- 50%%")
     parser.add_argument(
         "--single-cls",
         action="store_true",
@@ -808,9 +718,7 @@ def parse_opt(known=False):
         default=8,
         help="max dataloader workers (per RANK in DDP mode)",
     )
-    parser.add_argument(
-        "--project", default=ROOT / "runs/train", help="save to project/name"
-    )
+    parser.add_argument("--project", default=ROOT / "runs/train", help="save to project/name")
     parser.add_argument("--name", default="exp", help="save to project/name")
     parser.add_argument(
         "--exist-ok",
@@ -819,9 +727,7 @@ def parse_opt(known=False):
     )
     parser.add_argument("--quad", action="store_true", help="quad dataloader")
     parser.add_argument("--cos-lr", action="store_true", help="cosine LR scheduler")
-    parser.add_argument(
-        "--label-smoothing", type=float, default=0.0, help="Label smoothing epsilon"
-    )
+    parser.add_argument("--label-smoothing", type=float, default=0.0, help="Label smoothing epsilon")
    parser.add_argument(
         "--patience",
         type=int,
@@ -872,9 +778,7 @@ def parse_opt(known=False):
     )
 
     # NDJSON logging
-    parser.add_argument(
-        "--ndjson-console", action="store_true", help="Log ndjson to console"
-    )
+    parser.add_argument("--ndjson-console", action="store_true", help="Log ndjson to console")
     parser.add_argument("--ndjson-file", action="store_true", help="Log ndjson to file")
 
     return parser.parse_known_args()[0] if known else parser.parse_args()
@@ -903,9 +807,7 @@ def main(opt, callbacks=Callbacks()):
 
     # Resume (from specified or most recent last.pt)
     if opt.resume and not check_comet_resume(opt) and not opt.evolve:
-        last = Path(
-            check_file(opt.resume) if isinstance(opt.resume, str) else get_latest_run()
-        )
+        last = Path(check_file(opt.resume) if isinstance(opt.resume, str) else get_latest_run())
         opt_yaml = last.parent.parent / "opt.yaml"  # train options yaml
         opt_data = opt.data  # original dataset
         if opt_yaml.is_file():
@@ -925,13 +827,9 @@ def main(opt, callbacks=Callbacks()):
             str(opt.weights),
             str(opt.project),
         )  # checks
-        assert len(opt.cfg) or len(
-            opt.weights
-        ), "either --cfg or --weights must be specified"
+        assert len(opt.cfg) or len(opt.weights), "either --cfg or --weights must be specified"
         if opt.evolve:
-            if opt.project == str(
-                ROOT / "runs/train"
-            ):  # if default project name, rename to runs/evolve
+            if opt.project == str(ROOT / "runs/train"):  # if default project name, rename to runs/evolve
                 opt.project = str(ROOT / "runs/evolve")
             opt.exist_ok, opt.resume = (
                 opt.resume,
@@ -939,9 +837,7 @@ def main(opt, callbacks=Callbacks()):
         )  # pass resume to exist_ok and disable resume
         if opt.name == "cfg":
             opt.name = Path(opt.cfg).stem  # use model.yaml as name
-        opt.save_dir = str(
-            increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok)
-        )
+        opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))
 
     # DDP mode
     device = select_device(opt.device, batch_size=opt.batch_size)
@@ -949,15 +845,9 @@ def main(opt, callbacks=Callbacks()):
         msg = "is not compatible with YOLOv5 Multi-GPU DDP training"
         assert not opt.image_weights, f"--image-weights {msg}"
         assert not opt.evolve, f"--evolve {msg}"
-        assert (
-            opt.batch_size != -1
-        ), f"AutoBatch with --batch-size -1 {msg}, please pass a valid --batch-size"
-        assert (
-            opt.batch_size % WORLD_SIZE == 0
-        ), f"--batch-size {opt.batch_size} must be multiple of WORLD_SIZE"
-        assert (
-            torch.cuda.device_count() > LOCAL_RANK
-        ), "insufficient CUDA devices for DDP command"
+        assert opt.batch_size != -1, f"AutoBatch with --batch-size -1 {msg}, please pass a valid --batch-size"
+        assert opt.batch_size % WORLD_SIZE == 0, f"--batch-size {opt.batch_size} must be multiple of WORLD_SIZE"
+        assert torch.cuda.device_count() > LOCAL_RANK, "insufficient CUDA devices for DDP command"
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device("cuda", LOCAL_RANK)
         dist.init_process_group(
@@ -1059,18 +949,14 @@ def main(opt, callbacks=Callbacks()):
         upper_limit = np.array([meta[k][2] for k in hyp_GA.keys()])
 
         # Create gene_ranges list to hold the range of values for each gene in the population
-        gene_ranges = [
-            (lower_limit[i], upper_limit[i]) for i in range(len(upper_limit))
-        ]
+        gene_ranges = [(lower_limit[i], upper_limit[i]) for i in range(len(upper_limit))]
 
         # Initialize the population with initial_values or random values
         initial_values = []
 
         # If resuming evolution from a previous checkpoint
         if opt.resume_evolve is not None:
-            assert os.path.isfile(
-                ROOT / opt.resume_evolve
-            ), "evolve population path is wrong!"
+            assert os.path.isfile(ROOT / opt.resume_evolve), "evolve population path is wrong!"
             with open(ROOT / opt.resume_evolve, errors="ignore") as f:
                 evolve_population = yaml.safe_load(f)
             for value in evolve_population.values():
@@ -1079,9 +965,7 @@ def main(opt, callbacks=Callbacks()):
 
         # If not resuming from a previous checkpoint, generate initial values from .yaml files in opt.evolve_population
        else:
-            yaml_files = [
-                f for f in os.listdir(opt.evolve_population) if f.endswith(".yaml")
-            ]
+            yaml_files = [f for f in os.listdir(opt.evolve_population) if f.endswith(".yaml")]
             for file_name in yaml_files:
                 with open(os.path.join(opt.evolve_population, file_name)) as yaml_file:
                     value = yaml.safe_load(yaml_file)
@@ -1090,14 +974,9 @@ def main(opt, callbacks=Callbacks()):
 
         # Generate random values within the search space for the rest of the population
         if initial_values is None:
-            population = [
-                generate_individual(gene_ranges, len(hyp_GA)) for _ in range(pop_size)
-            ]
+            population = [generate_individual(gene_ranges, len(hyp_GA)) for _ in range(pop_size)]
         elif pop_size > 1:
-            population = [
-                generate_individual(gene_ranges, len(hyp_GA))
-                for _ in range(pop_size - len(initial_values))
-            ]
+            population = [generate_individual(gene_ranges, len(hyp_GA)) for _ in range(pop_size - len(initial_values))]
             for initial_value in initial_values:
                 population = [initial_value] + population
 
@@ -1107,19 +986,14 @@ def main(opt, callbacks=Callbacks()):
         if generation >= 1:
             save_dict = {}
             for i in range(len(population)):
-                little_dict = {
-                    list_keys[j]: float(population[i][j])
-                    for j in range(len(population[i]))
-                }
+                little_dict = {list_keys[j]: float(population[i][j]) for j in range(len(population[i]))}
                 save_dict[f"gen{str(generation)}number{str(i)}"] = little_dict
 
             with open(save_dir / "evolve_population.yaml", "w") as outfile:
                 yaml.dump(save_dict, outfile, default_flow_style=False)
 
         # Adaptive elite size
-        elite_size = min_elite_size + int(
-            (max_elite_size - min_elite_size) * (generation / opt.evolve)
-        )
+        elite_size = min_elite_size + int((max_elite_size - min_elite_size) * (generation / opt.evolve))
         # Evaluate the fitness of each individual in the population
         fitness_scores = []
         for individual in population:
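Note: the elite size grows linearly from `min_elite_size` to `max_elite_size` over the `opt.evolve` generations, shifting the GA from exploration early to exploitation late; the mutation rate and tournament size below follow the same linear-schedule idea in the opposite direction. A worked example with hypothetical bounds:

```python
min_elite_size, max_elite_size, generations = 2, 5, 300  # illustrative bounds

for generation in (0, 150, 299):
    elite_size = min_elite_size + int((max_elite_size - min_elite_size) * (generation / generations))
    print(generation, elite_size)  # 0 -> 2, 150 -> 3, 299 -> 4
```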
@@ -1147,25 +1021,16 @@ def main(opt, callbacks=Callbacks()):
             # Adaptive tournament size
             tournament_size = max(
                 max(2, tournament_size_min),
-                int(
-                    min(tournament_size_max, pop_size)
-                    - (generation / (opt.evolve / 10))
-                ),
+                int(min(tournament_size_max, pop_size) - (generation / (opt.evolve / 10))),
             )
             # Perform tournament selection to choose the best individual
             tournament_indices = random.sample(range(pop_size), tournament_size)
             tournament_fitness = [fitness_scores[j] for j in tournament_indices]
-            winner_index = tournament_indices[
-                tournament_fitness.index(max(tournament_fitness))
-            ]
+            winner_index = tournament_indices[tournament_fitness.index(max(tournament_fitness))]
             selected_indices.append(winner_index)
 
         # Add the elite individuals to the selected indices
-        elite_indices = [
-            i
-            for i in range(pop_size)
-            if fitness_scores[i] in sorted(fitness_scores)[-elite_size:]
-        ]
+        elite_indices = [i for i in range(pop_size) if fitness_scores[i] in sorted(fitness_scores)[-elite_size:]]
         selected_indices.extend(elite_indices)
         # Create the next generation through crossover and mutation
         next_generation = []
@@ -1182,25 +1047,18 @@ def main(opt, callbacks=Callbacks()):
             )
             if random.uniform(0, 1) < crossover_rate:
                 crossover_point = random.randint(1, len(hyp_GA) - 1)
-                child = (
-                    population[parent1_index][:crossover_point]
-                    + population[parent2_index][crossover_point:]
-                )
+                child = population[parent1_index][:crossover_point] + population[parent2_index][crossover_point:]
             else:
                 child = population[parent1_index]
             # Adaptive mutation rate
             mutation_rate = max(
                 mutation_rate_min,
-                min(
-                    mutation_rate_max, mutation_rate_max - (generation / opt.evolve)
-                ),
+                min(mutation_rate_max, mutation_rate_max - (generation / opt.evolve)),
             )
             for j in range(len(hyp_GA)):
                 if random.uniform(0, 1) < mutation_rate:
                     child[j] += random.uniform(-0.1, 0.1)
-                    child[j] = min(
-                        max(child[j], gene_ranges[j][0]), gene_ranges[j][1]
-                    )
+                    child[j] = min(max(child[j], gene_ranges[j][0]), gene_ranges[j][1])
             next_generation.append(child)
         # Replace the old population with the new generation
         population = next_generation
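Note: each child in the loop above is produced by tournament selection (keep the fittest of a random subset), one-point crossover between two winners, and per-gene mutation clamped to the gene's range. A condensed, self-contained version of that pipeline with made-up fitness values:

```python
import random

random.seed(0)
pop_size, n_genes = 8, 4
gene_ranges = [(0.0, 1.0)] * n_genes
population = [[random.uniform(lo, hi) for lo, hi in gene_ranges] for _ in range(pop_size)]
fitness_scores = [sum(ind) for ind in population]  # toy fitness: gene sum


def tournament(k=3):
    """Return the index of the fittest of k randomly sampled individuals."""
    idx = random.sample(range(pop_size), k)
    return max(idx, key=lambda i: fitness_scores[i])


p1, p2 = tournament(), tournament()
point = random.randint(1, n_genes - 1)  # one-point crossover
child = population[p1][:point] + population[p2][point:]
for j in range(n_genes):  # mutate, then clamp back into the gene's range
    if random.uniform(0, 1) < 0.3:
        child[j] += random.uniform(-0.1, 0.1)
        child[j] = min(max(child[j], gene_ranges[j][0]), gene_ranges[j][1])
print(child)
```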
utils/autobatch.py
@@ -14,9 +14,7 @@ def check_train_batch_size(model, imgsz=640, amp=True):
     """Checks and computes optimal training batch size for YOLOv5 model, given image size and AMP setting."""
     if check_version(torch.__version__, "2.4.0"):
         with torch.amp.autocast("cuda", enabled=amp):
-            return autobatch(
-                deepcopy(model).train(), imgsz
-            )  # compute optimal batch size
+            return autobatch(deepcopy(model).train(), imgsz)  # compute optimal batch size
     with torch.cuda.amp.autocast(amp):
         return autobatch(deepcopy(model).train(), imgsz)  # compute optimal batch size
@@ -34,14 +32,10 @@ def autobatch(model, imgsz=640, fraction=0.8, batch_size=16):
     LOGGER.info(f"{prefix}Computing optimal batch size for --imgsz {imgsz}")
     device = next(model.parameters()).device  # get model device
     if device.type == "cpu":
-        LOGGER.info(
-            f"{prefix}CUDA not detected, using default CPU batch-size {batch_size}"
-        )
+        LOGGER.info(f"{prefix}CUDA not detected, using default CPU batch-size {batch_size}")
         return batch_size
     if torch.backends.cudnn.benchmark:
-        LOGGER.info(
-            f"{prefix} ⚠️ Requires torch.backends.cudnn.benchmark=False, using default batch-size {batch_size}"
-        )
+        LOGGER.info(f"{prefix} ⚠️ Requires torch.backends.cudnn.benchmark=False, using default batch-size {batch_size}")
         return batch_size
 
     # Inspect CUDA memory
@@ -52,9 +46,7 @@ def autobatch(model, imgsz=640, fraction=0.8, batch_size=16):
     r = torch.cuda.memory_reserved(device) / gb  # GiB reserved
     a = torch.cuda.memory_allocated(device) / gb  # GiB allocated
     f = t - (r + a)  # GiB free
-    LOGGER.info(
-        f"{prefix}{d} ({properties.name}) {t:.2f}G total, {r:.2f}G reserved, {a:.2f}G allocated, {f:.2f}G free"
-    )
+    LOGGER.info(f"{prefix}{d} ({properties.name}) {t:.2f}G total, {r:.2f}G reserved, {a:.2f}G allocated, {f:.2f}G free")
 
     # Profile batch sizes
     batch_sizes = [1, 2, 4, 8, 16]
@@ -74,12 +66,8 @@ def autobatch(model, imgsz=640, fraction=0.8, batch_size=16):
         b = batch_sizes[max(i - 1, 0)]  # select prior safe point
     if b < 1 or b > 1024:  # b outside of safe range
         b = batch_size
-        LOGGER.warning(
-            f"{prefix}WARNING ⚠️ CUDA anomaly detected, recommend restart environment and retry command."
-        )
+        LOGGER.warning(f"{prefix}WARNING ⚠️ CUDA anomaly detected, recommend restart environment and retry command.")
 
     fraction = (np.polyval(p, b) + r + a) / t  # actual fraction predicted
-    LOGGER.info(
-        f"{prefix}Using batch-size {b} for {d} {t * fraction:.2f}G/{t:.2f}G ({fraction * 100:.0f}%) ✅"
-    )
+    LOGGER.info(f"{prefix}Using batch-size {b} for {d} {t * fraction:.2f}G/{t:.2f}G ({fraction * 100:.0f}%) ✅")
     return b
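Note: `autobatch` profiles memory at batch sizes 1 through 16, fits a degree-1 polynomial of usage vs. batch size, and solves it for the batch that fills `fraction` of total VRAM; the hunk's `np.polyval(p, b) + r + a` line then reports the predicted fill. A numpy sketch of the fit-and-solve step with fabricated measurements:

```python
import numpy as np

t, r, a = 16.0, 0.5, 0.3  # GiB total / reserved / allocated (illustrative)
fraction = 0.8
batch_sizes = np.array([1, 2, 4, 8, 16])
y = np.array([0.5, 0.9, 1.7, 3.3, 6.5])  # profiled GiB per batch size (made up)

p = np.polyfit(batch_sizes, y, deg=1)  # memory ~= p[0] * batch + p[1]
b = int((fraction * t - (r + a) - p[1]) / p[0])  # batch size hitting the target fraction
predicted = (np.polyval(p, b) + r + a) / t  # actual fraction predicted, as in the hunk above
print(b, f"{predicted * 100:.0f}%")  # 29, 78%
```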