Refactor code for speed and clarity

pull/13198/head
Glenn Jocher 2024-07-17 22:26:29 +02:00
parent 8003649c79
commit 90cd326b3f
4 changed files with 3260 additions and 248 deletions

models/experimental.py

@@ -14,8 +14,19 @@ class Sum(nn.Module):
"""Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070."""
def __init__(self, n, weight=False):
"""Initializes a module to sum outputs of layers with number of inputs `n` and optional weighting, supporting 2+
inputs.
"""
Initialize the Sum module to aggregate outputs from multiple layers, optionally with weights.
Args:
n (int): Number of layers to sum. Must be 2 or more.
weight (bool): If True, applies weights to the inputs before summing.
Returns:
None
Notes:
Refer to "Weighted sum of 2 or more layers" at https://arxiv.org/abs/1911.09070 for detailed insights
and usage scenarios.
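Example:
A minimal usage sketch (shapes are illustrative; assumes `torch` is imported):
```python
sum_layer = Sum(n=3, weight=True)
inputs = [torch.rand(1, 64), torch.rand(1, 64), torch.rand(1, 64)]
combined = sum_layer(inputs)  # same shape as each input: (1, 64)
```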
"""
super().__init__()
self.weight = weight # apply weights boolean
@@ -24,7 +35,26 @@ class Sum(nn.Module):
self.w = nn.Parameter(-torch.arange(1.0, n) / 2, requires_grad=True) # layer weights
def forward(self, x):
"""Processes input through a customizable weighted sum of `n` inputs, optionally applying learned weights."""
"""
Compute a weighted or unweighted sum of input tensors.
Args:
x (list[torch.Tensor]): List of input tensors to be summed, with each tensor having the same shape (N, D).
Returns:
(torch.Tensor): The resulting tensor after summing the input tensors, maintaining the same shape (N, D).
Example:
```python
sum_layer = Sum(n=3, weight=False)
inputs = [torch.rand(1, 10), torch.rand(1, 10), torch.rand(1, 10)]
result = sum_layer.forward(inputs)
```
Note:
If `weight` is set to True when initializing the class, weights will be applied to the inputs before summing.
For more information, refer to "Weighted sum of 2 or more layers" at https://arxiv.org/abs/1911.09070.
"""
y = x[0] # no weight
if self.weight:
w = torch.sigmoid(self.w) * 2
@@ -40,8 +70,29 @@ class MixConv2d(nn.Module):
"""Mixed Depth-wise Conv https://arxiv.org/abs/1907.09595."""
def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True):
"""Initializes MixConv2d with mixed depth-wise convolutional layers, taking input and output channels (c1, c2),
kernel sizes (k), stride (s), and channel distribution strategy (equal_ch).
"""
Initialize the MixConv2d module, handling mixed depth-wise convolutional operations.
Args:
c1 (int): Number of input channels (C1).
c2 (int): Number of output channels (C2).
k (tuple[int]): Kernel sizes for the convolutional layers.
s (int): Stride value for the convolutional layers.
equal_ch (bool): Flag to determine if channels are distributed equally. True for equal channels per group, False
for equal weight.numel() per group.
Example:
```python
mixconv = MixConv2d(c1=32, c2=64, k=(1, 3, 5), s=1, equal_ch=True)
output = mixconv(input_tensor)
```
Note:
The `MixConv2d` layer applies multiple depth-wise convolutions with different kernel sizes in parallel, which
can capture multi-scale features within a single layer. This technique is particularly useful for improving
spatial feature extraction and reducing model complexity.
Further reading: https://arxiv.org/abs/1907.09595
"""
super().__init__()
n = len(k) # number of convolutions
@@ -63,8 +114,24 @@ class MixConv2d(nn.Module):
self.act = nn.SiLU()
def forward(self, x):
"""Performs forward pass by applying SiLU activation on batch-normalized concatenated convolutional layer
outputs.
"""
Perform forward pass by applying mixed depth-wise convolutions followed by batch normalization and SiLU activation.
Args:
x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels,
H is the height, and W is the width.
Returns:
(torch.Tensor): Output tensor after applying mixed convolutions, batch normalization, and SiLU activation,
maintaining the shape (N, C', H', W') where C' is the output channels based on the convolutional layer
configuration.
Example:
```python
mixconv = MixConv2d(c1=32, c2=64, k=(1, 3), s=1)
x = torch.randn(16, 32, 128, 128)
output = mixconv(x)
```
"""
return self.act(self.bn(torch.cat([m(x) for m in self.m], 1)))
@@ -73,11 +140,51 @@ class Ensemble(nn.ModuleList):
"""Ensemble of models."""
def __init__(self):
"""Initializes an ensemble of models to be used for aggregated predictions."""
"""
Initializes an ensemble of models for combined inference and aggregated predictions.
Example:
```python
ensemble = Ensemble()
model1 = MyModel1()
model2 = MyModel2()
ensemble.append(model1)
ensemble.append(model2)
```
"""
super().__init__()
def forward(self, x, augment=False, profile=False, visualize=False):
"""Performs forward pass aggregating outputs from an ensemble of models.."""
"""
Aggregates outputs from multiple models in the ensemble by concatenating them during the forward pass.
Args:
x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels,
H is the height, and W is the width.
augment (bool): Flag to apply test-time augmentation (TTA) during inference. Default is False.
profile (bool): If True, enables profiling of the forward pass. Default is False.
visualize (bool): If True, enables visualization of model predictions. Default is False.
Returns:
(torch.Tensor): Aggregated output tensor from the ensemble models, with shape dependent on the number of models
and their architectures.
Example:
```python
from models.experimental import Ensemble
import torch
# Initialize the ensemble
ensemble = Ensemble()
# Assume models are already added to the ensemble
# Create a dummy input tensor
x = torch.randn(8, 3, 640, 640) # Example input for 8 images of 3 channels and 640x640 resolution
# Perform forward pass
output = ensemble.forward(x, augment=False, profile=False, visualize=False)
```
"""
y = [module(x, augment, profile, visualize)[0] for module in self]
# y = torch.stack(y).max(0)[0] # max ensemble
# y = torch.stack(y).mean(0) # mean ensemble
@@ -87,9 +194,32 @@ class Ensemble(nn.ModuleList):
def attempt_load(weights, device=None, inplace=True, fuse=True):
"""
Loads and fuses an ensemble or single YOLOv5 model from weights, handling device placement and model adjustments.
Example inputs: weights=[a,b,c] or a single model weights=[a] or weights=a.
Loads and fuses a YOLOv5 model or an ensemble of models from provided weights, adjusting device placement and model
attributes for optimal performance.
Args:
weights (str | list[str]): Path(s) to model weight file(s). It can be a single path or a list of paths.
device (torch.device | None, optional): Device to load the model on. If None, loads on CPU by default.
inplace (bool, optional): If True, enables inplace operations in certain layers like activation layers.
Defaults to True.
fuse (bool, optional): Whether to fuse Conv2d + BatchNorm2d layers for speedup during inference. Defaults to True.
Returns:
(torch.nn.Module): Loaded YOLOv5 model or an ensemble of models loaded onto the specified device.
Example:
```python
# Load a single model weight
model = attempt_load('yolov5s.pt')
# Load an ensemble of models
model = attempt_load(['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt'])
```
Note:
- This function ensures compatibility and performance optimization by adjusting attributes and configurations of the
loaded model(s).
- If `fuse` is set to True, it will fuse Conv2d and BatchNorm2d layers within the model(s) to speed up inference.
"""
from models.yolo import Detect, Model

models/yolo.py

@@ -76,7 +76,27 @@ class Detect(nn.Module):
export = False # export mode
def __init__(self, nc=80, anchors=(), ch=(), inplace=True):
"""Initializes YOLOv5 detection layer with specified classes, anchors, channels, and inplace operations."""
"""
Initializes the YOLOv5 Detect layer with class count, anchors, channels, and inplace operations.
Args:
nc (int, optional): Number of classes. Default is 80.
anchors (tuple, optional): Anchor box dimensions, typically specified for each detection layer. Default is ().
ch (tuple, optional): Number of input channels for each detection layer. Default is ().
inplace (bool, optional): If True, operations are done inplace. Default is True.
Returns:
None
Example:
```python
detect_layer = Detect(nc=80, anchors=(), ch=(256, 512, 1024), inplace=True)
```
Note:
This function initializes detection heads in the YOLOv5 model, setting up convolution layers, grids, and
anchor grids required for object detection inference.
"""
super().__init__()
self.nc = nc # number of classes
self.no = nc + 5 # number of outputs per anchor
@@ -89,7 +109,23 @@ class Detect(nn.Module):
self.inplace = inplace # use inplace ops (e.g. slice assignment)
def forward(self, x):
"""Processes input through YOLOv5 layers, altering shape for detection: `x(bs, 3, ny, nx, 85)`."""
"""
Processes input through detection layers, reshaping and applying convolution for YOLOv5 inference.
Args:
x (list[torch.Tensor]): List of feature maps from backbone with shape (B, C, H, W) where B is the batch
size, C is the number of channels, and H and W are height and width.
Returns:
(list[torch.Tensor]): List of processed detections, each a torch Tensor with shape (B, N, D) where B
is the batch size, N is the number of detections, and D is the dimensions of each detection
(e.g., bounding box coordinates, objectness score, class probabilities).
Note:
This method applies a series of convolutions to transform the input feature maps into detection
outputs. It also handles reshaping and permutation to align with YOLOv5's output format. During
inference, additional steps are performed to compute final object locations and dimensions.
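Example:
A standalone sketch in training mode (anchor and channel values are illustrative; assumes `torch` is imported):
```python
anchors = ([10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326])
detect = Detect(nc=80, anchors=anchors, ch=(128, 256, 512))
detect.train()  # training mode returns raw per-layer tensors
feats = [torch.randn(1, c, s, s) for c, s in zip((128, 256, 512), (80, 40, 20))]
outputs = detect(feats)  # list of 3 tensors, each shaped (1, 3, ny, nx, 85)
```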
"""
z = [] # inference output
for i in range(self.nl):
x[i] = self.m[i](x[i]) # conv
@@ -115,7 +151,29 @@ class Detect(nn.Module):
return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)
def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, "1.10.0")):
"""Generates a mesh grid for anchor boxes with optional compatibility for torch versions < 1.10."""
"""
Generate a mesh grid for anchor boxes with torch version compatibility for detection models.
Args:
nx (int): Number of grid cells along the x-axis.
ny (int): Number of grid cells along the y-axis.
i (int): Index of the detection layer for which the grid is being generated.
torch_1_10 (bool): Indicator whether the torch version is at least 1.10.0 for meshgrid compatibility.
Returns:
(tuple[torch.Tensor, torch.Tensor]): A tuple containing two tensors:
- grid (torch.Tensor): The generated grid with shape (1, num_anchors, ny, nx, 2), containing xy coordinates.
- anchor_grid (torch.Tensor): The anchor grid scaled by the stride, with shape (1, num_anchors, ny, nx, 2).
Example:
```python
detector = Detect()
grid, anchor_grid = detector._make_grid(20, 20, 0)
```
Note:
The function ensures compatibility with different torch versions by using appropriate meshgrid indexing options.
"""
d = self.anchors[i].device
t = self.anchors[i].dtype
shape = 1, self.na, ny, nx, 2 # grid shape
@@ -129,7 +187,25 @@ class Detect(nn.Module):
class Segment(Detect):
# YOLOv5 Segment head for segmentation models
def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), inplace=True):
"""Initializes YOLOv5 Segment head with options for mask count, protos, and channel adjustments."""
"""
Initializes YOLOv5 Segment head with parameters for masks, prototypes, class count, anchors, and channels.
Args:
nc (int): Number of classes for the segmentation model (default is 80).
anchors (tuple): Tuple of anchor box dimensions for the segmentation model.
nm (int): Number of masks for the segmentation (default is 32).
npr (int): Number of prototypes for the masks (default is 256).
ch (tuple): Tuple of input channels for each detection layer.
inplace (bool): If True, use in-place operations for layer computations (default is True).
Returns:
None
Example:
```python
segment_head = Segment(nc=80, anchors=anchors, nm=32, npr=256, ch=[512, 256, 128], inplace=True)
```
"""
super().__init__(nc, anchors, ch, inplace)
self.nm = nm # number of masks
self.npr = npr # number of protos
@@ -139,8 +215,38 @@ class Segment(Detect):
self.detect = Detect.forward
def forward(self, x):
"""Processes input through the network, returning detections and prototypes; adjusts output based on
training/export mode.
"""
Processes input through the network, returning detections and prototypes.
Args:
x (list[torch.Tensor]): List of input tensors corresponding to different detection layers, each with shape
(B, C, H, W), where B is batch size, C is number of channels, H and W are height and width.
Returns:
(tuple[torch.Tensor, torch.Tensor]): A tuple containing:
- `detection` (torch.Tensor): The detection output tensor with shape (B, N, 85), where B is batch size, N is
the number of detections.
- `prototypes` (torch.Tensor): The prototype masks tensor produced by the network with shape (B, P, H', W'),
where B is batch size, P is the number of prototypes, and H' and W' correspond to height and width.
Example:
```python
import torch
from models.yolo import Segment
# Initialize the Segment head (anchor and channel values are illustrative)
anchors = ([10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326])
model = Segment(nc=80, anchors=anchors, ch=(128, 256, 512))
# Generate dummy multi-scale feature maps
x = [torch.randn(1, c, s, s) for c, s in zip((128, 256, 512), (80, 40, 20))]
# Forward pass (training mode returns detections and prototypes)
detection, prototypes = model(x)
```
Note:
During inference (evaluation mode), detection outputs are post-processed to generate final bounding boxes and classes.
In training mode, the outputs are not processed.
"""
p = self.proto(x[0])
x = self.detect(self, x)
@@ -151,13 +257,64 @@ class BaseModel(nn.Module):
"""YOLOv5 base model."""
def forward(self, x, profile=False, visualize=False):
"""Executes a single-scale inference or training pass on the YOLOv5 base model, with options for profiling and
visualization.
"""
Perform a forward pass through the YOLOv5 model, optionally profiling and visualizing features.
Args:
x (torch.Tensor): Input data tensor with shape (N, C, H, W).
profile (bool): Whether to profile execution time of each layer. Defaults to False.
visualize (bool): Whether to store and visualize feature maps. Defaults to False.
Returns:
(torch.Tensor | list[torch.Tensor]): In training mode, returns a list of prediction tensors, each with shape
(N, 3, H, W, no). In inference mode, returns a tensor of candidate detections with shape (N, M, no); non-maximum
suppression (NMS) is applied downstream.
Example:
```python
model = BaseModel()
input_tensor = torch.randn(1, 3, 640, 640)
output = model.forward(input_tensor, profile=True, visualize=True)
```
Note:
- In training mode, the method returns unprocessed predictions for each scale, suitable for loss calculation.
- In inference mode, non-maximum suppression is applied to refine predictions.
"""
return self._forward_once(x, profile, visualize) # single-scale inference, train
def _forward_once(self, x, profile=False, visualize=False):
"""Performs a forward pass on the YOLOv5 model, enabling profiling and feature visualization options."""
"""
Execute a forward pass through the YOLOv5 model layers with optional profiling and visualization.
Args:
x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is the batch size, C is the number
of channels, and H and W are the height and width of the input image, respectively.
profile (bool): If True, profiles the execution time for each layer. Defaults to False.
visualize (bool): If True, stores and visualizes feature maps. Defaults to False.
Returns:
(torch.Tensor): Model output tensor with shape depending on whether the model is in training or
inference mode.
- In training mode: Returns a list of tensors for each detection layer, each tensor has shape
(N, 3, H, W, no), where `no` is the number of outputs per anchor.
- In inference mode: If not exporting, returns a tuple with a single tensor of shape (N, M, no),
where M is the number of predicted objects.
- If exporting: Returns a tensor of shape (N, M, no).
Example:
```python
model = BaseModel()
input_tensor = torch.randn(1, 3, 640, 640) # Generate a random input tensor
output = model._forward_once(input_tensor, profile=True, visualize=True)
```
Note:
This method conducts a single-scale inference or training pass through the model. Depending on the mode
(training or inference), the method behaves differently. In training mode, it returns unprocessed
predictions for each detection layer. In inference mode, non-maximum suppression (NMS) is typically
applied after this method to refine predictions.
"""
y, dt = [], [] # outputs
for m in self.model:
if m.f != -1: # if not from previous layer
@@ -171,7 +328,32 @@ class BaseModel(nn.Module):
return x
def _profile_one_layer(self, m, x, dt):
"""Profiles a single layer's performance by computing GFLOPs, execution time, and parameters."""
"""
Profiles a single model layer's GFLOPs, parameters, and execution time within the YOLOv5 model.
Args:
m (nn.Module): The model layer to be profiled.
x (torch.Tensor): Input tensor passed to the model layer, with shape (N, C, H, W).
dt (list[float]): List to record execution times of the profiled layer.
Returns:
None: The function updates the `dt` list with the execution time of the layer in milliseconds.
Example:
```python
model = BaseModel()
layer = nn.Conv2d(3, 16, 3, 1) # Example layer
input_tensor = torch.randn(1, 3, 640, 640) # Example input
execution_times = []
model._profile_one_layer(layer, input_tensor, execution_times)
```
Note:
- Profiling is done for the purpose of understanding the computational load (GFLOPs) and time taken per layer within
the YOLOv5 model.
- If the `thop` library is not available, FLOPs computation will not be performed.
"""
c = m == self.model[-1] # is final layer, copy input as inplace fix
o = thop.profile(m, inputs=(x.copy() if c else x,), verbose=False)[0] / 1e9 * 2 if thop else 0 # FLOPs
t = time_sync()
@@ -185,7 +367,25 @@ class BaseModel(nn.Module):
LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s} Total")
def fuse(self):
"""Fuses Conv2d() and BatchNorm2d() layers in the model to improve inference speed."""
"""
Fuses Conv2d and BatchNorm2d layers in the model to optimize inference speed.
This method modifies the model in place by merging Conv2d and BatchNorm2d layers into single Conv2d
layers where applicable. This can significantly improve inference speed and reduce memory usage.
Returns:
None
Example:
```python
model = BaseModel()
model.fuse()
```
Note:
After fusing layers, the forward method of fused layers is updated to `forward_fuse`, optimizing
the execution path.
"""
LOGGER.info("Fusing layers... ")
for m in self.model.modules():
if isinstance(m, (Conv, DWConv)) and hasattr(m, "bn"):
@@ -196,12 +396,44 @@ class BaseModel(nn.Module):
return self
def info(self, verbose=False, img_size=640):
"""Prints model information given verbosity and image size, e.g., `info(verbose=True, img_size=640)`."""
"""
Display model summary, including layer details and computational complexity for a specified image size.
Args:
verbose (bool): If True, prints a detailed summary including information about each layer. Defaults to False.
img_size (int | tuple[int]): Size of the input image as an integer (for square images) or tuple (H, W).
Defaults to 640.
Returns:
(None): This function does not return any value. It directly prints the model summary to the console.
Example:
```python
model = BaseModel()
model.info(verbose=True, img_size=640)
```
Note:
Ensure that the `verbose` parameter is set to True for a comprehensive layer-by-layer summary. The image size should
be supplied based on the expected input size for the model.
"""
model_info(self, verbose, img_size)
def _apply(self, fn):
"""Applies transformations like to(), cpu(), cuda(), half() to model tensors excluding parameters or registered
buffers.
"""
Apply a function to the model and its layer parameters, including specific modifications for Detect and Segment layers.
Args:
fn (function): A function to apply to the model's tensors.
Returns:
(torch.nn.Module): The module with applied transformations.
Note:
The function is particularly useful for operations like converting tensors to a target device
(e.g., CUDA, CPU) or altering their precision (e.g., float16). The Detect layer's stride and grid
parameters, as well as the Segment layer's anchor grids, are specifically modified to ensure consistency
after such transformations.
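Example:
A minimal sketch; `_apply` is not called directly but runs under methods such as `.to()`, `.cuda()`, or `.half()`:
```python
model = DetectionModel(cfg="yolov5s.yaml")
model = model.half()  # _apply casts parameters plus Detect strides, grids, and anchor grids
```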
"""
self = super()._apply(fn)
m = self.model[-1] # Detect()
@@ -216,7 +448,36 @@ class BaseModel(nn.Module):
class DetectionModel(BaseModel):
# YOLOv5 detection model
def __init__(self, cfg="yolov5s.yaml", ch=3, nc=None, anchors=None):
"""Initializes YOLOv5 model with configuration file, input channels, number of classes, and custom anchors."""
"""
Initializes YOLOv5 model using the specified config, input channels, class count, and custom anchors.
Args:
cfg (str | dict): Model configuration, either a path to a YAML config file or a configuration dictionary.
ch (int): Number of input channels. Defaults to 3.
nc (int | None): Number of classes. If provided, overrides the value in the YAML file/config dictionary. Defaults to None.
anchors (list[int] | None): Custom anchors. If provided, overrides the anchors defined in the YAML file/config
dictionary. Defaults to None.
Returns:
None
Example:
```python
from models.yolo import DetectionModel
# Initialize model with path to YAML config
model1 = DetectionModel(cfg="yolov5s.yaml")
# Initialize model with configuration dictionary
cfg_dict = {"nc": 80, "depth_multiple": 0.33, "width_multiple": 0.50}
model2 = DetectionModel(cfg=cfg_dict, ch=3, nc=80)
```
Note:
If `cfg` is a dictionary, it should include the necessary parameters such as `nc`, `depth_multiple`, and `width_multiple`.
During initialization, the model configuration from the YAML file or dictionary is parsed, and the internal model
structure is built accordingly. This includes defining the detection layers and adjusting anchors and strides.
"""
super().__init__()
if isinstance(cfg, dict):
self.yaml = cfg # model dict
@@ -261,13 +522,64 @@ class DetectionModel(BaseModel):
LOGGER.info("")
def forward(self, x, augment=False, profile=False, visualize=False):
"""Performs single-scale or augmented inference and may include profiling or visualization."""
"""
Perform forward pass through the YOLOv5 detection model for training or inference, with options for augmentation,
profiling, and visualization.
Args:
x (torch.Tensor): Input tensor with a shape of (N, C, H, W), where N is the batch size, C is the number of channels,
H is the height, and W is the width.
augment (bool): If True, performs augmented inference. Defaults to False.
profile (bool): If True, profiles the execution time of each layer. Defaults to False.
visualize (bool): If True, stores and visualizes feature maps. Defaults to False.
Returns:
(torch.Tensor | tuple): Depending on the mode, returns either:
- In training mode: tuple containing predictions for each scale with shapes (N, 3, H, W, no).
- In inference mode: tensor with shape (N, M, no), where M is the number of candidate detections before
non-maximum suppression.
- When exporting: tuple containing concatenated inference output tensor and intermediate feature maps.
Example:
```python
model = DetectionModel(cfg="yolov5s.yaml", ch=3, nc=80)
input_tensor = torch.randn(1, 3, 640, 640)
output = model.forward(input_tensor, augment=False, profile=True, visualize=False)
```
Note:
This method adapts to training and inference modes, with different return types based on the operational mode.
During training mode, it returns raw predictions across various scales for loss calculation, whereas in inference
mode, non-maximum suppression (NMS) is applied to refine predictions.
"""
if augment:
return self._forward_augment(x) # augmented inference, None
return self._forward_once(x, profile, visualize) # single-scale inference, train
def _forward_augment(self, x):
"""Performs augmented inference across different scales and flips, returning combined detections."""
"""
Performs augmented inference by processing input across different scales and flips, merging the outputs.
Args:
x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is batch size, C is number of channels,
H and W are height and width.
Returns:
(torch.Tensor): Merged output tensor after multi-scale and flip augmentations, with shape (N, M, no),
where N is batch size, M is the number of predictions, and no is the number of output features.
Example:
```python
model = DetectionModel(cfg='yolov5s.yaml')
input_tensor = torch.randn(1, 3, 640, 640)
output = model._forward_augment(input_tensor)
```
Note:
The function processes the input using different scales (1, 0.83, 0.67) and flips (None, horizontal),
descaling predictions before merging. This helps to improve model robustness and accuracy
during inference.
"""
img_size = x.shape[-2:] # height, width
s = [1, 0.83, 0.67] # scales
f = [None, 3, None] # flips (2-ud, 3-lr)
@@ -282,7 +594,23 @@ class DetectionModel(BaseModel):
return torch.cat(y, 1), None # augmented inference, train
def _descale_pred(self, p, flips, scale, img_size):
"""De-scales predictions from augmented inference, adjusting for flips and image size."""
"""
Adjusts predictions for augmented inference by de-scaling and correcting for flips or image size changes.
Args:
p (torch.Tensor): Predictions tensor with shape (..., N) where N indicates prediction attributes like
bounding box coordinates, confidence score, etc.
flips (int | None): Specifies flip mode. `2` for vertical flip, `3` for horizontal flip, and `None` for no flip.
scale (float): Scale factor used during augmentation.
img_size (tuple[int, int]): Original image dimensions as (height, width).
Returns:
(torch.Tensor): Adjusted predictions tensor with the same shape as input, de-scaled and de-flipped appropriately.
Note:
If inplace operations are enabled, the adjustments are applied directly on the tensor. Otherwise, new tensors are
created for the adjusted values to avoid modifying the original input.
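Example:
A hedged sketch with dummy predictions (values are illustrative; assumes `torch` is imported):
```python
model = DetectionModel(cfg="yolov5s.yaml")
p = torch.randn(1, 100, 85)  # (batch, detections, outputs)
p = model._descale_pred(p, flips=3, scale=0.83, img_size=(640, 640))  # undo 0.83 scaling and horizontal flip
```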
"""
if self.inplace:
p[..., :4] /= scale # de-scale
if flips == 2:
@@ -299,8 +627,18 @@ class DetectionModel(BaseModel):
return p
def _clip_augmented(self, y):
"""Clips augmented inference tails for YOLOv5 models, affecting first and last tensors based on grid points and
layer counts.
"""
Clip augmented inference tails for YOLOv5 models, adjusting predictions from the first and last layers.
Args:
y (list[torch.Tensor]): List of tensors, where each tensor represents detections from augmented inference across different layers.
Returns:
(list[torch.Tensor]): Modified list of tensors with clipped augmented inference tails.
Notes:
This function helps to discard the augmented tails by adjusting predictions from the first and last layers,
which might otherwise introduce artifacts due to the augmentation process.
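Example:
A minimal sketch with dummy augmented outputs (detection counts are illustrative; assumes `torch` is imported):
```python
model = DetectionModel(cfg="yolov5s.yaml")
y = [torch.randn(1, 25200, 85), torch.randn(1, 18522, 85), torch.randn(1, 6300, 85)]
y = model._clip_augmented(y)  # trims the large-scale tail of y[0] and the small-scale head of y[-1]
```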
"""
nl = self.model[-1].nl # number of detection layers (P3-P5)
g = sum(4**x for x in range(nl)) # grid points
@@ -313,9 +651,36 @@ class DetectionModel(BaseModel):
def _initialize_biases(self, cf=None):
"""
Initializes biases for YOLOv5's Detect() module, optionally using class frequencies (cf).
For details see https://arxiv.org/abs/1708.02002 section 3.3.
Initialize biases for the YOLOv5 Detect module using specified or default bias adjustments.
Args:
cf (torch.Tensor | None): Optional tensor representing class frequencies for bias initialization. The shape should be
(N,), where N is the number of classes. If not provided, default adjustments are applied based on the number of
classes and image dimensions.
Returns:
None. Biases of the final Detect module are updated in place.
Note:
The function calculates the biases based on principles from https://arxiv.org/abs/1708.02002, section 3.3. If class
frequencies (`cf`) are not provided, default bias adjustments are made. Adjustments primarily ensure that objectness and
class biases are reasonably initialized for effective training.
Example:
```python
from models.yolo import DetectionModel
import torch
# Initialize model
model = DetectionModel(cfg="yolov5s.yaml")
# Optional class frequencies tensor
class_frequencies = torch.tensor([100, 150, 200])
# Initialize biases
model._initialize_biases(cf=class_frequencies)
```
"""
# cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
m = self.model[-1] # Detect() module
@@ -334,22 +699,91 @@ Model = DetectionModel # retain YOLOv5 'Model' class for backwards compatibility
class SegmentationModel(DetectionModel):
# YOLOv5 segmentation model
def __init__(self, cfg="yolov5s-seg.yaml", ch=3, nc=None, anchors=None):
"""Initializes a YOLOv5 segmentation model with configurable params: cfg (str) for configuration, ch (int) for channels, nc (int) for num classes, anchors (list)."""
"""
Initializes a YOLOv5 segmentation model with configurable parameters.
Args:
cfg (str): Path to the configuration file containing model architecture and parameters. Defaults to "yolov5s-seg.yaml".
ch (int): Number of input channels. Defaults to 3.
nc (int | None): Number of classes. If provided, overrides the number of classes specified in the cfg file.
anchors (list | None): List of anchor points. If provided, overrides the anchor configuration in the cfg file.
Returns:
(None): Initializes various components of the SegmentationModel class.
Example:
```python
from models.yolo import SegmentationModel
model = SegmentationModel()
```
Note:
The initialization includes setting up model layers, anchors, and other configurations based on the provided
or default configuration file.
"""
super().__init__(cfg, ch, nc, anchors)
class ClassificationModel(BaseModel):
# YOLOv5 classification model
def __init__(self, cfg=None, model=None, nc=1000, cutoff=10):
"""Initializes YOLOv5 model with config file `cfg`, input channels `ch`, number of classes `nc`, and `cuttoff`
index.
"""
Initializes a YOLOv5 classification model with either a configuration file or a pre-built model, specifying
the number of classes and a cutoff layer index.
Args:
cfg (str | None): Path to the model configuration file, or None if using `model`.
model (torch.nn.Module | None): Pre-built torch model, or None if using `cfg`.
nc (int): Number of output classes, default is 1000.
cutoff (int): Index of the cutoff layer, default is 10.
Returns:
None
Example:
```python
# Initializing from a configuration file
model = ClassificationModel(cfg='yolov5-class-config.yaml', nc=1000, cutoff=10)
# Initializing from an existing model
model = ClassificationModel(model=prebuilt_model, nc=1000, cutoff=10)
```
Note:
This model can be extended or customized by modifying the configuration file or the pre-built model.
"""
super().__init__()
self._from_detection_model(model, nc, cutoff) if model is not None else self._from_yaml(cfg)
def _from_detection_model(self, model, nc=1000, cutoff=10):
"""Creates a classification model from a YOLOv5 detection model, slicing at `cutoff` and adding a classification
layer.
"""
Perform a transformation from a YOLOv5 detection model to a classification model.
Args:
model (DetectionModel): A pre-trained YOLOv5 detection model.
nc (int): Number of classes for the classification model. Default is 1000.
cutoff (int): Index to slice the model's layers up to the classification layer. Default is 10.
Returns:
None. The function modifies the model in place.
Notes:
This function takes a detection model and transforms it into a classification model by slicing the model layers
at the specified cutoff point and adding a classification layer with the specified number of classes.
- If the input model is wrapped by `DetectMultiBackend`, it unwraps the model to get the underlying YOLOv5 model.
- Constructs a `Classify` layer, replacing the final detection layer with this new classification layer.
Example:
```python
from models.experimental import attempt_load
from models.yolo import ClassificationModel
# Load a pre-trained detection model
detection_model = attempt_load('yolov5s.pt')
# Create a classification model from the detection model
classification_model = ClassificationModel(model=detection_model, nc=1000, cutoff=10)
```
"""
if isinstance(model, DetectMultiBackend):
model = model.model # unwrap DetectMultiBackend
@@ -365,12 +799,49 @@ class ClassificationModel(BaseModel):
self.nc = nc
def _from_yaml(self, cfg):
"""Creates a YOLOv5 classification model from a specified *.yaml configuration file."""
"""
Initialize a YOLOv5 classification model from a *.yaml configuration file.
Args:
cfg (str): Path to the YOLOv5 YAML configuration file.
Returns:
None
Notes:
This method is currently a placeholder: it sets `self.model` to None rather than building the network from the
parsed configuration, so classification models are expected to be created via `_from_detection_model` instead.
"""
self.model = None
def parse_model(d, ch):
"""Parses a YOLOv5 model from a dict `d`, configuring layers based on input channels `ch` and model architecture."""
"""
Parses YOLOv5 model architecture from a configuration dictionary and initializes its layers.
Args:
d (dict): Dictionary containing model configuration. Must include keys: "anchors", "nc", "depth_multiple",
"width_multiple", and optionally "activation" and "channel_multiple".
ch (list[int]): List of input channels for each layer.
Returns:
(tuple[nn.Sequential, list[int]]): A tuple containing:
- `model` (nn.Sequential): The constructed YOLOv5 model based on the configuration.
- `save` (list[int]): List of layers whose outputs should be preserved during the forward pass.
Example:
```python
from pathlib import Path
import yaml
# Load model configuration YAML
with open(Path('yolov5s.yaml'), 'r') as file:
model_config = yaml.safe_load(file)
# Parse model and initialize
model, save = parse_model(model_config, ch=[3])
```
"""
LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}")
anchors, nc, gd, gw, act, ch_mul = (
d["anchors"],