Refactor code for speed and clarity
parent 8003649c79
commit 90cd326b3f
models/common.py · 1523 changed lines
@@ -14,8 +14,19 @@ class Sum(nn.Module):
     """Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070."""

     def __init__(self, n, weight=False):
-        """Initializes a module to sum outputs of layers with number of inputs `n` and optional weighting, supporting 2+
-        inputs.
-        """
+        """
+        Initialize the Sum module to aggregate outputs from multiple layers, optionally with weights.
+
+        Args:
+            n (int): Number of layers to sum. Must be 2 or more.
+            weight (bool): If True, applies weights to the inputs before summing.
+
+        Returns:
+            None
+
+        Notes:
+            Refer to "Weighted sum of 2 or more layers" at https://arxiv.org/abs/1911.09070 for detailed insights
+            and usage scenarios.
+        """
         super().__init__()
         self.weight = weight  # apply weights boolean
@@ -24,7 +35,26 @@ class Sum(nn.Module):
             self.w = nn.Parameter(-torch.arange(1.0, n) / 2, requires_grad=True)  # layer weights

     def forward(self, x):
-        """Processes input through a customizable weighted sum of `n` inputs, optionally applying learned weights."""
+        """
+        Compute a weighted or unweighted sum of input tensors.
+
+        Args:
+            x (list[torch.Tensor]): List of input tensors to be summed, with each tensor having the same shape (N, D).
+
+        Returns:
+            (torch.Tensor): The resulting tensor after summing the input tensors, maintaining the same shape (N, D).
+
+        Example:
+            ```python
+            sum_layer = Sum(n=3, weight=False)
+            inputs = [torch.rand(1, 10), torch.rand(1, 10), torch.rand(1, 10)]
+            result = sum_layer.forward(inputs)
+            ```
+
+        Note:
+            If `weight` is set to True when initializing the class, weights will be applied to the inputs before
+            summing. For more information, refer to "Weighted sum of 2 or more layers" at
+            https://arxiv.org/abs/1911.09070.
+        """
         y = x[0]  # no weight
         if self.weight:
             w = torch.sigmoid(self.w) * 2
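The loop body of `Sum.forward` is truncated above. For orientation, here is a minimal self-contained sketch of a module consistent with the visible lines and the new docstring; the loop over the remaining inputs is an assumption, not part of this commit:

```python
import torch
import torch.nn as nn


class Sum(nn.Module):
    """Weighted sum of 2 or more layers (https://arxiv.org/abs/1911.09070)."""

    def __init__(self, n, weight=False):
        super().__init__()
        self.weight = weight  # apply weights boolean
        self.iter = range(n - 1)  # indices of the remaining inputs (assumed)
        if weight:
            self.w = nn.Parameter(-torch.arange(1.0, n) / 2, requires_grad=True)  # layer weights

    def forward(self, x):
        y = x[0]  # no weight
        if self.weight:
            w = torch.sigmoid(self.w) * 2  # learned weights in (0, 2)
            for i in self.iter:
                y = y + x[i + 1] * w[i]
        else:
            for i in self.iter:
                y = y + x[i + 1]
        return y
```

With `weight=False` this is a plain element-wise sum; with `weight=True` each extra input is scaled by a learned factor in (0, 2).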
@@ -40,8 +70,29 @@ class MixConv2d(nn.Module):
     """Mixed Depth-wise Conv https://arxiv.org/abs/1907.09595."""

     def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True):
-        """Initializes MixConv2d with mixed depth-wise convolutional layers, taking input and output channels (c1, c2),
-        kernel sizes (k), stride (s), and channel distribution strategy (equal_ch).
-        """
+        """
+        Initialize the MixConv2d module, handling mixed depth-wise convolutional operations.
+
+        Args:
+            c1 (int): Number of input channels (C1).
+            c2 (int): Number of output channels (C2).
+            k (tuple[int]): Kernel sizes for the convolutional layers.
+            s (int): Stride value for the convolutional layers.
+            equal_ch (bool): Flag to determine if channels are distributed equally. True for equal channels per
+                group, False for equal weight.numel() per group.
+
+        Example:
+            ```python
+            mixconv = MixConv2d(c1=32, c2=64, k=(1, 3, 5), s=1, equal_ch=True)
+            output = mixconv(input_tensor)
+            ```
+
+        Note:
+            The `MixConv2d` layer applies multiple depth-wise convolutions with different kernel sizes in parallel,
+            which can capture multi-scale features within a single layer. This technique is particularly useful for
+            improving spatial feature extraction and reducing model complexity.
+
+            Further reading: https://arxiv.org/abs/1907.09595
+        """
         super().__init__()
         n = len(k)  # number of convolutions
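The `equal_ch` distinction in the docstring is easiest to see with numbers. A small sketch of the equal-channels split it describes (illustrative values; the actual split happens in the truncated body of `__init__`):

```python
import torch

# Hypothetical values: split c2 = 64 output channels across n = 3 kernel sizes.
c2, n = 64, 3
i = torch.linspace(0, n - 1e-6, c2).floor()  # group index assigned to each output channel
c_ = [int((i == g).sum()) for g in range(n)]  # channels per kernel-size group
print(c_)  # [22, 21, 21] -> as equal as possible per group
```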
@@ -63,8 +114,24 @@ class MixConv2d(nn.Module):
         self.act = nn.SiLU()

     def forward(self, x):
-        """Performs forward pass by applying SiLU activation on batch-normalized concatenated convolutional layer
-        outputs.
-        """
+        """
+        Perform forward pass by applying mixed depth-wise convolutions followed by batch normalization and SiLU
+        activation.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of
+                channels, H is the height, and W is the width.
+
+        Returns:
+            (torch.Tensor): Output tensor after applying mixed convolutions, batch normalization, and SiLU
+                activation, maintaining the shape (N, C', H', W') where C' is the output channels based on the
+                convolutional layer configuration.
+
+        Example:
+            ```python
+            mixconv = MixConv2d(c1=32, c2=64, k=(1, 3), s=1)
+            x = torch.randn(16, 32, 128, 128)
+            output = mixconv(x)
+            ```
+        """
         return self.act(self.bn(torch.cat([m(x) for m in self.m], 1)))
@@ -73,11 +140,51 @@ class Ensemble(nn.ModuleList):
     """Ensemble of models."""

     def __init__(self):
-        """Initializes an ensemble of models to be used for aggregated predictions."""
+        """
+        Initializes an ensemble of models for combined inference and aggregated predictions.
+
+        Example:
+            ```python
+            ensemble = Ensemble()
+            model1 = MyModel1()
+            model2 = MyModel2()
+            ensemble.append(model1)
+            ensemble.append(model2)
+            ```
+        """
         super().__init__()

     def forward(self, x, augment=False, profile=False, visualize=False):
-        """Performs forward pass aggregating outputs from an ensemble of models.."""
+        """
+        Aggregates outputs from multiple models in the ensemble by concatenating them during the forward pass.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of
+                channels, H is the height, and W is the width.
+            augment (bool): Flag to apply test-time augmentation (TTA) during inference. Default is False.
+            profile (bool): If True, enables profiling of the forward pass. Default is False.
+            visualize (bool): If True, enables visualization of model predictions. Default is False.
+
+        Returns:
+            (torch.Tensor): Aggregated output tensor from the ensemble models, with shape dependent on the number of
+                models and their architectures.
+
+        Example:
+            ```python
+            from ultralytics import Ensemble
+            import torch
+
+            # Initialize the ensemble
+            ensemble = Ensemble()
+            # Assume models are already added to the ensemble
+
+            # Create a dummy input tensor
+            x = torch.randn(8, 3, 640, 640)  # Example input for 8 images of 3 channels and 640x640 resolution
+
+            # Perform forward pass
+            output = ensemble.forward(x, augment=False, profile=False, visualize=False)
+            ```
+        """
         y = [module(x, augment, profile, visualize)[0] for module in self]
         # y = torch.stack(y).max(0)[0]  # max ensemble
         # y = torch.stack(y).mean(0)  # mean ensemble
@@ -87,9 +194,32 @@ class Ensemble(nn.ModuleList):

 def attempt_load(weights, device=None, inplace=True, fuse=True):
     """
-    Loads and fuses an ensemble or single YOLOv5 model from weights, handling device placement and model adjustments.
-
-    Example inputs: weights=[a,b,c] or a single model weights=[a] or weights=a.
+    Loads and fuses a YOLOv5 model or an ensemble of models from provided weights, adjusting device placement and
+    model attributes for optimal performance.
+
+    Args:
+        weights (str | list[str]): Path(s) to model weight file(s). It can be a single path or a list of paths.
+        device (torch.device | None, optional): Device to load the model on. If None, loads on CPU by default.
+        inplace (bool, optional): If True, enables inplace operations in certain layers like activation layers.
+            Defaults to True.
+        fuse (bool, optional): Whether to fuse Conv2d + BatchNorm2d layers for speedup during inference.
+            Defaults to True.
+
+    Returns:
+        (torch.nn.Module): Loaded YOLOv5 model or an ensemble of models loaded onto the specified device.
+
+    Example:
+        ```python
+        # Load a single model weight
+        model = attempt_load('yolov5s.pt')
+
+        # Load an ensemble of models
+        model = attempt_load(['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt'])
+        ```
+
+    Note:
+        - This function ensures compatibility and performance optimization by adjusting attributes and
+          configurations of the loaded model(s).
+        - If `fuse` is set to True, it will fuse Conv2d and BatchNorm2d layers within the model(s) to speed up
+          inference.
     """
     from models.yolo import Detect, Model
models/tf.py · 1300 changed lines (diff suppressed because it is too large)

models/yolo.py · 531 changed lines
@@ -76,7 +76,27 @@ class Detect(nn.Module):
     export = False  # export mode

     def __init__(self, nc=80, anchors=(), ch=(), inplace=True):
-        """Initializes YOLOv5 detection layer with specified classes, anchors, channels, and inplace operations."""
+        """
+        Initializes the YOLOv5 Detect layer with class count, anchors, channels, and inplace operations.
+
+        Args:
+            nc (int, optional): Number of classes. Default is 80.
+            anchors (tuple, optional): Anchor box dimensions, typically specified for each detection layer.
+                Default is ().
+            ch (tuple, optional): Number of input channels for each detection layer. Default is ().
+            inplace (bool, optional): If True, operations are done inplace. Default is True.
+
+        Returns:
+            None
+
+        Example:
+            ```python
+            detect_layer = Detect(nc=80, anchors=(), ch=(256, 512, 1024), inplace=True)
+            ```
+
+        Note:
+            This function initializes detection heads in the YOLOv5 model, setting up convolution layers, grids, and
+            anchor grids required for object detection inference.
+        """
         super().__init__()
         self.nc = nc  # number of classes
         self.no = nc + 5  # number of outputs per anchor
@@ -89,7 +109,23 @@ class Detect(nn.Module):
         self.inplace = inplace  # use inplace ops (e.g. slice assignment)

     def forward(self, x):
-        """Processes input through YOLOv5 layers, altering shape for detection: `x(bs, 3, ny, nx, 85)`."""
+        """
+        Processes input through detection layers, reshaping and applying convolution for YOLOv5 inference.
+
+        Args:
+            x (list[torch.Tensor]): List of feature maps from backbone with shape (B, C, H, W) where B is the batch
+                size, C is the number of channels, and H and W are height and width.
+
+        Returns:
+            (list[torch.Tensor]): List of processed detections, each a torch Tensor with shape (B, N, D) where B
+                is the batch size, N is the number of detections, and D is the dimensions of each detection
+                (e.g., bounding box coordinates, objectness score, class probabilities).
+
+        Note:
+            This method applies a series of convolutions to transform the input feature maps into detection
+            outputs. It also handles reshaping and permutation to align with YOLOv5's output format. During
+            inference, additional steps are performed to compute final object locations and dimensions.
+        """
         z = []  # inference output
         for i in range(self.nl):
             x[i] = self.m[i](x[i])  # conv
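The inference-time decode summarized in this docstring happens past the truncation. As a reference, a self-contained sketch of a YOLOv5-style decode for one detection layer; the shapes, anchors, and grid construction here are illustrative assumptions, not lines from this commit:

```python
import torch

# Illustrative shapes: batch 1, 3 anchors, 85 outputs (80 classes + 5), 20x20 grid, stride 32.
bs, na, no, ny, nx, stride = 1, 3, 85, 20, 20, 32
p = torch.randn(bs, na * no, ny, nx)  # raw conv output for one layer

# Reshape to (bs, na, ny, nx, no), YOLOv5's detection layout
p = p.view(bs, na, no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

# Build an xy grid and per-anchor sizes (placeholder anchors)
yv, xv = torch.meshgrid(torch.arange(ny), torch.arange(nx), indexing="ij")
grid = torch.stack((xv, yv), 2).view(1, 1, ny, nx, 2).float()
anchor_grid = torch.full((1, na, 1, 1, 2), 16.0)  # hypothetical anchor sizes in pixels

# Decode: sigmoid everything, then map xy/wh back to image pixels
xy, wh, conf = p.sigmoid().split((2, 2, no - 4), 4)
xy = (xy * 2 - 0.5 + grid) * stride  # box centers
wh = (wh * 2) ** 2 * anchor_grid  # box sizes
z = torch.cat((xy, wh, conf), 4).view(bs, na * ny * nx, no)  # (bs, N, no) inference output
```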
@@ -115,7 +151,29 @@ class Detect(nn.Module):
         return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)

     def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, "1.10.0")):
-        """Generates a mesh grid for anchor boxes with optional compatibility for torch versions < 1.10."""
+        """
+        Generate a mesh grid for anchor boxes with torch version compatibility for detection models.
+
+        Args:
+            nx (int): Number of grid cells along the x-axis.
+            ny (int): Number of grid cells along the y-axis.
+            i (int): Index of the detection layer for which the grid is being generated.
+            torch_1_10 (bool): Indicator whether the torch version is at least 1.10.0 for meshgrid compatibility.
+
+        Returns:
+            (tuple[torch.Tensor, torch.Tensor]): A tuple containing two tensors:
+                - grid (torch.Tensor): The generated grid with shape (1, num_anchors, ny, nx, 2), containing xy
+                  coordinates.
+                - anchor_grid (torch.Tensor): The anchor grid scaled by the stride, with shape
+                  (1, num_anchors, ny, nx, 2).
+
+        Example:
+            ```python
+            detector = Detect()
+            grid, anchor_grid = detector._make_grid(20, 20, 0)
+            ```
+
+        Note:
+            The function ensures compatibility with different torch versions by using appropriate meshgrid indexing
+            options.
+        """
         d = self.anchors[i].device
         t = self.anchors[i].dtype
         shape = 1, self.na, ny, nx, 2  # grid shape
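The torch-version branch the Note refers to reduces to a single call. A sketch of the compatibility idiom, assuming the usual form of this check:

```python
import torch

ny, nx = 20, 20
y, x = torch.arange(ny), torch.arange(nx)
torch_1_10 = True  # e.g. check_version(torch.__version__, "1.10.0")

# torch >= 1.10 accepts an explicit `indexing` argument; older versions
# only support the default "ij" behavior and reject the keyword.
yv, xv = torch.meshgrid(y, x, indexing="ij") if torch_1_10 else torch.meshgrid(y, x)
```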
@@ -129,7 +187,25 @@ class Detect(nn.Module):

 class Segment(Detect):
     # YOLOv5 Segment head for segmentation models
     def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), inplace=True):
-        """Initializes YOLOv5 Segment head with options for mask count, protos, and channel adjustments."""
+        """
+        Initializes YOLOv5 Segment head with parameters for masks, prototypes, class count, anchors, and channels.
+
+        Args:
+            nc (int): Number of classes for the segmentation model (default is 80).
+            anchors (tuple): Tuple of anchor box dimensions for the segmentation model.
+            nm (int): Number of masks for the segmentation (default is 32).
+            npr (int): Number of prototypes for the masks (default is 256).
+            ch (tuple): Tuple of input channels for each detection layer.
+            inplace (bool): If True, use in-place operations for layer computations (default is True).
+
+        Returns:
+            None
+
+        Example:
+            ```python
+            segment_head = Segment(nc=80, anchors=anchors, nm=32, npr=256, ch=[512, 256, 128], inplace=True)
+            ```
+        """
         super().__init__(nc, anchors, ch, inplace)
         self.nm = nm  # number of masks
         self.npr = npr  # number of protos
@@ -139,8 +215,38 @@ class Segment(Detect):
         self.detect = Detect.forward

     def forward(self, x):
-        """Processes input through the network, returning detections and prototypes; adjusts output based on
-        training/export mode.
-        """
+        """
+        Processes input through the network, returning detections and prototypes.
+
+        Args:
+            x (list[torch.Tensor]): List of input tensors corresponding to different detection layers, each with
+                shape (B, C, H, W), where B is batch size, C is number of channels, H and W are height and width.
+
+        Returns:
+            (tuple[torch.Tensor, torch.Tensor]): A tuple containing:
+                - `detection` (torch.Tensor): The detection output tensor with shape (B, N, 85), where B is batch
+                  size, N is the number of detections.
+                - `prototypes` (torch.Tensor): The prototype masks tensor produced by the network with shape
+                  (B, P, H', W'), where B is batch size, P is the number of prototypes, and H' and W' correspond to
+                  height and width.
+
+        Example:
+            ```python
+            import torch
+            from ultralytics import YOLOv5
+
+            # Initialize model
+            model = YOLOv5.Segment()
+
+            # Generate dummy input
+            x = [torch.randn(1, 3, 640, 640) for _ in range(3)]
+
+            # Forward pass
+            detection, prototypes = model.forward(x)
+            ```
+
+        Note:
+            During inference (evaluation mode), detection outputs are post-processed to generate final bounding
+            boxes and classes. In training mode, the outputs are not processed.
+        """
         p = self.proto(x[0])
         x = self.detect(self, x)
@@ -151,13 +257,64 @@ class BaseModel(nn.Module):
     """YOLOv5 base model."""

     def forward(self, x, profile=False, visualize=False):
-        """Executes a single-scale inference or training pass on the YOLOv5 base model, with options for profiling and
-        visualization.
-        """
+        """
+        Perform a forward pass through the YOLOv5 model, optionally profiling and visualizing features.
+
+        Args:
+            x (torch.Tensor): Input data tensor with shape (N, C, H, W).
+            profile (bool): Whether to profile execution time of each layer. Defaults to False.
+            visualize (bool): Whether to store and visualize feature maps. Defaults to False.
+
+        Returns:
+            (torch.Tensor | tuple): In training mode, returns predictions as tuples with shapes (N, 3, H, W, no).
+                In inference mode, returns a single tensor with shape (N, M, no), where M is the number of predicted
+                objects after non-maximum suppression (NMS).
+
+        Example:
+            ```python
+            model = BaseModel()
+            input_tensor = torch.randn(1, 3, 640, 640)
+            output = model.forward(input_tensor, profile=True, visualize=True)
+            ```
+
+        Note:
+            - In training mode, the method returns unprocessed predictions for each scale, suitable for loss
+              calculation.
+            - In inference mode, non-maximum suppression is applied to refine predictions.
+        """
         return self._forward_once(x, profile, visualize)  # single-scale inference, train

     def _forward_once(self, x, profile=False, visualize=False):
-        """Performs a forward pass on the YOLOv5 model, enabling profiling and feature visualization options."""
+        """
+        Execute a forward pass through the YOLOv5 model layers with optional profiling and visualization.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is the batch size, C is the number
+                of channels, and H and W are the height and width of the input image, respectively.
+            profile (bool): If True, profiles the execution time for each layer. Defaults to False.
+            visualize (bool): If True, stores and visualizes feature maps. Defaults to False.
+
+        Returns:
+            (torch.Tensor): Model output tensor with shape depending on whether the model is in training or
+                inference mode.
+                - In training mode: Returns a list of tensors for each detection layer, each tensor has shape
+                  (N, 3, H, W, no), where `no` is the number of outputs per anchor.
+                - In inference mode: If not exporting, returns a tuple with a single tensor of shape (N, M, no),
+                  where M is the number of predicted objects.
+                - If exporting: Returns a tensor of shape (N, M, no).
+
+        Example:
+            ```python
+            model = BaseModel()
+            input_tensor = torch.randn(1, 3, 640, 640)  # Generate a random input tensor
+            output = model._forward_once(input_tensor, profile=True, visualize=True)
+            ```
+
+        Note:
+            This method conducts a single-scale inference or training pass through the model. Depending on the mode
+            (training or inference), the method behaves differently. In training mode, it returns unprocessed
+            predictions for each detection layer. In inference mode, non-maximum suppression (NMS) is typically
+            applied after this method to refine predictions.
+        """
         y, dt = [], []  # outputs
         for m in self.model:
             if m.f != -1:  # if not from previous layer
@@ -171,7 +328,32 @@ class BaseModel(nn.Module):
         return x

     def _profile_one_layer(self, m, x, dt):
-        """Profiles a single layer's performance by computing GFLOPs, execution time, and parameters."""
+        """
+        Profiles a single model layer's GFLOPs, parameters, and execution time within the YOLOv5 model.
+
+        Args:
+            m (nn.Module): The model layer to be profiled.
+            x (torch.Tensor): Input tensor passed to the model layer, with shape (N, C, H, W).
+            dt (list[float]): List to record execution times of the profiled layer.
+
+        Returns:
+            None: The function updates the `dt` list with the execution time of the layer in milliseconds.
+
+        Example:
+            ```python
+            model = BaseModel()
+            layer = nn.Conv2d(3, 16, 3, 1)  # Example layer
+            input_tensor = torch.randn(1, 3, 640, 640)  # Example input
+            execution_times = []
+
+            model._profile_one_layer(layer, input_tensor, execution_times)
+            ```
+
+        Note:
+            - Profiling is done for the purpose of understanding the computational load (GFLOPs) and time taken per
+              layer within the YOLOv5 model.
+            - If the `thop` library is not available, FLOPs computation will not be performed.
+        """
         c = m == self.model[-1]  # is final layer, copy input as inplace fix
         o = thop.profile(m, inputs=(x.copy() if c else x,), verbose=False)[0] / 1e9 * 2 if thop else 0  # FLOPs
         t = time_sync()
@@ -185,7 +367,25 @@ class BaseModel(nn.Module):
         LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s} Total")

     def fuse(self):
-        """Fuses Conv2d() and BatchNorm2d() layers in the model to improve inference speed."""
+        """
+        Fuses Conv2d and BatchNorm2d layers in the model to optimize inference speed.
+
+        This method modifies the model in place by merging Conv2d and BatchNorm2d layers into single Conv2d
+        layers where applicable. This can significantly improve inference speed and reduce memory usage.
+
+        Returns:
+            None
+
+        Example:
+            ```python
+            model = BaseModel()
+            model.fuse()
+            ```
+
+        Note:
+            After fusing layers, the forward method of fused layers is updated to `forward_fuse`, optimizing
+            the execution path.
+        """
         LOGGER.info("Fusing layers... ")
         for m in self.model.modules():
             if isinstance(m, (Conv, DWConv)) and hasattr(m, "bn"):
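The fusion described here folds each BatchNorm's statistics into the preceding convolution. A simplified sketch of the standard folding identity (YOLOv5 keeps its own helper for this, `fuse_conv_and_bn` in `utils.torch_utils`; this version omits dilation and device handling):

```python
import torch
import torch.nn as nn


def fuse_conv_and_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
    """Fold BatchNorm2d statistics into the preceding Conv2d using the standard identity."""
    fused = nn.Conv2d(
        conv.in_channels, conv.out_channels, conv.kernel_size,
        stride=conv.stride, padding=conv.padding, groups=conv.groups, bias=True,
    )
    scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)  # gamma / sqrt(var + eps)
    with torch.no_grad():
        fused.weight.copy_(conv.weight * scale.view(-1, 1, 1, 1))  # W' = W * scale
        b = conv.bias if conv.bias is not None else torch.zeros(conv.out_channels)
        fused.bias.copy_(bn.bias + (b - bn.running_mean) * scale)  # b' = beta + (b - mean) * scale
    return fused
```

Because batch norm at inference time is just a per-channel affine transform, the fused convolution is mathematically identical to the Conv → BN pair while saving one kernel launch per layer.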
@@ -196,12 +396,44 @@ class BaseModel(nn.Module):
         return self

     def info(self, verbose=False, img_size=640):
-        """Prints model information given verbosity and image size, e.g., `info(verbose=True, img_size=640)`."""
+        """
+        Display model summary, including layer details and computational complexity for a specified image size.
+
+        Args:
+            verbose (bool): If True, prints a detailed summary including information about each layer.
+                Defaults to False.
+            img_size (int | tuple[int]): Size of the input image as an integer (for square images) or tuple (H, W).
+                Defaults to 640.
+
+        Returns:
+            (None): This function does not return any value. It directly prints the model summary to the console.
+
+        Example:
+            ```python
+            model = BaseModel()
+            model.info(verbose=True, img_size=640)
+            ```
+
+        Note:
+            Ensure that the `verbose` parameter is set to True for a comprehensive layer-by-layer summary. The image
+            size should be supplied based on the expected input size for the model.
+        """
         model_info(self, verbose, img_size)

     def _apply(self, fn):
-        """Applies transformations like to(), cpu(), cuda(), half() to model tensors excluding parameters or registered
-        buffers.
-        """
+        """
+        Apply a function to the model and its layer parameters, including specific modifications for Detect and
+        Segment layers.
+
+        Args:
+            fn (function): A function to apply to the model's tensors.
+
+        Returns:
+            (torch.nn.Module): The module with applied transformations.
+
+        Note:
+            The function is particularly useful for operations like converting tensors to a target device
+            (e.g., CUDA, CPU) or altering their precision (e.g., float16). The Detect layer's stride and grid
+            parameters, as well as the Segment layer's anchor grids, are specifically modified to ensure consistency
+            after such transformations.
+        """
         self = super()._apply(fn)
         m = self.model[-1]  # Detect()
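The attributes named in the Note (stride, grids, anchor grids) are plain tensors on the head module, so `super()._apply(fn)` alone would miss them. A toy sketch of the re-application pattern, with a hypothetical `Head` standing in for the Detect/Segment module:

```python
import torch

# Hypothetical minimal head for illustration; not the Detect class from this diff.
class Head:
    stride = torch.tensor([8.0, 16.0, 32.0])
    grid = [torch.zeros(1)] * 3
    anchor_grid = [torch.zeros(1)] * 3

fn = lambda t: t.half()  # e.g. what model.half() would route through _apply
m = Head()
m.stride = fn(m.stride)                 # move/cast the stride tensor
m.grid = list(map(fn, m.grid))          # per-layer xy grids
if isinstance(m.anchor_grid, list):
    m.anchor_grid = list(map(fn, m.anchor_grid))  # per-layer anchor grids
```

Typical callers are `model.half()` and `model.to(device)`, both of which route through `_apply`.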
@@ -216,7 +448,36 @@ class BaseModel(nn.Module):

 class DetectionModel(BaseModel):
     # YOLOv5 detection model
     def __init__(self, cfg="yolov5s.yaml", ch=3, nc=None, anchors=None):
-        """Initializes YOLOv5 model with configuration file, input channels, number of classes, and custom anchors."""
+        """
+        Initializes YOLOv5 model using the specified config, input channels, class count, and custom anchors.
+
+        Args:
+            cfg (str | dict): Model configuration, either a path to a YAML config file or a configuration dictionary.
+            ch (int): Number of input channels. Defaults to 3.
+            nc (int | None): Number of classes. If provided, overrides the value in the YAML file/config dictionary.
+                Defaults to None.
+            anchors (list[int] | None): Custom anchors. If provided, overrides the anchors defined in the YAML
+                file/config dictionary. Defaults to None.
+
+        Returns:
+            None
+
+        Example:
+            ```python
+            from ultralytics.models.yolo import DetectionModel
+
+            # Initialize model with path to YAML config
+            model1 = DetectionModel(cfg="yolov5s.yaml")
+
+            # Initialize model with configuration dictionary
+            cfg_dict = {"nc": 80, "depth_multiple": 0.33, "width_multiple": 0.50}
+            model2 = DetectionModel(cfg=cfg_dict, ch=3, nc=80)
+            ```
+
+        Note:
+            If `cfg` is a dictionary, it should include the necessary parameters such as `nc`, `depth_multiple`, and
+            `width_multiple`. During initialization, the model configuration from the YAML file or dictionary is
+            parsed, and the internal model structure is built accordingly. This includes defining the detection
+            layers and adjusting anchors and strides.
+        """
         super().__init__()
         if isinstance(cfg, dict):
             self.yaml = cfg  # model dict
@@ -261,13 +522,64 @@ class DetectionModel(BaseModel):
         LOGGER.info("")

     def forward(self, x, augment=False, profile=False, visualize=False):
-        """Performs single-scale or augmented inference and may include profiling or visualization."""
+        """
+        Perform forward pass through the YOLOv5 detection model for training or inference, with options for
+        augmentation, profiling, and visualization.
+
+        Args:
+            x (torch.Tensor): Input tensor with a shape of (N, C, H, W), where N is the batch size, C is the number
+                of channels, H is the height, and W is the width.
+            augment (bool): If True, performs augmented inference. Defaults to False.
+            profile (bool): If True, profiles the execution time of each layer. Defaults to False.
+            visualize (bool): If True, stores and visualizes feature maps. Defaults to False.
+
+        Returns:
+            (torch.Tensor | tuple): Depending on the mode, returns either:
+                - In training mode: tuple containing predictions for each scale with shapes (N, 3, H, W, no).
+                - In inference mode: tensor with shape (N, M, no), where M is the number of predicted objects after
+                  non-maximum suppression.
+                - When exporting: tuple containing concatenated inference output tensor and intermediate feature
+                  maps.
+
+        Example:
+            ```python
+            model = DetectionModel(cfg="yolov5s.yaml", ch=3, nc=80)
+            input_tensor = torch.randn(1, 3, 640, 640)
+            output = model.forward(input_tensor, augment=False, profile=True, visualize=False)
+            ```
+
+        Note:
+            This method adapts to training and inference modes, with different return types based on the operational
+            mode. During training mode, it returns raw predictions across various scales for loss calculation,
+            whereas in inference mode, non-maximum suppression (NMS) is applied to refine predictions.
+        """
         if augment:
             return self._forward_augment(x)  # augmented inference, None
         return self._forward_once(x, profile, visualize)  # single-scale inference, train

     def _forward_augment(self, x):
-        """Performs augmented inference across different scales and flips, returning combined detections."""
+        """
+        Performs augmented inference by processing input across different scales and flips, merging the outputs.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is batch size, C is number of channels,
+                H and W are height and width.
+
+        Returns:
+            (torch.Tensor): Merged output tensor after multi-scale and flip augmentations, with shape (N, M, no),
+                where N is batch size, M is the number of predictions, and no is the number of output features.
+
+        Example:
+            ```python
+            model = DetectionModel(cfg='yolov5s.yaml')
+            input_tensor = torch.randn(1, 3, 640, 640)
+            output = model._forward_augment(input_tensor)
+            ```
+
+        Note:
+            The function processes the input using different scales (1, 0.83, 0.67) and flips (None, horizontal),
+            descaling predictions before merging. This helps to improve model robustness and accuracy
+            during inference.
+        """
         img_size = x.shape[-2:]  # height, width
         s = [1, 0.83, 0.67]  # scales
         f = [None, 3, None]  # flips (2-ud, 3-lr)
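Past the truncation, each scale in `s` is paired with a flip in `f` and the outputs are merged. A toy sketch of that pairing (the `scale_img` and `run_once` helpers are stand-ins, not code from this commit):

```python
import torch
import torch.nn.functional as F

def scale_img(img, ratio):  # simplified stand-in for YOLOv5's utils helper
    return F.interpolate(img, scale_factor=ratio, mode="bilinear", align_corners=False)

def run_once(img):  # stand-in for self._forward_once(x)[0]
    n = img.shape[-1] * img.shape[-2] // 64  # pretend prediction count scales with area
    return torch.randn(img.shape[0], n, 85)

x = torch.randn(1, 3, 640, 640)
s = [1, 0.83, 0.67]  # scales
f = [None, 3, None]  # flips (2-ud, 3-lr)
y = []
for si, fi in zip(s, f):
    xi = scale_img(x.flip(fi) if fi else x, si)  # optionally flip, then rescale
    y.append(run_once(xi))  # the real model also de-scales each prediction set
merged = torch.cat(y, 1)  # matches the `torch.cat(y, 1)` visible in the next hunk
```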
@@ -282,7 +594,23 @@ class DetectionModel(BaseModel):
         return torch.cat(y, 1), None  # augmented inference, train

     def _descale_pred(self, p, flips, scale, img_size):
-        """De-scales predictions from augmented inference, adjusting for flips and image size."""
+        """
+        Adjusts predictions for augmented inference by de-scaling and correcting for flips or image size changes.
+
+        Args:
+            p (torch.Tensor): Predictions tensor with shape (..., N) where N indicates prediction attributes like
+                bounding box coordinates, confidence score, etc.
+            flips (int | None): Specifies flip mode. `2` for vertical flip, `3` for horizontal flip, and `None` for
+                no flip.
+            scale (float): Scale factor used during augmentation.
+            img_size (tuple[int, int]): Original image dimensions as (height, width).
+
+        Returns:
+            (torch.Tensor): Adjusted predictions tensor with the same shape as input, de-scaled and de-flipped
+                appropriately.
+
+        Note:
+            If inplace operations are enabled, the adjustments are applied directly on the tensor. Otherwise, new
+            tensors are created for the adjusted values to avoid modifying the original input.
+        """
         if self.inplace:
             p[..., :4] /= scale  # de-scale
             if flips == 2:
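The flip branches cut off above mirror coordinates back to the original frame. A self-contained sketch of the in-place de-scale/de-flip arithmetic (illustrative values):

```python
import torch

img_size = (640, 640)  # (height, width)
scale, flips = 0.83, 3  # horizontal flip at 0.83x scale, as in the TTA recipe above
p = torch.rand(1, 100, 85) * 500  # fake predictions: x, y, w, h, conf, classes...

p[..., :4] /= scale  # de-scale boxes back to original resolution
if flips == 2:
    p[..., 1] = img_size[0] - p[..., 1]  # undo vertical flip (mirror y)
elif flips == 3:
    p[..., 0] = img_size[1] - p[..., 0]  # undo horizontal flip (mirror x)
```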
@@ -299,8 +627,18 @@ class DetectionModel(BaseModel):
         return p

     def _clip_augmented(self, y):
-        """Clips augmented inference tails for YOLOv5 models, affecting first and last tensors based on grid points and
-        layer counts.
-        """
+        """
+        Clip augmented inference tails for YOLOv5 models, adjusting predictions from the first and last layers.
+
+        Args:
+            y (list[torch.Tensor]): List of tensors, where each tensor represents detections from augmented
+                inference across different layers.
+
+        Returns:
+            (list[torch.Tensor]): Modified list of tensors with clipped augmented inference tails.
+
+        Notes:
+            This function helps to discard the augmented tails by adjusting predictions from the first and last
+            layers, which might otherwise introduce artifacts due to the augmentation process.
+        """
         nl = self.model[-1].nl  # number of detection layers (P3-P5)
         g = sum(4**x for x in range(nl))  # grid points
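With numbers plugged in, the grid-points formula reads: for nl = 3 layers, g = 4^0 + 4^1 + 4^2 = 21, so each stride level owns a fixed fraction of the predictions. A sketch of how a tail could then be clipped, assuming an excess of e = 1 (the actual slicing is truncated here):

```python
nl = 3  # detection layers P3-P5
g = sum(4**x for x in range(nl))  # 1 + 4 + 16 = 21 relative grid points
e = 1  # assumed: exclude one layer's worth of tail
total = 25200  # e.g. prediction count for a 640x640 input
i = (total // g) * sum(4**x for x in range(e))  # 25200 // 21 * 1 = 1200
print(g, i)  # 21 1200 -> drop the last 1200 predictions (the smallest-grid tail)
```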
@@ -313,9 +651,36 @@ class DetectionModel(BaseModel):

     def _initialize_biases(self, cf=None):
         """
-        Initializes biases for YOLOv5's Detect() module, optionally using class frequencies (cf).
-
-        For details see https://arxiv.org/abs/1708.02002 section 3.3.
+        Initialize biases for the YOLOv5 Detect module using specified or default bias adjustments.
+
+        Args:
+            cf (torch.Tensor | None): Optional tensor representing class frequencies for bias initialization. The
+                shape should be (N,), where N is the number of classes. If not provided, default adjustments are
+                applied based on the number of classes and image dimensions.
+
+        Returns:
+            (torch.Tensor): Updated biases for the model with shape (N, M), where N is the number of anchors and M
+                is the number of outputs per anchor.
+
+        Note:
+            The function calculates the biases based on principles from https://arxiv.org/abs/1708.02002,
+            section 3.3. If class frequencies (`cf`) are not provided, default bias adjustments are made.
+            Adjustments primarily ensure that objectness and class biases are reasonably initialized for effective
+            training.
+
+        Example:
+            ```python
+            from ultralytics.yolov5 import DetectionModel
+            import torch
+
+            # Initialize model
+            model = DetectionModel(cfg="yolov5s.yaml")
+
+            # Optional class frequencies tensor
+            class_frequencies = torch.tensor([100, 150, 200])
+
+            # Initialize biases
+            model._initialize_biases(cf=class_frequencies)
+            ```
         """
         # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
         m = self.model[-1]  # Detect() module
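The default adjustments the Note mentions follow the focal-loss prior: seed the objectness bias so that roughly 8 objects are expected per 640-pixel image at each stride, plus a small uniform class prior. A sketch of that arithmetic, assuming YOLOv5's customary constants (the exact lines are not shown in this diff):

```python
import math
import torch

nc, stride = 80, torch.tensor([8.0, 16.0, 32.0])
for s in stride:
    obj_bias = math.log(8 / (640 / s) ** 2)  # ~8 objects expected per 640x640 image
    cls_bias = math.log(0.6 / (nc - 0.99))   # mild uniform class prior
    print(f"stride {int(s)}: obj {obj_bias:.2f}, cls {cls_bias:.2f}")
```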
@@ -334,22 +699,91 @@ Model = DetectionModel  # retain YOLOv5 'Model' class for backwards compatibility

 class SegmentationModel(DetectionModel):
     # YOLOv5 segmentation model
     def __init__(self, cfg="yolov5s-seg.yaml", ch=3, nc=None, anchors=None):
-        """Initializes a YOLOv5 segmentation model with configurable params: cfg (str) for configuration, ch (int) for channels, nc (int) for num classes, anchors (list)."""
+        """
+        Initializes a YOLOv5 segmentation model with configurable parameters.
+
+        Args:
+            cfg (str): Path to the configuration file containing model architecture and parameters. Defaults to
+                "yolov5s-seg.yaml".
+            ch (int): Number of input channels. Defaults to 3.
+            nc (int | None): Number of classes. If provided, overrides the number of classes specified in the cfg
+                file.
+            anchors (list | None): List of anchor points. If provided, overrides the anchor configuration in the
+                cfg file.
+
+        Returns:
+            (None): Initializes various components of the SegmentationModel class.
+
+        Example:
+            ```python
+            from ultralytics import SegmentationModel
+            model = SegmentationModel()
+            ```
+
+        Note:
+            The initialization includes setting up model layers, anchors, and other configurations based on the
+            provided or default configuration file.
+        """
         super().__init__(cfg, ch, nc, anchors)


 class ClassificationModel(BaseModel):
     # YOLOv5 classification model
     def __init__(self, cfg=None, model=None, nc=1000, cutoff=10):
-        """Initializes YOLOv5 model with config file `cfg`, input channels `ch`, number of classes `nc`, and `cuttoff`
-        index.
-        """
+        """
+        Initializes a YOLOv5 classification model with either a configuration file or a pre-built model, specifying
+        the number of classes and a cutoff layer index.
+
+        Args:
+            cfg (str | None): Path to the model configuration file, or None if using `model`.
+            model (torch.nn.Module | None): Pre-built torch model, or None if using `cfg`.
+            nc (int): Number of output classes, default is 1000.
+            cutoff (int): Index of the cutoff layer, default is 10.
+
+        Returns:
+            None
+
+        Example:
+            ```python
+            # Initializing from a configuration file
+            model = ClassificationModel(cfg='yolov5-class-config.yaml', nc=1000, cutoff=10)
+
+            # Initializing from an existing model
+            model = ClassificationModel(model=prebuilt_model, nc=1000, cutoff=10)
+            ```
+
+        Note:
+            This model can be extended or customized by modifying the configuration file or the pre-built model.
+        """
         super().__init__()
         self._from_detection_model(model, nc, cutoff) if model is not None else self._from_yaml(cfg)

     def _from_detection_model(self, model, nc=1000, cutoff=10):
-        """Creates a classification model from a YOLOv5 detection model, slicing at `cutoff` and adding a classification
-        layer.
-        """
+        """
+        Perform a transformation from a YOLOv5 detection model to a classification model.
+
+        Args:
+            model (DetectionModel): A pre-trained YOLOv5 detection model.
+            nc (int): Number of classes for the classification model. Default is 1000.
+            cutoff (int): Index to slice the model's layers up to the classification layer. Default is 10.
+
+        Returns:
+            None. The function modifies the model in place.
+
+        Notes:
+            This function takes a detection model and transforms it into a classification model by slicing the model
+            layers at the specified cutoff point and adding a classification layer with the specified number of
+            classes.
+            - If the input model is wrapped by `DetectMultiBackend`, it unwraps the model to get the underlying
+              YOLOv5 model.
+            - Constructs a `Classify` layer, replacing the final detection layer with this new classification layer.
+
+        Example:
+            ```python
+            from ultralytics import YOLOv5

+            # Load a pre-trained detection model
+            detection_model = YOLOv5.load('yolov5s.pt')
+
+            # Create a classification model from detection model
+            classification_model = YOLOv5.ClassificationModel()
+            classification_model._from_detection_model(detection_model, nc=1000, cutoff=10)
+            ```
+        """
         if isinstance(model, DetectMultiBackend):
             model = model.model  # unwrap DetectMultiBackend
@@ -365,12 +799,49 @@ class ClassificationModel(BaseModel):
         self.nc = nc

     def _from_yaml(self, cfg):
-        """Creates a YOLOv5 classification model from a specified *.yaml configuration file."""
+        """
+        Perform initialization and parsing from a YOLOv5 configuration file.
+
+        Args:
+            cfg (str): Path to the YOLOv5 YAML configuration file.
+
+        Returns:
+            None. The function modifies the model in place utilizing the defined configuration parameters.
+
+        Notes:
+            This function reads a YOLOv5 YAML configuration file and constructs the classification model
+            accordingly. It sets the appropriate channels, layers, and output classes based on the parsed
+            configuration data.
+        """
         self.model = None


 def parse_model(d, ch):
-    """Parses a YOLOv5 model from a dict `d`, configuring layers based on input channels `ch` and model architecture."""
+    """
+    Parses YOLOv5 model architecture from a configuration dictionary and initializes its layers.
+
+    Args:
+        d (dict): Dictionary containing model configuration. Must include keys: "anchors", "nc", "depth_multiple",
+            "width_multiple", and optionally "activation" and "channel_multiple".
+        ch (list[int]): List of input channels for each layer.
+
+    Returns:
+        (tuple[nn.Sequential, list[int]]): A tuple containing:
+            - `model` (nn.Sequential): The constructed YOLOv5 model based on the configuration.
+            - `save` (list[int]): List of layers whose outputs should be preserved during the forward pass.
+
+    Example:
+        ```python
+        from pathlib import Path
+        import yaml
+
+        # Load model configuration YAML
+        with open(Path('yolov5s.yaml'), 'r') as file:
+            model_config = yaml.safe_load(file)
+
+        # Parse model and initialize
+        model, save = parse_model(model_config, ch=[3])
+        ```
+    """
     LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}")
     anchors, nc, gd, gw, act, ch_mul = (
         d["anchors"],