diff --git a/models/common.py b/models/common.py index 049dfc0b9..63e047788 100644 --- a/models/common.py +++ b/models/common.py @@ -59,9 +59,26 @@ from utils.torch_utils import copy_attr, smart_inference_mode def autopad(k, p=None, d=1): """ - Pads kernel to 'same' output shape, adjusting for optional dilation; returns padding size. - - `k`: kernel, `p`: padding, `d`: dilation. + Pads kernel to achieve 'same' output shape, taking into account optional dilation. + + Args: + k (int | list[int]): Size of the kernel. Supports single integer or list of integers for each dimension. + p (None | int | list[int]): Padding size. If None, computes 'same' padding automatically. Default is None. + d (int): Dilation rate to apply to the kernel. Defaults to 1. + + Returns: + (int | list[int]): Calculated padding size. Returns a single integer if the kernel size is an integer, otherwise a + list of integers matching the dimensions of the kernel. + + Example: + ```python + pad_size = autopad(3) # For a single dimension kernel of size 3, dilation 1 + pad_sizes = autopad([3, 3], d=2) # For a 2D kernel with size 3x3 and dilation 2 + ``` + + Note: + This function is commonly used when creating neural network architectures to ensure the output dimensions + match the input dimensions, facilitating easy model design and debugging. """ if d > 1: k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size @@ -75,26 +92,108 @@ class Conv(nn.Module): default_act = nn.SiLU() # default activation def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): - """Initializes a standard convolution layer with optional batch normalization and activation.""" + """ + Initialize a convolutional layer with batch normalization and an optional activation function. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size. Default is 1. + s (int): Stride size. Default is 1. + p (None | int): Padding size. If None, the padding is computed as 'same' padding. Default is None. + g (int): Number of groups for group convolution. Default is 1. + d (int): Dilation rate. Default is 1. + act (bool or torch.nn.Module): If True, uses the default activation function (SiLU), otherwise no activation + is applied. You can also provide a custom activation function. Default is True. + + Returns: + (None): This is an initialization method, so it does not return anything. + + Example: + ```python + # Creating a convolutional layer with 3 input channels, 16 output channels, kernel size 3, stride 1, and ReLU activation + conv_layer = Conv(3, 16, k=3, s=1, act=torch.nn.ReLU()) + ``` + + Note: + The default activation function used is SiLU if `act` is set to True. You can replace it with other activation + functions by passing the desired nn.Module as the `act` argument. + """ super().__init__() self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) self.bn = nn.BatchNorm2d(c2) self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() def forward(self, x): - """Applies a convolution followed by batch normalization and an activation function to the input tensor `x`.""" + """ + Perform convolution, batch normalization, and activation on the input tensor `x` in sequence. + + Args: + x (torch.Tensor): Input tensor with shape (N, C_in, H, W), where N is the batch size, C_in is the number of input + channels, H is the height, and W is the width. 
+ + Returns: + (torch.Tensor): Output tensor after applying convolution, batch normalization, and activation, with shape + (N, C_out, H_out, W_out) where C_out is the number of output channels and H_out, W_out are the heights and + widths of the output based on the kernel size, stride, and padding. + + Example: + ```python + conv_layer = Conv(3, 16, k=3, s=1, p=1) + input_tensor = torch.randn(1, 3, 224, 224) # Batch size 1, 3 input channels, 224x224 image + output_tensor = conv_layer(input_tensor) + ``` + + Note: + This forward pass integrates three operations: a convolution, batch normalization, and an optional activation function + (default is nn.SiLU). + """ return self.act(self.bn(self.conv(x))) def forward_fuse(self, x): - """Applies a fused convolution and activation function to the input tensor `x`.""" + """ + Apply convolution and activation without batch normalization for optimized inference. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W), typically a feature map from previous layers. + + Returns: + (torch.Tensor): Output tensor after applying convolution and activation, with shape (N, C', H', W') where the + output channels C' may differ from input channels due to the convolution operations. + + Example: + ```python + conv_layer = Conv(3, 16, k=3, s=1, act=True) + fused_output = conv_layer.forward_fuse(torch.rand(1, 3, 224, 224)) + ``` + """ return self.act(self.conv(x)) class DWConv(Conv): # Depth-wise convolution def __init__(self, c1, c2, k=1, s=1, d=1, act=True): - """Initializes a depth-wise convolution layer with optional activation; args: input channels (c1), output - channels (c2), kernel size (k), stride (s), dilation (d), and activation flag (act). + """ + Initializes a depth-wise convolution layer with optional activation. + + Args: + c1 (int): Number of input channels (C1). + c2 (int): Number of output channels (C2). + k (int): Kernel size. Defaults to 1. + s (int): Stride size. Defaults to 1. + d (int): Dilation rate. Defaults to 1. + act (bool | nn.Module): Activation function or flag. If True, SiLU activation is used. If a + nn.Module is provided, it is used as the custom activation function. Defaults to True. + + Returns: + None + + Example: + ```python + dwconv = DWConv(32, 64, 3, 1, 1, True) + input_tensor = torch.rand(1, 32, 224, 224) # Example input tensor with shape (N, C1, H, W) + output_tensor = dwconv(input_tensor) # Output tensor with shape (N, C2, H_out, W_out) + ``` """ super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act) @@ -102,8 +201,25 @@ class DWConv(Conv): class DWConvTranspose2d(nn.ConvTranspose2d): # Depth-wise transpose convolution def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): - """Initializes a depth-wise transpose convolutional layer for YOLOv5; args: input channels (c1), output channels - (c2), kernel size (k), stride (s), input padding (p1), output padding (p2). + """ + Initialize a depth-wise transpose convolutional layer. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size of the transpose convolution. Default is 1. + s (int): Stride of the transpose convolution. Default is 1. + p1 (int): Input padding for the transpose convolution. Default is 0. + p2 (int): Output padding for the transpose convolution. Default is 0. 
+ + Returns: + None + + Example: + ```python + layer = DWConvTranspose2d(64, 128, 3, 2, 1, 1) + output = layer(torch.randn(1, 64, 32, 32)) # Example input tensor + ``` """ super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2)) @@ -112,9 +228,21 @@ class TransformerLayer(nn.Module): # Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance) def __init__(self, c, num_heads): """ - Initializes a transformer layer, sans LayerNorm for performance, with multihead attention and linear layers. - - See as described in https://arxiv.org/abs/2010.11929. + Initialize a transformer layer without LayerNorm for improved performance. + + Args: + c (int): Number of input and output channels for the transformer layer. + num_heads (int): Number of attention heads in the multihead attention mechanism. + + Returns: + None + + Example: + ```python + layer = TransformerLayer(c=512, num_heads=8) + input_tensor = torch.rand(10, 32, 512) # (sequence_length, batch_size, embedding_dim) + output = layer(input_tensor) + ``` """ super().__init__() self.q = nn.Linear(c, c, bias=False) @@ -125,7 +253,26 @@ class TransformerLayer(nn.Module): self.fc2 = nn.Linear(c, c, bias=False) def forward(self, x): - """Performs forward pass using MultiheadAttention and two linear transformations with residual connections.""" + """ + Perform forward pass with multihead attention and linear layers using residual connections. + + Args: + x (torch.Tensor): Input tensor of shape (T, N, C) where T is the sequence length, N is the batch size, + and C is the embedding dimension. + + Returns: + (torch.Tensor): Output tensor of shape (T, N, C) matching the input shape. + + Example: + ```python + layer = TransformerLayer(c=512, num_heads=8) + input_tensor = torch.rand(10, 32, 512) # Example input tensor with shape (T, N, C) + output = layer(input_tensor) # Output tensor with same shape (T, N, C) + ``` + + Note: + This implementation removes LayerNorm layers for better computational performance. + """ x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x x = self.fc2(self.fc1(x)) + x return x @@ -134,8 +281,27 @@ class TransformerLayer(nn.Module): class TransformerBlock(nn.Module): # Vision Transformer https://arxiv.org/abs/2010.11929 def __init__(self, c1, c2, num_heads, num_layers): - """Initializes a Transformer block for vision tasks, adapting dimensions if necessary and stacking specified - layers. + """ + Initialize a Transformer block for vision tasks, adapting dimensions and stacking specified layers. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + num_heads (int): Number of attention heads in each transformer layer. + num_layers (int): Number of transformer layers to stack. + + Returns: + None + + Example: + ```python + transformer_block = TransformerBlock(c1=64, c2=128, num_heads=8, num_layers=6) + ``` + + Note: + This implementation adapts to input dimension changes by including an initial convolution layer if required. + Utilizes multi-head self-attention mechanism as described in the paper: + https://arxiv.org/abs/2010.11929. """ super().__init__() self.conv = None @@ -146,8 +312,27 @@ class TransformerBlock(nn.Module): self.c2 = c2 def forward(self, x): - """Processes input through an optional convolution, followed by Transformer layers and position embeddings for - object detection. + """ + Perform forward pass through the Vision Transformer block. 
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (B, C1, W, H) where B is batch size, C1 is number of input channels,
+                W is width and H is height.
+
+        Returns:
+            (torch.Tensor): Output tensor with shape (B, C2, W, H) after the flattened sequence is processed by the
+                Transformer layers and reshaped back, where B is batch size and C2 is number of output channels.
+
+        Example:
+            ```python
+            transformer_block = TransformerBlock(c1=3, c2=64, num_heads=8, num_layers=6)
+            input_tensor = torch.rand(1, 3, 224, 224)  # Example input tensor of shape (B, C1, W, H)
+            output_tensor = transformer_block(input_tensor)
+            print(output_tensor.shape)  # Will output torch.Size([1, 64, 224, 224])
+            ```
+
+        Note:
+            Ensure the input tensor has the correct shape (B, C1, W, H) and dimensions when using this Transformer block.
         """
         if self.conv is not None:
             x = self.conv(x)
@@ -159,8 +344,34 @@ class TransformerBlock(nn.Module):
 class Bottleneck(nn.Module):
     # Standard bottleneck
     def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):
-        """Initializes a standard bottleneck layer with optional shortcut and group convolution, supporting channel
-        expansion.
+        """
+        Initialize a standard bottleneck layer.
+
+        This layer consists of a sequence of convolution operations optionally followed by a shortcut connection. The
+        bottleneck design reduces the number of parameters while preserving performance by shrinking and then restoring
+        the channel dimensionality.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            shortcut (bool): Whether to add a shortcut connection. Defaults to True.
+            g (int): Number of groups for group convolution. Defaults to 1.
+            e (float): Expansion ratio for hidden layer dimensionality. Defaults to 0.5.
+
+        Returns:
+            (None): This function does not return any value.
+
+        Example:
+            ```python
+            bottleneck_layer = Bottleneck(64, 128, shortcut=True, g=1, e=0.5)
+            input_tensor = torch.randn(1, 64, 128, 128)  # Example input tensor with shape (N, C1, H, W)
+            output_tensor = bottleneck_layer(input_tensor)  # Output tensor with shape (N, C2, H, W)
+            ```
+
+        Note:
+            Ensure the input tensor to the Bottleneck layer has the correct shape (N, C1, H, W), where N is the batch size,
+            C1 is the number of input channels, H is the height, and W is the width.
         """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
@@ -169,8 +380,25 @@ class Bottleneck(nn.Module):
         self.add = shortcut and c1 == c2

     def forward(self, x):
-        """Processes input through two convolutions, optionally adds shortcut if channel dimensions match; input is a
-        tensor.
+        """
+        Perform a forward pass through the bottleneck layer with optional shortcut connection.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (..., C_in, H, W), where C_in is the number of input channels.
+
+        Returns:
+            (torch.Tensor): Output tensor with shape (..., C_out, H, W), where C_out is the number of output channels; the
+                shortcut connection is included when the input and output channel counts match.
+ + Example: + ```python + import torch + from ultralytics.models.common import Bottleneck + + bottleneck = Bottleneck(c1=64, c2=64) + x = torch.randn(1, 64, 144, 144) # Sample input + y = bottleneck(x) # Forward pass + ``` """ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) @@ -178,8 +406,35 @@ class Bottleneck(nn.Module): class BottleneckCSP(nn.Module): # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - """Initializes CSP bottleneck with optional shortcuts; args: ch_in, ch_out, number of repeats, shortcut bool, - groups, expansion. + """ + Initialize the CSP Bottleneck layer, which is an extension of the traditional bottleneck layer to leverage cross-stage + partial connections. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + n (int): Number of times the bottleneck layer is repeated. Default is 1. + shortcut (bool): Whether to use shortcut connections in the bottleneck layers. Default is True. + g (int): Number of groups for the grouped convolution in the bottleneck layers. Default is 1. + e (float): Expansion factor to control the hidden channels in the bottleneck layers. Default is 0.5. + + Returns: + (None): Initializes the parameters for the CSP Bottleneck module. + + Example: + ```python + from ultralytics.models.common import BottleneckCSP + + # Instantiate CSPBottleneck with specific configuration + bottleneck_csp = BottleneckCSP(c1=64, c2=128, n=3, shortcut=True, g=2, e=0.5) + + # Example input tensor + input_tensor = torch.randn(1, 64, 128, 128) # Shape (B, C1, H, W) + + # Forward pass through the layer + output_tensor = bottleneck_csp(input_tensor) + print(output_tensor.shape) # Should match expected output shape + ``` """ super().__init__() c_ = int(c2 * e) # hidden channels @@ -192,8 +447,31 @@ class BottleneckCSP(nn.Module): self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) def forward(self, x): - """Performs forward pass by applying layers, activation, and concatenation on input x, returning feature- - enhanced output. + """ + Perform a forward pass through the CSP (Cross Stage Partial) Bottleneck layer. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor after applying CSP bottleneck transformations, with shape (N, C2, H, W), where C2 is + the output channel size specified during initialization. + + Example: + ```python + import torch + from ultralytics.models.common import BottleneckCSP + + model = BottleneckCSP(c1=64, c2=128, n=1) + x = torch.randn(1, 64, 128, 128) + output = model.forward(x) + ``` + + Note: + CSP Bottleneck architecture helps in reducing the amount of computation as well as mitigating the gradient + vanishing problem in deep neural networks. The specific implementation follows the design principles outlined in + https://github.com/WongKinYiu/CrossStagePartialNetworks. """ y1 = self.cv3(self.m(self.cv1(x))) y2 = self.cv2(x) @@ -204,10 +482,30 @@ class CrossConv(nn.Module): # Cross Convolution Downsample def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False): """ - Initializes CrossConv with downsampling, expanding, and optionally shortcutting; `c1` input, `c2` output - channels. - - Inputs are ch_in, ch_out, kernel, stride, groups, expansion, shortcut. 
+        Initialize the CrossConv module, which combines paired cross convolutions with optional downsampling and shortcut.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            k (int): Kernel size for the convolution. Defaults to 3.
+            s (int): Stride for the convolution. Defaults to 1.
+            g (int): Number of groups for the grouped convolution. Defaults to 1.
+            e (float): Expansion factor for the intermediate channels. Defaults to 1.0.
+            shortcut (bool): If True, includes a shortcut connection. Defaults to False.
+
+        Returns:
+            (None): This method initializes the CrossConv instance without returning any value.
+
+        Note:
+            This module is designed for channel expansion and downsampling operations within neural network architectures,
+            particularly for YOLOv5.
+
+        Example:
+            ```python
+            cross_conv = CrossConv(64, 128)
+            input_tensor = torch.randn(1, 64, 224, 224)
+            output = cross_conv(input_tensor)
+            ```
         """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
@@ -216,15 +514,62 @@ class CrossConv(nn.Module):
         self.add = shortcut and c1 == c2

     def forward(self, x):
-        """Performs feature sampling, expanding, and applies shortcut if channels match; expects `x` input tensor."""
+        """
+        Perform feature downsampling, expansion, and optional shortcut connection in a neural network.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels,
+                H is the height, and W is the width.
+
+        Returns:
+            (torch.Tensor): Output tensor with shape (N, C2, H/s, W/s), transformed through the two cross convolution
+                layers; the spatial dimensions are unchanged when s=1.
+
+        Example:
+            ```python
+            import torch
+            from ultralytics.models.common import CrossConv
+
+            cross_conv = CrossConv(64, 128)
+            input_tensor = torch.randn(1, 64, 224, 224)
+            output_tensor = cross_conv(input_tensor)
+            print(output_tensor.shape)  # torch.Size([1, 128, 224, 224])
+            ```
+
+        Note:
+            CrossConv layers are used in models to effectively downsample and expand feature maps, aiding in feature
+            extraction while maintaining computational efficiency.
+        """
         return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


 class C3(nn.Module):
     # CSP Bottleneck with 3 convolutions
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initializes C3 module with options for channel count, bottleneck repetition, shortcut usage, group
-        convolutions, and expansion.
+        """
+        Initialize a CSP bottleneck containing three convolutional layers and optional shortcut connections.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            n (int): Number of Bottleneck layers to include. Defaults to 1.
+            shortcut (bool): Whether to use shortcut connections in the Bottleneck layers. Defaults to True.
+            g (int): Number of groups for the grouped convolution. Defaults to 1.
+            e (float): Expansion ratio for the hidden channels in the Bottleneck layers. Defaults to 0.5.
+
+        Returns:
+            None: This is an initialization method; the forward pass produces a tensor with C2 channels and the same
+                spatial dimensions as the input.
+ + Example: + ```python + from ultralytics.models.common import C3 + import torch + + c3_layer = C3(c1=128, c2=256, n=1, shortcut=True) + x = torch.randn(1, 128, 32, 32) # Example input tensor + y = c3_layer(x) # Output tensor + print(y.shape) # torch.Size([1, 256, 32, 32]) + ``` """ super().__init__() c_ = int(c2 * e) # hidden channels @@ -234,15 +579,63 @@ class C3(nn.Module): self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) def forward(self, x): - """Performs forward propagation using concatenated outputs from two convolutions and a Bottleneck sequence.""" + """ + Performs a forward pass using CSP bottleneck with three convolution layers, incorporating hidden bottleneck layers. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is batch size, C is number of input channels, + H is height, and W is width. + + Returns: + (torch.Tensor): Output tensor with shape (N, C_out, H, W), where C_out is the number of output channels after + processing through the CSP bottleneck with 3 convolutions. + + Example: + ```python + import torch + from ultralytics.models.common import C3 + + model = C3(c1=64, c2=128, n=3) + input_tensor = torch.randn(1, 64, 128, 128) # Example input tensor with shape (N, C, H, W) + output_tensor = model(input_tensor) + print(output_tensor.shape) # Outputs tensor shape after forward pass + ``` + + Note: + CSP Bottleneck with 3 convolutions and hidden bottleneck layers helps in efficient representation by downsampling + and concatenating filtered features from different paths. This architecture is inspired by the principles outlined + in https://github.com/WongKinYiu/CrossStagePartialNetworks. + """ return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) class C3x(C3): # C3 module with cross-convolutions def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - """Initializes C3x module with cross-convolutions, extending C3 with customizable channel dimensions, groups, - and expansion. + """ + Initialize the C3x module with cross-convolutions. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + n (int): Number of Bottleneck layers to include. Defaults to 1. + shortcut (bool): Whether to add shortcut connections. Defaults to True. + g (int): Number of groups for grouped convolution. Defaults to 1. + e (float): Expansion ratio for hidden channels. Defaults to 0.5. + + Returns: + None: This constructor initializes the C3x module with the specified parameters and does not return any value. + + Note: + This class inherits from C3 and extends its functionality by adding cross-convolutions adjacent to the main + bottleneck layers for enhanced feature extraction. + + Example: + ```python + c3x_layer = C3x(64, 128, n=3, shortcut=True, g=1, e=0.5) + input_tensor = torch.randn(1, 64, 256, 256) # Example input tensor with shape (N, C1, H, W) + output_tensor = c3x_layer(input_tensor) # Output tensor with shape (N, C2, H, W) + ``` """ super().__init__(c1, c2, n, shortcut, g, e) c_ = int(c2 * e) @@ -252,8 +645,30 @@ class C3x(C3): class C3TR(C3): # C3 module with TransformerBlock() def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - """Initializes C3 module with TransformerBlock for enhanced feature extraction, accepts channel sizes, shortcut - config, group, and expansion. + """ + Initialize the C3 module with an integrated TransformerBlock for advanced feature extraction. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. 
+            n (int): Number of Bottleneck layers to be stacked sequentially.
+            shortcut (bool): Whether to use residual connections between layers.
+            g (int): Number of groups in group convolution.
+            e (float): Expansion coefficient for channel dimensions.
+
+        Returns:
+            None
+
+        Example:
+            ```python
+            c3tr = C3TR(64, 128, n=3, shortcut=True, g=1, e=0.5)
+            input_tensor = torch.rand(1, 64, 256, 256)  # Random input tensor with shape (B, C1, H, W)
+            output_tensor = c3tr(input_tensor)
+            ```
+
+        Notes:
+            This module extends C3 by incorporating a TransformerBlock for enhanced contextual feature extraction, following
+            the Vision Transformer design described in "An Image is Worth 16x16 Words" (https://arxiv.org/abs/2010.11929).
         """
         super().__init__(c1, c2, n, shortcut, g, e)
         c_ = int(c2 * e)
@@ -263,8 +678,31 @@ class C3TR(C3):
 class C3SPP(C3):
     # C3 module with SPP()
     def __init__(self, c1, c2, k=(5, 9, 13), n=1, shortcut=True, g=1, e=0.5):
-        """Initializes a C3 module with SPP layer for advanced spatial feature extraction, given channel sizes, kernel
-        sizes, shortcut, group, and expansion ratio.
+        """
+        Initialize a C3 module with Spatial Pyramid Pooling (SPP) for advanced spatial feature extraction.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            k (tuple[int]): Kernel sizes for SPP. Defaults to (5, 9, 13).
+            n (int, optional): Number of Bottleneck layers. Defaults to 1.
+            shortcut (bool, optional): Whether to use residual connections. Defaults to True.
+            g (int, optional): Number of groups for group convolution. Defaults to 1.
+            e (float, optional): Expansion ratio for hidden channels. Defaults to 0.5.
+
+        Returns:
+            None
+
+        Example:
+            ```python
+            c3spp = C3SPP(c1=64, c2=128, k=(5, 9, 13), n=1, shortcut=True, g=1, e=0.5)
+            input_tensor = torch.randn(1, 64, 32, 32)  # Batch size 1, 64 channels, 32x32 image
+            output = c3spp(input_tensor)
+            print(output.shape)  # Expected output shape: (1, 128, 32, 32)
+            ```
+
+        Note:
+            The SPP layer enhances the receptive field size while keeping computational costs manageable.
         """
         super().__init__(c1, c2, n, shortcut, g, e)
         c_ = int(c2 * e)
@@ -274,7 +712,29 @@ class C3SPP(C3):
 class C3Ghost(C3):
     # C3 module with GhostBottleneck()
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initializes YOLOv5's C3 module with Ghost Bottlenecks for efficient feature extraction."""
+        """
+        Initializes YOLOv5's C3 module using Ghost Bottlenecks for efficient feature extraction and processing.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            n (int, optional): Number of Bottleneck layers to include. Defaults to 1.
+            shortcut (bool, optional): Whether to add shortcut connections. Defaults to True.
+            g (int, optional): Number of groups for group convolution. Defaults to 1.
+            e (float, optional): Expansion ratio for hidden channels. Defaults to 0.5.
+ + Returns: + None + + Example: + ```python + from ultralytics.models.common import C3Ghost + + c3ghost_layer = C3Ghost(c1=64, c2=128, n=2, shortcut=True, e=0.5) + input_tensor = torch.randn(1, 64, 256, 256) # Random input tensor with shape (B, C1, H, W) + output_tensor = c3ghost_layer(input_tensor) + ``` + """ super().__init__(c1, c2, n, shortcut, g, e) c_ = int(c2 * e) # hidden channels self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n))) @@ -283,7 +743,30 @@ class C3Ghost(C3): class SPP(nn.Module): # Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729 def __init__(self, c1, c2, k=(5, 9, 13)): - """Initializes SPP layer with Spatial Pyramid Pooling, ref: https://arxiv.org/abs/1406.4729, args: c1 (input channels), c2 (output channels), k (kernel sizes).""" + """ + Initialize the Spatial Pyramid Pooling (SPP) layer to enhance receptive field size and feature extraction. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (tuple[int], optional): Kernel sizes for max pooling layers. Default is (5, 9, 13). + + Returns: + None + + Example: + ```python + spp_layer = SPP(c1=64, c2=128, k=(5, 9, 13)) + input_tensor = torch.randn(1, 64, 32, 32) # Batch size 1, 64 channels, 32x32 resolution + output_tensor = spp_layer(input_tensor) + print(output_tensor.shape) # Output shape: (1, 128, 32, 32) + ``` + + Note: + The SPP layer facilitates effective extraction of multi-scale context by performing max pooling + with multiple kernel sizes. This enhances the network's receptive field and robustness to object + scaling and deformation. + """ super().__init__() c_ = c1 // 2 # hidden channels self.cv1 = Conv(c1, c_, 1, 1) @@ -291,8 +774,23 @@ class SPP(nn.Module): self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]) def forward(self, x): - """Applies convolution and max pooling layers to the input tensor `x`, concatenates results, and returns output - tensor. + """ + Apply the Spatial Pyramid Pooling (SPP) process to enhance spatial feature extraction from input tensor. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor with enhanced spatial features, having shape (N, C2, H, W). + + Example: + ```python + spp_layer = SPP(c1=64, c2=128, k=(5, 9, 13)) + input_tensor = torch.randn(1, 64, 32, 32) # Batch size 1, 64 channels, 32x32 spatial dimensions + output_tensor = spp_layer(input_tensor) + print(output_tensor.shape) # Expected output shape: (1, 128, 32, 32) + ``` """ x = self.cv1(x) with warnings.catch_warnings(): @@ -304,10 +802,27 @@ class SPPF(nn.Module): # Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher def __init__(self, c1, c2, k=5): """ - Initializes YOLOv5 SPPF layer with given channels and kernel size for YOLOv5 model, combining convolution and - max pooling. - - Equivalent to SPP(k=(5, 9, 13)). + Initialize YOLOv5's Spatial Pyramid Pooling - Fast (SPPF) layer with convolution and max pooling. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int, optional): Kernel size for max pooling layers. Default is 5. + + Returns: + None: This method initializes the SPPF layer without returning any value. 
+ + Example: + ```python + sppf = SPPF(128, 256, k=5) + input_tensor = torch.randn(1, 128, 64, 64) + output_tensor = sppf(input_tensor) + print(output_tensor.shape) # Expected output shape: (1, 256, 64, 64) + ``` + + Note: + SPPF enhances feature extraction efficiency by reducing spatial dimensions and enriching features using convolution and + max pooling. """ super().__init__() c_ = c1 // 2 # hidden channels @@ -316,7 +831,29 @@ class SPPF(nn.Module): self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) def forward(self, x): - """Processes input through a series of convolutions and max pooling operations for feature extraction.""" + """ + Perform forward pass through the Spatial Pyramid Pooling-Fast (SPPF) layer to enhance spatial features. + + Args: + x (torch.Tensor): Input tensor with shape (N, C1, H, W), where N is batch size, C1 is number of input channels, + H is height, and W is width. + + Returns: + (torch.Tensor): Output tensor with enriched spatial features and shape (N, C2, H, W), where C2 is the number of + output channels. + + Example: + ```python + sppf = SPPF(128, 256, k=5) + input_tensor = torch.randn(1, 128, 64, 64) # Example input tensor + output_tensor = sppf(input_tensor) + print(output_tensor.shape) # Expected output shape: (1, 256, 64, 64) + ``` + + Note: + The SPPF layer leverages multiple levels of max pooling to capture diverse spatial patterns efficiently, which is + particularly useful in object detection tasks like YOLOv5. + """ x = self.cv1(x) with warnings.catch_warnings(): warnings.simplefilter("ignore") # suppress torch 1.9.0 max_pool2d() warning @@ -328,15 +865,61 @@ class SPPF(nn.Module): class Focus(nn.Module): # Focus wh information into c-space def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): - """Initializes Focus module to concentrate width-height info into channel space with configurable convolution - parameters. + """ + Initialize the Focus layer that concatenates slices of the input tensor to increase channel depth before applying convolution. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size for the convolution. Defaults to 1. + s (int): Stride for the convolution. Defaults to 1. + p (int | None): Padding size for the convolution. Uses automatic padding if None. Defaults to None. + g (int): Group size for the convolution. Defaults to 1. + act (bool | nn.Module): Activation function to apply after the convolution. Uses default activation (nn.SiLU) if True, + or no activation if False. Can also be a custom activation module. + + Returns: + None: This is an initializer method, so it does not return a value. + + Example: + ```python + focus = Focus(3, 64, k=3, s=1, p=1) + input_tensor = torch.rand(1, 3, 224, 224) + output = focus(input_tensor) + ``` + Notes: + The Focus layer is designed to increase the channel dimension by concatenating four slices of the input tensor, + then applying a convolution to the concatenated result. """ super().__init__() self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act) # self.contract = Contract(gain=2) def forward(self, x): - """Processes input through Focus mechanism, reshaping (b,c,w,h) to (b,4c,w/2,h/2) then applies convolution.""" + """ + Focus width and height information into channel space and apply convolution. + + Args: + x (torch.Tensor): Input tensor with shape (N, C1, H, W) where N is the batch size, C1 is the number of input + channels, H is the height, and W is the width. 
+ + Returns: + (torch.Tensor): Output tensor with shape (N, C2, H/2, W/2) where C2 is the number of output channels specified + during initialization. + + Example: + ```python + focus_layer = Focus(3, 64, k=3, s=1, p=1) + input_tensor = torch.rand(1, 3, 224, 224) + output_tensor = focus_layer(input_tensor) + print(output_tensor.shape) # Expected shape: (1, 64, 112, 112) + ``` + + Notes: + The Focus layer increases the channel dimension by concatenating four slices of the input tensor, each slice being + a downsampled version of the input. This effectively focuses width and height information into the channel space before + applying convolution. + """ return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1)) # return self.conv(self.contract(x)) @@ -344,8 +927,36 @@ class Focus(nn.Module): class GhostConv(nn.Module): # Ghost Convolution https://github.com/huawei-noah/ghostnet def __init__(self, c1, c2, k=1, s=1, g=1, act=True): - """Initializes GhostConv with in/out channels, kernel size, stride, groups, and activation; halves out channels - for efficiency. + """ + Initialize a Ghost Convolution layer for efficient feature extraction using fewer parameters. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int, optional): Kernel size for convolution. Default is 1. + s (int, optional): Stride for convolution. Default is 1. + g (int, optional): Number of groups for convolution, facilitating group-wise operations. Default is 1. + act (bool | nn.Module, optional): Activation function to use. Default is True, which uses the default activation; + can also accept an nn.Module for custom activation or False for no activation. + + Returns: + (None): This method initializes the Ghost Convolution layer without returning any value. + + Example: + ```python + import torch + from ultralytics.models.common import GhostConv + + x = torch.randn(1, 64, 128, 128) # Example input tensor with shape (B, C1, H, W) + conv_layer = GhostConv(64, 128) # Initialize GhostConv with 64 input channels and 128 output channels + y = conv_layer(x) # Forward pass + print(y.shape) # Should output: torch.Size([1, 128, 128, 128]) + ``` + + Note: + The Ghost Convolution technique effectively reduces computational complexity by splitting convolution into two steps: + a primary convolution and a series of cheaper operations to generate 'ghost' feature maps. The technique is published + by Huawei Noah's Ark Lab and is aimed at optimizing neural network performance on edge devices. """ super().__init__() c_ = c2 // 2 # hidden channels @@ -353,7 +964,32 @@ class GhostConv(nn.Module): self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act) def forward(self, x): - """Performs forward pass, concatenating outputs of two convolutions on input `x`: shape (B,C,H,W).""" + """ + Perform a forward pass through the Ghost Convolution layer. + + Args: + x (torch.Tensor): Input tensor with shape (N, C1, H, W), where N is the batch size, C1 is the number of input + channels, H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor with shape (N, C2, H, W), where C2 is the number of output channels after + applying Ghost Convolution operations. 
+ + Example: + ```python + import torch + from ultralytics.models.common import GhostConv + + input_tensor = torch.randn(1, 64, 128, 128) # Example input tensor with shape (B, C1, H, W) + ghost_conv_layer = GhostConv(64, 128) # Initialize GhostConv with 64 input channels and 128 output channels + output_tensor = ghost_conv_layer.forward(input_tensor) # Forward pass + print(output_tensor.shape) # Should output: torch.Size([1, 128, 128, 128]) + ``` + + Note: + Ghost Convolution aims to optimize feature extraction by combining standard convolutions with cheaper operations to + generate 'ghost' feature maps, enhancing computational efficiency and performance. + """ y = self.cv1(x) return torch.cat((y, self.cv2(y)), 1) @@ -361,7 +997,36 @@ class GhostConv(nn.Module): class GhostBottleneck(nn.Module): # Ghost Bottleneck https://github.com/huawei-noah/ghostnet def __init__(self, c1, c2, k=3, s=1): - """Initializes GhostBottleneck with ch_in `c1`, ch_out `c2`, kernel size `k`, stride `s`; see https://github.com/huawei-noah/ghostnet.""" + """ + Initialize a GhostBottleneck layer for efficient feature extraction and processing with optional downsampling. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size for depth-wise convolution. Defaults to 3. + s (int): Stride for depth-wise convolution, determines downsampling. Defaults to 1. + + Returns: + None + + Example: + ```python + from ultralytics.models.common import GhostBottleneck + import torch + + # Initialize GhostBottleneck with 64 input channels, 128 output channels + ghost_bottleneck = GhostBottleneck(c1=64, c2=128, k=3, s=2) + x = torch.randn(1, 64, 56, 56) # Example input tensor + output = ghost_bottleneck(x) + print(output.shape) # Expected output tensor shape: (1, 128, 28, 28) + ``` + + Note: + The GhostBottleneck module incorporates GhostConvs and optional depth-wise convolutions for efficient feature + processing. The use of GhostConv layers reduces computational overhead while maintaining performance, making + this bottleneck design suitable for deploying neural networks on resource-constrained devices. The specific + implementation is inspired by the GhostNet architecture: https://github.com/huawei-noah/ghostnet. + """ super().__init__() c_ = c2 // 2 self.conv = nn.Sequential( @@ -374,22 +1039,76 @@ class GhostBottleneck(nn.Module): ) def forward(self, x): - """Processes input through conv and shortcut layers, returning their summed output.""" + """ + Performs a forward pass through the GhostBottleneck layer, leveraging Ghost convolution operations for efficient feature extraction. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is the batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor with shape (N, C2, H, W) after applying Ghost Convolutions and optional + shortcut connections, where C2 is the number of output channels specified during initialization. + + Example: + ```python + import torch + from ultralytics.models.common import GhostBottleneck + + # Initialize GhostBottleneck with 64 input and 128 output channels + ghost_bottleneck = GhostBottleneck(64, 128) + x = torch.randn(1, 64, 56, 56) # Example input + y = ghost_bottleneck(x) # Forward pass + print(y.shape) # Output shape should be (1, 128, 56, 56) + ``` + + Note: + This layer is part of the GhostNet architecture, designed for lightweight and efficient neural network models, + particularly on edge devices. 
The architecture minimizes computational complexity by generating fewer primary + feature maps and using cheap operations to produce 'ghost' feature maps. + """ return self.conv(x) + self.shortcut(x) class Contract(nn.Module): # Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40) def __init__(self, gain=2): - """Initializes a layer to contract spatial dimensions (width-height) into channels, e.g., input shape - (1,64,80,80) to (1,256,40,40). + """ + Initialize the Contract module for transforming spatial dimensions into channels. + + Args: + gain (int): The factor by which to contract the dimensions. For example, a gain of 2 will halve the spatial + dimensions and quadruple the channel dimension. + + Example: + ```python + contract_layer = Contract(gain=2) + x = torch.randn(1, 64, 80, 80) + output = contract_layer(x) # results in shape (1, 256, 40, 40) + ``` """ super().__init__() self.gain = gain def forward(self, x): - """Processes input tensor to expand channel dimensions by contracting spatial dimensions, yielding output shape - `(b, c*s*s, h//s, w//s)`. + """ + Forward pass for contracting the spatial dimensions into the channel dimension. + + Args: + x (torch.Tensor): Input tensor of shape (B, C, H, W) where B is the batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Tensor with the spatial dimensions contracted into the channel dimension, with shape + (B, C * gain * gain, H // gain, W // gain). + + Example: + ```python + contract_layer = Contract(gain=2) + input_tensor = torch.randn(1, 64, 80, 80) + output_tensor = contract_layer(input_tensor) + assert output_tensor.shape == (1, 256, 40, 40) + ``` """ b, c, h, w = x.size() # assert (h / s == 0) and (W / s == 0), 'Indivisible gain' s = self.gain @@ -402,17 +1121,46 @@ class Expand(nn.Module): # Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160) def __init__(self, gain=2): """ - Initializes the Expand module to increase spatial dimensions by redistributing channels, with an optional gain - factor. - - Example: x(1,64,80,80) to x(1,16,160,160). + Initialize the Expand module to increase spatial dimensions by redistributing channels. + + Args: + gain (int): Factor to redistribute channels into spatial dimensions. Default is 2. + + Returns: + None + + Example: + ```python + expand_layer = Expand(gain=2) + input_tensor = torch.randn(1, 64, 80, 80) + output_tensor = expand_layer(input_tensor) # Output shape will be (1, 16, 160, 160) + ``` """ super().__init__() self.gain = gain def forward(self, x): - """Processes input tensor x to expand spatial dimensions by redistributing channels, requiring C / gain^2 == - 0. + """ + Expand channels into spatial dimensions, i.e., transforms tensor shape (B, C, H, W) to (B, C/(gain^2), H*gain, W*gain). + + Args: + x (torch.Tensor): Input tensor with shape (B, C, H, W), where B is the batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor with expanded spatial dimensions and reshaped channels. For example, an input tensor + with shape (B, C, H, W) is transformed into (B, C/(gain^2), H*gain, W*gain), where gain is the expansion factor. 
+ + Example: + ```python + expand_layer = Expand(gain=2) + input_tensor = torch.rand(1, 64, 80, 80) + output_tensor = expand_layer(input_tensor) + print(output_tensor.shape) # Expected output: torch.Size([1, 16, 160, 160]) + ``` + + Note: + Ensure that the number of input channels `C` is divisible by `gain^2` to avoid reshaping errors. """ b, c, h, w = x.size() # assert C / s ** 2 == 0, 'Indivisible gain' s = self.gain @@ -424,13 +1172,52 @@ class Expand(nn.Module): class Concat(nn.Module): # Concatenate a list of tensors along dimension def __init__(self, dimension=1): - """Initializes a Concat module to concatenate tensors along a specified dimension.""" + """ + Initializes a Concat module to concatenate tensors along a specified dimension. + + Args: + dimension (int): Dimension along which to concatenate the input tensors. Default is 1. + + Returns: + None: This method initializes the Concat module without returning any value. + + Example: + ```python + concat_layer = Concat(dimension=1) + input_tensor1 = torch.randn(2, 3, 64, 64) + input_tensor2 = torch.randn(2, 3, 64, 64) + output_tensor = concat_layer([input_tensor1, input_tensor2]) + print(output_tensor.shape) # Expected output shape: (2, 6, 64, 64) + ``` + """ super().__init__() self.d = dimension def forward(self, x): - """Concatenates a list of tensors along a specified dimension; `x` is a list of tensors, `dimension` is an - int. + """ + Concatenate a list of tensors along a specified dimension. + + Args: + x (list[torch.Tensor]): A list of tensors to concatenate along the specified dimension. Each tensor + must have the same shape except along the concatenation dimension. + + Returns: + (torch.Tensor): The concatenated tensor along the specified dimension. + + Example: + ```python + import torch + + t1 = torch.randn(2, 3) + t2 = torch.randn(2, 3) + concat_module = Concat(dimension=0) + result = concat_module([t1, t2]) + print(result.shape) # Output shape will be (4, 3) + ``` + + Note: + The concatenation dimension is specified during the initialization of the Concat module. Ensure that + all tensors to be concatenated have matching shapes except along the dimension specified. """ return torch.cat(x, self.d) @@ -438,7 +1225,35 @@ class Concat(nn.Module): class DetectMultiBackend(nn.Module): # YOLOv5 MultiBackend class for python inference on various backends def __init__(self, weights="yolov5s.pt", device=torch.device("cpu"), dnn=False, data=None, fp16=False, fuse=True): - """Initializes DetectMultiBackend with support for various inference backends, including PyTorch and ONNX.""" + """ + Initialize the DetectMultiBackend class for inference on multiple backends such as PyTorch, ONNX, TensorRT, and more. + + Args: + weights (str | list[str]): Path to the model weights. Multiple weights can be specified as a list for ensemble. + Supported extensions include .pt, .onnx, .torchscript, .xml, .engine, .mlmodel, .pb, .tflite, and more. + device (torch.device): The device to run the model on, e.g., torch.device('cpu'), torch.device('cuda:0'). + Default is torch.device('cpu'). + dnn (bool): Flag to use OpenCV DNN for ONNX models. Default is False. + data (str | None): Path to the dataset configuration file containing class names. If None, default names will be used. + Default is None. + fp16 (bool): Flag to enable half-precision FP16 inference. Default is False. + fuse (bool): Flag to fuse model convolutions for improved runtime efficiency. Default is True. 
+ + Returns: + None + + Example: + ```python + from ultralytics import DetectMultiBackend + + model = DetectMultiBackend(weights='yolov5s.pt', device=torch.device('cuda:0')) + ``` + + Note: + - Successfully supports multiple backends such as PyTorch, ONNX, TensorRT, OpenCV DNN, PaddlePaddle, TensorFlow, and more. + - Ensure that appropriate dependency packages for various backends are installed as required. + - Utilizes efficient pre-initializations and backend-specific optimizations defined within the `__init__` method to support diverse methods of model loading and inference. + """ # PyTorch: weights = *.pt # TorchScript: *.torchscript # ONNX Runtime: *.onnx @@ -655,7 +1470,39 @@ class DetectMultiBackend(nn.Module): self.__dict__.update(locals()) # assign all variables to self def forward(self, im, augment=False, visualize=False): - """Performs YOLOv5 inference on input images with options for augmentation and visualization.""" + """ + Performs inference on input images with support for multiple backends (PyTorch, ONNX, TensorRT, etc.). + + Args: + im (torch.Tensor): Input tensor containing images, with shape (B, C, H, W) where B is batch size, C is number of + channels, H is height, and W is width. + augment (bool): Boolean flag to perform data augmentation during inference. Defaults to False. + visualize (bool): Boolean flag to store or visualize the features/activations. Defaults to False. + + Returns: + (torch.Tensor): Inference output tensor. Depending on the backend, this can be a single torch.Tensor or a list of + torch.Tensors. Each tensor contains detection results such as bounding boxes and class scores. + + Example: + ```python + import torch + from ultralytics.models.common import DetectMultiBackend + + # Initialize the model for a specific backend + model = DetectMultiBackend(weights='yolov5s.pt', device=torch.device('cpu')) + + # Example input tensor of shape (B, C, H, W) + input_tensor = torch.randn(1, 3, 640, 640) + + # Perform inference + output_tensor = model.forward(input_tensor) + ``` + + Note: + This function handles input preprocessing, model inference, and postprocessing. It supports multiple deep learning + backends such as PyTorch, ONNX, TensorRT, TensorFlow, and more, with device compatibility checks and backend-specific + operations. + """ b, ch, h, w = im.shape # batch, channel, height, width if self.fp16 and im.dtype != torch.float16: im = im.half() # to FP16 @@ -737,11 +1584,50 @@ class DetectMultiBackend(nn.Module): return self.from_numpy(y) def from_numpy(self, x): - """Converts a NumPy array to a torch tensor, maintaining device compatibility.""" + """ + Convert NumPy array `x` to a torch tensor, maintaining device compatibility. + + Args: + x (numpy.ndarray): Input array to convert to torch tensor, with any shape. + + Returns: + (torch.Tensor): Converted torch tensor with the same data and shape as input array. + + Example: + ```python + import numpy as np + input_array = np.random.randn(3, 224, 224) # Example input array + tensor = detect_multi_backend_instance.from_numpy(input_array) + print(tensor.shape) # Should output: torch.Size([3, 224, 224]) + ``` + + Note: + This function ensures that the resulting torch tensor retains the appropriate device (CPU or GPU) setting. 
+ """ return torch.from_numpy(x).to(self.device) if isinstance(x, np.ndarray) else x def warmup(self, imgsz=(1, 3, 640, 640)): - """Performs a single inference warmup to initialize model weights, accepting an `imgsz` tuple for image size.""" + """ + Warms up the model by performing initial inference to prepare weights and memory allocations. + + Args: + imgsz (tuple[int]): Input image size tuple in the format (B, C, H, W), where B is batch size, C is number of + channels, H is height, and W is width for the warmup run. Defaults to (1, 3, 640, 640). + + Returns: + None + + Example: + ```python + detect_backend = DetectMultiBackend(weights='yolov5s.pt') + detect_backend.warmup(imgsz=(1, 3, 320, 320)) + ``` + + Note: + The warmup process involves passing a blank tensor through the model to ensure that weights are moved to the + selected device, and memory is allocated properly. This is particularly useful for models running on GPU or with + FP16 precision. + """ warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton if any(warmup_types) and (self.device.type != "cpu" or self.triton): im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device) # input @@ -751,9 +1637,37 @@ class DetectMultiBackend(nn.Module): @staticmethod def _model_type(p="path/to/model.pt"): """ - Determines model type from file path or URL, supporting various export formats. - - Example: path='path/to/model.onnx' -> type=onnx + Determine the model type from a given file path or URL. + + Args: + p (str): File path or URL for the model. + Supported formats include PyTorch, TorchScript, ONNX, OpenVINO, TensorRT, CoreML, TensorFlow, TFLite, and PaddlePaddle. + + Returns: + (tuple[bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool]): + A tuple of booleans representing the type of model inferred from the file path or URL. Each boolean indicates: + - PyTorch (.pt) + - TorchScript + - ONNX (.onnx) + - OpenVINO (.xml) + - TensorRT (.engine) + - CoreML (.mlmodel) + - TensorFlow SavedModel + - TensorFlow GraphDef (.pb) + - TensorFlow Lite (.tflite) + - TensorFlow Edge TPU (.tflite) + - TensorFlow.js + - PaddlePaddle + + Example: + ```python + model_type = DetectMultiBackend._model_type("model.onnx") + assert model_type == (False, False, True, False, False, False, False, False, False, False, False, False) + ``` + + Note: + This method relies on the file suffix and URL scheme to determine the type of model. Use this method to + programmatically infer the model type, facilitating subsequent backend-specific operations. """ # types = [pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle] from export import export_formats @@ -770,7 +1684,29 @@ class DetectMultiBackend(nn.Module): @staticmethod def _load_metadata(f=Path("path/to/meta.yaml")): - """Loads metadata from a YAML file, returning strides and names if the file exists, otherwise `None`.""" + """ + Load metadata from a specified YAML file. + + Args: + f (Path): The path to the YAML file containing metadata. + + Returns: + (int, dict): A tuple containing the following: + - stride (int): The stride value extracted from the YAML file. + - names (dict): A dictionary of class names mapped by their index. 
+ + Example: + ```python + from pathlib import Path + metadata_path = Path("path/to/meta.yaml") + stride, names = DetectMultiBackend._load_metadata(metadata_path) + print(f"Stride: {stride}") + print(f"Class Names: {names}") + ``` + + Note: + Ensure the YAML file at the specified path exists and contains 'stride' and 'names' keys for successful metadata extraction. + """ if f.exists(): d = yaml_load(f) return d["stride"], d["names"] # assign stride, names @@ -788,7 +1724,27 @@ class AutoShape(nn.Module): amp = False # Automatic Mixed Precision (AMP) inference def __init__(self, model, verbose=True): - """Initializes YOLOv5 model for inference, setting up attributes and preparing model for evaluation.""" + """ + Initializes an input-robust YOLO model with preprocessing, inference, and post-processing capabilities. + + Args: + model (torch.nn.Module): The YOLO model to be wrapped. + verbose (bool): If True, logs information about the initialization. Defaults to True. + + Returns: + None + + Example: + ```python + from ultralytics import YOLO + from ultralytics.models.common import AutoShape + + model = YOLO("yolov5s.pt") + auto_shape_model = AutoShape(model) + input_image = "path/to/image.jpg" + predictions = auto_shape_model(input_image) + ``` + """ super().__init__() if verbose: LOGGER.info("Adding AutoShape... ") @@ -803,9 +1759,25 @@ class AutoShape(nn.Module): def _apply(self, fn): """ - Applies to(), cpu(), cuda(), half() etc. - - to model tensors excluding parameters or registered buffers. + Apply a function to model tensors excluding parameters or registered buffers. + + Args: + fn (Callable): The function to apply to the tensors. Common choices include `to()`, `cpu()`, `cuda()`, `half()`, etc. + + Returns: + (AutoShape): The current instance with the function applied. + + Note: + This method is useful for moving all tensors to a specific device (e.g., GPU) or changing their data types. + ```python + self = super()._apply(fn) + if self.pt: + m = self.model.model.model[-1] if self.dmb else self.model.model[-1] # Detect() + m.stride = fn(m.stride) + m.grid = list(map(fn, m.grid)) + if isinstance(m.anchor_grid, list): + m.anchor_grid = list(map(fn, m.anchor_grid)) + ``` """ self = super()._apply(fn) if self.pt: @@ -819,9 +1791,41 @@ class AutoShape(nn.Module): @smart_inference_mode() def forward(self, ims, size=640, augment=False, profile=False): """ - Performs inference on inputs with optional augment & profiling. - - Supports various formats including file, URI, OpenCV, PIL, numpy, torch. + Perform inference on given image inputs with support for various input formats. + + Args: + ims (str | list[str] | pathlib.Path | list[pathlib.Path] | np.ndarray | list[np.ndarray] | torch.Tensor | + list[torch.Tensor] | PIL.Image.Image | list[PIL.Image.Image]): + Input images. Supported formats: + - File path as string ('data/images/zidane.jpg') or Path object. + - URL as string ('https://ultralytics.com/images/zidane.jpg'). + - OpenCV image (cv2.imread()) with shape (H, W, 3). + - PIL image (Image.open()) with shape (H, W, 3). + - NumPy array with shape (H, W, 3) or (B, C, H, W). + - Torch tensor with shape (B, C, H, W). + - List of any of the above. + size (int | tuple[int, int], optional): Target size for resizing input images, specified as an integer or + a tuple (height, width). Defaults to 640. + augment (bool, optional): If True, apply image augmentations during inference. Defaults to False. + profile (bool, optional): If True, profile the inference process. Defaults to False. 
+ + Returns: + (list[torch.Tensor]): List of detection results, where each tensor has shape (N, 6) representing + (x1, y1, x2, y2, conf, cls) for each detection. + + Example: + ```python + from PIL import Image + from ultralytics import YOLO + + model = YOLO("yolov5s.pt") + img = Image.open("path/to/image.jpg") + results = model.autoshape.forward(img) + ``` + + Note: + Inference can be performed with Automatic Mixed Precision (AMP) if `amp` attribute is set to `True` and + the current hardware supports it. """ # For size(height=640, width=1280), RGB images example inputs are: # file: ims = 'data/images/zidane.jpg' # str or PosixPath @@ -891,7 +1895,36 @@ class AutoShape(nn.Module): class Detections: # YOLOv5 detections class for inference results def __init__(self, ims, pred, files, times=(0, 0, 0), names=None, shape=None): - """Initializes the YOLOv5 Detections class with image info, predictions, filenames, timing and normalization.""" + """ + Initialize the Detections object, which stores prediction results from the YOLO model. + + Args: + ims (list[np.ndarray]): A list of images as numpy arrays, where each array represents an image in HWC format. + pred (list[torch.Tensor]): List of tensors containing the predicted bounding boxes and scores for each image. + Each tensor has shape (N, 6) for (x1, y1, x2, y2, conf, cls). + files (list[str]): List of filenames corresponding to images. + times (tuple[float, float, float], optional): Profiling times, default is (0, 0, 0). + names (list[str], optional): List of class names used for predictions, default is None. + shape (tuple[int, int], optional): Shape of the input image, given as (height, width), default is None. + + Returns: + None + + Example: + ```python + ims = [cv2.imread("image1.jpg"), cv2.imread("image2.jpg")] + pred = [torch.tensor([[50, 50, 200, 200, 0.9, 1]]), torch.tensor([[30, 30, 150, 150, 0.8, 0]])] + names = ["class0", "class1"] + files = ["image1.jpg", "image2.jpg"] + times = (0.1, 0.2, 0.3) + detections = Detections(ims, pred, files, times, names) + ``` + + Note: + This class simplifies tracing of predictions through different stages of the YOLOv5 inference pipeline, including + preprocessing, model inference, and postprocessing. The Detections class maintains a convenient interface to + access both raw and normalized bounding box coordinates and associated metadata. + """ super().__init__() d = pred[0].device # device gn = [torch.tensor([*(im.shape[i] for i in [1, 0, 1, 0]), 1, 1], device=d) for im in ims] # normalizations @@ -909,7 +1942,34 @@ class Detections: self.s = tuple(shape) # inference BCHW shape def _run(self, pprint=False, show=False, save=False, crop=False, render=False, labels=True, save_dir=Path("")): - """Executes model predictions, displaying and/or saving outputs with optional crops and labels.""" + """ + Perform desired post-processing actions (e.g., pretty-print results, show images, save outputs). + + Args: + pprint (bool): If True, pretty-print the detection results. + show (bool): If True, display the detection results using the default image viewer. + save (bool): If True, save the detection results to the specified directory. + crop (bool): If True, crop detected objects and save them. + render (bool): If True, render annotated results onto the images. + labels (bool): If True, add labels to the bounding boxes in the rendered images. + save_dir (Path): Directory where the processed results will be saved. This is only used if `save` or `crop` + is True. 
+ + Returns: + (str | None): Formatted string of the results if `pprint` is True, otherwise None. + + Example: + ```python + det = Detections(ims, pred, files, times=[0.1, 0.2, 0.3], names=["person", "bike"]) + result_str = det._run(pprint=True, show=True, save=False, crop=False, render=False, labels=True, + save_dir=Path("./outputs")) + print(result_str) # Prints the formatted string of results. + ``` + + Note: + Ensure that the `save_dir` exists when saving the results. The function handles different modes of result + presentation, such as showing images using default viewers or displaying in Jupyter notebooks. + """ s, crops = "", [] for i, (im, pred) in enumerate(zip(self.ims, self.pred)): s += f"\nimage {i + 1}/{len(self.pred)}: {im.shape[0]}x{im.shape[1]} " # string @@ -966,39 +2026,156 @@ class Detections: def show(self, labels=True): """ Displays detection results with optional labels. - - Usage: show(labels=True) + + Args: + labels (bool): If True, include class labels and confidence scores in the displayed results. + + Returns: + None: The function does not return anything. + + Example: + ```python + detections = model(im) # Perform inference + detections.show(labels=True) # Display results with labels + ``` + + Note: + This function leverages the `Annotator` class to draw bounding boxes and labels on images and then displays + them using either Jupyter notebook's display function or the default image viewer in other environments. """ self._run(show=True, labels=labels) # show results def save(self, labels=True, save_dir="runs/detect/exp", exist_ok=False): """ - Saves detection results with optional labels to a specified directory. - - Usage: save(labels=True, save_dir='runs/detect/exp', exist_ok=False) + Save detection results with optional labeling and directory creation. + + Args: + labels (bool): Flag to include labels on the saved images. Defaults to True. + save_dir (str | Path): Directory path where result images and optionally cropped images will be saved. + Defaults to 'runs/detect/exp'. + exist_ok (bool): Flag to allow existing directory content without creating a new directory. + Defaults to False. + + Returns: + None + + Example: + ```python + detections = Detections(ims, pred, files, names=names) + detections.save(labels=True, save_dir='runs/detect/exp', exist_ok=False) + ``` + + Note: + If `exist_ok` is False, the function will create a unique directory by incrementing the name to avoid conflicts + with existing directories. If `save` is True, the images and crops will be saved in the specified `save_dir`. + + This function is particularly useful to persist detection results for future reference, further analysis, or + debugging. """ save_dir = increment_path(save_dir, exist_ok, mkdir=True) # increment save_dir self._run(save=True, labels=labels, save_dir=save_dir) # save results def crop(self, save=True, save_dir="runs/detect/exp", exist_ok=False): """ - Crops detection results, optionally saves them to a directory. - - Args: save (bool), save_dir (str), exist_ok (bool). + Crop detected objects from the input images. + + Args: + save (bool): Whether to save the cropped images to disk. Default is True. + save_dir (str): Directory to save the cropped images. Default is 'runs/detect/exp'. + exist_ok (bool): Whether to overwrite the existing directory if it exists. Default is False. + + Returns: + (list[dict]): List of dictionaries, each containing information about a cropped image, with the keys: + - 'box' (torch.Tensor): Bounding box of the crop with shape (4,). 
+                - 'conf' (torch.Tensor): Confidence score of the detection.
+                - 'cls' (torch.Tensor): Class of the detected object.
+                - 'label' (str): Label string with class name and confidence score.
+                - 'im' (np.ndarray): Cropped image as a numpy array.
+
+        Example:
+            ```python
+            results = model(im)  # perform inference with an AutoShape model
+            crops = results.crop(save=True, save_dir='runs/crops')
+            for crop in crops:
+                print(crop['label'], crop['im'].shape)
+            ```
+
+        Note:
+            - If `save` is True, the cropped images will be saved in the specified `save_dir`, which will be incremented
+              automatically if `exist_ok` is False and the directory already exists.
+            - This function returns both the cropped images and their metadata, which can be useful for further analysis
+              or display.
         """
         save_dir = increment_path(save_dir, exist_ok, mkdir=True) if save else None
         return self._run(crop=True, save=save, save_dir=save_dir)  # crop results

     def render(self, labels=True):
-        """Renders detection results with optional labels on images; args: labels (bool) indicating label inclusion."""
+        """
+        Render detection results by drawing the predicted bounding boxes and labels onto the stored images.
+
+        Args:
+            labels (bool): If True, include class labels and confidence scores on the rendered images. Defaults to True.
+
+        Returns:
+            (list[np.ndarray]): The annotated images as numpy arrays, also stored in `self.ims`.
+
+        Example:
+            ```python
+            results = model(im)  # perform inference with an AutoShape model
+            rendered = results.render(labels=True)  # list of annotated numpy arrays
+            ```
+
+        Note:
+            - Input images should be in RGB format.
+            - The function supports rendering on multiple images if a list of images is provided.
+            - This function is mainly used for visualization purposes in notebooks or GUI applications.
+        """
         self._run(render=True, labels=labels)  # render results
         return self.ims

     def pandas(self):
         """
-        Returns detections as pandas DataFrames for various box formats (xyxy, xyxyn, xywh, xywhn).
-
-        Example: print(results.pandas().xyxy[0]).
+        Convert detections to pandas DataFrames for each box format.
+
+        Returns:
+            (Detections): A copy of this Detections object whose `xyxy`, `xyxyn`, `xywh` and `xywhn` attributes are lists of
+                pandas DataFrames, one per image:
+                - xyxy: DataFrame with columns ["xmin", "ymin", "xmax", "ymax", "confidence", "class", "name"].
+                - xyxyn: DataFrame with columns ["xmin", "ymin", "xmax", "ymax", "confidence", "class", "name"] (normalized).
+                - xywh: DataFrame with columns ["xcenter", "ycenter", "width", "height", "confidence", "class", "name"].
+                - xywhn: DataFrame with columns ["xcenter", "ycenter", "width", "height", "confidence", "class", "name"]
+                  (normalized).
+
+        Example:
+            ```python
+            results = model(im)  # perform inference with an AutoShape model
+            print(results.pandas().xyxy[0])  # DataFrame for the first image in 'xyxy' format
+            ```
         """
         new = copy(self)  # return copy
         ca = "xmin", "ymin", "xmax", "ymax", "confidence", "class", "name"  # xyxy columns
@@ -1010,9 +2187,19 @@ class Detections:
     def tolist(self):
         """
-        Converts a Detections object into a list of individual detection results for iteration.
-
-        Example: for result in results.tolist():
+        Convert detection results to a list of individual per-image detection results.
+
+        Returns:
+            (list[Detections]): A list where each element is a `Detections` object for a single image, maintaining all
+                relevant detection attributes.
+
+        Example:
+            ```python
+            results = model(imgs)  # perform inference
+            for detection in results.tolist():
+                print(detection.pandas().xyxy)
+            ```
         """
         r = range(self.n)  # iterable
         return [
@@ -1028,28 +2215,85 @@ class Detections:
         ]

     def print(self):
-        """Logs the string representation of the current object's state via the LOGGER."""
+        """
+        Logs detection results for each image, including class names and detection counts per class.
+
+        Example:
+            ```python
+            detections = model(images)  # perform inference
+            detections.print()
+            ```
+        """
         LOGGER.info(self.__str__())

     def __len__(self):
-        """Returns the number of results stored, overrides the default len(results)."""
+        """
+        Return the number of per-image results stored, overriding the default `len(results)`.
+
+        Returns:
+            (int): Number of results, one per input image.
+        """
         return self.n

     def __str__(self):
-        """Returns a string representation of the model's results, suitable for printing, overrides default
-        print(results).
+        """
+        Returns a concise string representation of the detection results.
+
+        Returns:
+            (str): A string summarizing the image detection results, including number of detections per class and their
+                respective confidences, along with processing time details.
+
+        Example:
+            ```python
+            detections = Detections(ims, pred, files, times, names, shape)
+            print(str(detections))
+            ```
+
+        Note:
+            Used primarily for logging and quick inspection of detection outputs.
         """
         return self._run(pprint=True)  # print results

     def __repr__(self):
-        """Returns a string representation of the YOLOv5 object, including its class and formatted results."""
+        """
+        Return a string representation of the Detections object including its class and formatted results.
+
+        Returns:
+            (str): A string representation of the Detections object, including class name and formatted results of the
+                detection process.
+
+        Example:
+            ```python
+            detections = Detections(ims, pred, files, times, names, shape)
+            print(repr(detections))
+            ```
+
+        Note:
+            This function is particularly useful for debugging and logging purposes, providing a clear, concise summary of
+            the Detections object.
+        """
         return f"YOLOv5 {self.__class__} instance\n" + self.__str__()


 class Proto(nn.Module):
     # YOLOv5 mask Proto module for segmentation models
     def __init__(self, c1, c_=256, c2=32):
-        """Initializes YOLOv5 Proto module for segmentation with input, proto, and mask channels configuration."""
+        """
+        Initialize the YOLOv5 Proto module for segmentation models.
+
+        Args:
+            c1 (int): Number of input channels.
+            c_ (int): Number of intermediate channels, default is 256.
+            c2 (int): Number of output channels, default is 32.
+ + Example: + ```python + from ultralytics.models.common import Proto + proto = Proto(c1=64, c_=256, c2=32) + ``` + """ super().__init__() self.cv1 = Conv(c1, c_, k=3) self.upsample = nn.Upsample(scale_factor=2, mode="nearest") @@ -1057,7 +2301,25 @@ class Proto(nn.Module): self.cv3 = Conv(c_, c2) def forward(self, x): - """Performs a forward pass using convolutional layers and upsampling on input tensor `x`.""" + """ + Applies convolutional layers and upsampling to generate segmentation masks from input tensor `x`. + + Args: + x (torch.Tensor): Input feature map tensor with shape (N, C1, H, W) where N is the batch size, C1 is the number of + input channels, H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor with shape (N, C2, H_out, W_out), where C2 is the number of mask channels, and + H_out and W_out are the height and width after upsampling. + + Example: + ```python + proto_layer = Proto(c1=512, c_=256, c2=32) + input_tensor = torch.rand(1, 512, 64, 64) # Example input tensor + output_tensor = proto_layer(input_tensor) + print(output_tensor.shape) # Expected output shape: (1, 32, 128, 128) + ``` + """ return self.cv3(self.cv2(self.upsample(self.cv1(x)))) @@ -1066,8 +2328,26 @@ class Classify(nn.Module): def __init__( self, c1, c2, k=1, s=1, p=None, g=1, dropout_p=0.0 ): # ch_in, ch_out, kernel, stride, padding, groups, dropout probability - """Initializes YOLOv5 classification head with convolution, pooling, and dropout layers for input to output - channel transformation. + """ + Initializes the Classify module for YOLOv5, transforming input feature maps to classification scores. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels corresponding to the number of classes. + k (int): Convolutional kernel size. Defaults to 1. + s (int): Convolutional stride size. Defaults to 1. + p (int | None): Convolutional padding size. Defaults to None which implies automatic padding. + g (int): Number of groups in convolutional layer. Defaults to 1. + dropout_p (float): Dropout probability. Defaults to 0.0. + + Returns: + (None): This method initializes the Classify instance without returning any value. + + Example: + ```python + classify_head = Classify(c1=2048, c2=1000, k=1, s=1, dropout_p=0.5) + output = classify_head(input_tensor) # where input_tensor is of shape (B, 2048, 20, 20) + ``` """ super().__init__() c_ = 1280 # efficientnet_b0 size @@ -1077,7 +2357,26 @@ class Classify(nn.Module): self.linear = nn.Linear(c_, c2) # to x(b,c2) def forward(self, x): - """Processes input through conv, pool, drop, and linear layers; supports list concatenation input.""" + """ + Forward pass for the YOLOv5 classification head. + + This method takes an input tensor, applies convolution, pooling, and linear layers to produce classification scores. + + Args: + x (torch.Tensor | list[torch.Tensor]): Input tensor or list of tensors with shape (..., C_in, H, W), + where C_in is the number of input channels. + + Returns: + (torch.Tensor): Output tensor with shape (B, C_out), where B is the batch size and C_out is the number of classes. 
+ + Example: + ```python + classify_head = Classify(c1=2048, c2=1000, k=1, s=1, dropout_p=0.5) + input_tensor = torch.rand(8, 2048, 20, 20) # Example input tensor with shape (B, C_in, H, W) + output_tensor = classify_head(input_tensor) + print(output_tensor.shape) # Should output: torch.Size([8, 1000]) + ``` + """ if isinstance(x, list): x = torch.cat(x, 1) return self.linear(self.drop(self.pool(self.conv(x)).flatten(1))) diff --git a/models/experimental.py b/models/experimental.py index ab9b0ed23..45a69fe88 100644 --- a/models/experimental.py +++ b/models/experimental.py @@ -14,8 +14,19 @@ class Sum(nn.Module): """Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070.""" def __init__(self, n, weight=False): - """Initializes a module to sum outputs of layers with number of inputs `n` and optional weighting, supporting 2+ - inputs. + """ + Initialize the Sum module to aggregate outputs from multiple layers, optionally with weights. + + Args: + n (int): Number of layers to sum. Must be 2 or more. + weight (bool): If True, applies weights to the inputs before summing. + + Returns: + None + + Notes: + Refer to "Weighted sum of 2 or more layers" at https://arxiv.org/abs/1911.09070 for detailed insights + and usage scenarios. """ super().__init__() self.weight = weight # apply weights boolean @@ -24,7 +35,26 @@ class Sum(nn.Module): self.w = nn.Parameter(-torch.arange(1.0, n) / 2, requires_grad=True) # layer weights def forward(self, x): - """Processes input through a customizable weighted sum of `n` inputs, optionally applying learned weights.""" + """ + Compute a weighted or unweighted sum of input tensors. + + Args: + x (list[torch.Tensor]): List of input tensors to be summed, with each tensor having the same shape (N, D). + + Returns: + (torch.Tensor): The resulting tensor after summing the input tensors, maintaining the same shape (N, D). + + Example: + ```python + sum_layer = Sum(n=3, weight=False) + inputs = [torch.rand(1, 10), torch.rand(1, 10), torch.rand(1, 10)] + result = sum_layer.forward(inputs) + ``` + + Note: + If `weight` is set to True when initializing the class, weights will be applied to the inputs before summing. + For more information, refer to "Weighted sum of 2 or more layers" at https://arxiv.org/abs/1911.09070. + """ y = x[0] # no weight if self.weight: w = torch.sigmoid(self.w) * 2 @@ -40,8 +70,29 @@ class MixConv2d(nn.Module): """Mixed Depth-wise Conv https://arxiv.org/abs/1907.09595.""" def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True): - """Initializes MixConv2d with mixed depth-wise convolutional layers, taking input and output channels (c1, c2), - kernel sizes (k), stride (s), and channel distribution strategy (equal_ch). + """ + Initialize the MixConv2d module, handling mixed depth-wise convolutional operations. + + Args: + c1 (int): Number of input channels (C1). + c2 (int): Number of output channels (C2). + k (tuple[int]): Kernel sizes for the convolutional layers. + s (int): Stride value for the convolutional layers. + equal_ch (bool): Flag to determine if channels are distributed equally. True for equal channels per group, False + for equal weight.numel() per group. + + Example: + ```python + mixconv = MixConv2d(c1=32, c2=64, k=(1, 3, 5), s=1, equal_ch=True) + output = mixconv(input_tensor) + ``` + + Note: + The `MixConv2d` layer applies multiple depth-wise convolutions with different kernel sizes in parallel, which + can capture multi-scale features within a single layer. 
This technique is particularly useful for improving + spatial feature extraction and reducing model complexity. + + Further reading: https://arxiv.org/abs/1907.09595 """ super().__init__() n = len(k) # number of convolutions @@ -63,8 +114,24 @@ class MixConv2d(nn.Module): self.act = nn.SiLU() def forward(self, x): - """Performs forward pass by applying SiLU activation on batch-normalized concatenated convolutional layer - outputs. + """ + Perform forward pass by applying mixed depth-wise convolutions followed by batch normalization and SiLU activation. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor after applying mixed convolutions, batch normalization, and SiLU activation, + maintaining the shape (N, C', H', W') where C' is the output channels based on the convolutional layer + configuration. + + Example: + ```python + mixconv = MixConv2d(c1=32, c2=64, k=(1, 3), s=1) + x = torch.randn(16, 32, 128, 128) + output = mixconv(x) + ``` """ return self.act(self.bn(torch.cat([m(x) for m in self.m], 1))) @@ -73,11 +140,51 @@ class Ensemble(nn.ModuleList): """Ensemble of models.""" def __init__(self): - """Initializes an ensemble of models to be used for aggregated predictions.""" + """ + Initializes an ensemble of models for combined inference and aggregated predictions. + + Example: + ```python + ensemble = Ensemble() + model1 = MyModel1() + model2 = MyModel2() + ensemble.append(model1) + ensemble.append(model2) + ``` + """ super().__init__() def forward(self, x, augment=False, profile=False, visualize=False): - """Performs forward pass aggregating outputs from an ensemble of models..""" + """ + Aggregates outputs from multiple models in the ensemble by concatenating them during the forward pass. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels, + H is the height, and W is the width. + augment (bool): Flag to apply test-time augmentation (TTA) during inference. Default is False. + profile (bool): If True, enables profiling of the forward pass. Default is False. + visualize (bool): If True, enables visualization of model predictions. Default is False. + + Returns: + (torch.Tensor): Aggregated output tensor from the ensemble models, with shape dependent on the number of models + and their architectures. + + Example: + ```python + from ultralytics import Ensemble + import torch + + # Initialize the ensemble + ensemble = Ensemble() + # Assume models are already added to the ensemble + + # Create a dummy input tensor + x = torch.randn(8, 3, 640, 640) # Example input for 8 images of 3 channels and 640x640 resolution + + # Perform forward pass + output = ensemble.forward(x, augment=False, profile=False, visualize=False) + ``` + """ y = [module(x, augment, profile, visualize)[0] for module in self] # y = torch.stack(y).max(0)[0] # max ensemble # y = torch.stack(y).mean(0) # mean ensemble @@ -87,9 +194,32 @@ class Ensemble(nn.ModuleList): def attempt_load(weights, device=None, inplace=True, fuse=True): """ - Loads and fuses an ensemble or single YOLOv5 model from weights, handling device placement and model adjustments. - - Example inputs: weights=[a,b,c] or a single model weights=[a] or weights=a. + Loads and fuses a YOLOv5 model or an ensemble of models from provided weights, adjusting device placement and model + attributes for optimal performance. 
+ + Args: + weights (str | list[str]): Path(s) to model weight file(s). It can be a single path or a list of paths. + device (torch.device | None, optional): Device to load the model on. If None, loads on CPU by default. + inplace (bool, optional): If True, enables inplace operations in certain layers like activation layers. + Defaults to True. + fuse (bool, optional): Whether to fuse Conv2d + BatchNorm2d layers for speedup during inference. Defaults to True. + + Returns: + (torch.nn.Module): Loaded YOLOv5 model or an ensemble of models loaded onto the specified device. + + Example: + ```python + # Load a single model weight + model = attempt_load('yolov5s.pt') + + # Load an ensemble of models + model = attempt_load(['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt']) + ``` + + Note: + - This function ensures compatibility and performance optimization by adjusting attributes and configurations of the + loaded model(s). + - If `fuse` is set to True, it will fuse Conv2d and BatchNorm2d layers within the model(s) to speed up inference. """ from models.yolo import Detect, Model diff --git a/models/tf.py b/models/tf.py index 9884ec3db..b6b0f2f76 100644 --- a/models/tf.py +++ b/models/tf.py @@ -51,7 +51,28 @@ from utils.general import LOGGER, make_divisible, print_args class TFBN(keras.layers.Layer): # TensorFlow BatchNormalization wrapper def __init__(self, w=None): - """Initializes a TensorFlow BatchNormalization layer with optional pretrained weights.""" + """ + Initializes a TensorFlow BatchNormalization layer, optionally using pretrained weights for initialization. + + Args: + w (torch.nn.Module | None): PyTorch BatchNormalization layer whose weights are used to initialize the TensorFlow + BatchNormalization layer. If None, the BatchNormalization layer is initialized with default parameters. + + Returns: + (None): This constructor does not return any value. + + Example: + ```python + import torch.nn as nn + from tensorflow.keras import layers + + # Create a PyTorch batch normalization layer + torch_bn = nn.BatchNorm2d(num_features=64) + + # Initialize a TFBN layer with PyTorch BN weights + tf_bn = TFBN(w=torch_bn) + ``` + """ super().__init__() self.bn = keras.layers.BatchNormalization( beta_initializer=keras.initializers.Constant(w.bias.numpy()), @@ -62,7 +83,27 @@ class TFBN(keras.layers.Layer): ) def call(self, inputs): - """Applies batch normalization to the inputs.""" + """ + Apply batch normalization to the given inputs using pretrained weights. + + Args: + inputs (tf.Tensor): Input tensor to normalize, with shape (batch_size, ..., channels). + + Returns: + (tf.Tensor): Batch-normalized tensor with same shape as the input. + + Example: + ```python + # Assume `inputs` is a TensorFlow tensor with shape (N, H, W, C) + bn_layer = TFBN(w=pretrained_weights) + normalized_output = bn_layer.call(inputs) + ``` + + Note: + The `w` parameter used during initialization must be a PyTorch BatchNorm layer containing + pretrained weights. Ensure the `w` object has `bias`, `weight`, `running_mean`, `running_var`, + and `eps` attributes used for initializing the TFBN layer. + """ return self.bn(inputs) @@ -70,10 +111,28 @@ class TFPad(keras.layers.Layer): # Pad inputs in spatial dimensions 1 and 2 def __init__(self, pad): """ - Initializes a padding layer for spatial dimensions 1 and 2 with specified padding, supporting both int and tuple - inputs. - - Inputs are + Initialize a padding layer for spatial dimensions 1 and 2. + + Args: + pad (int | tuple[int, int]): Padding size for the spatial dimensions. 
If an integer is provided, the same + padding is applied symmetrically to the spatial dimensions. If a tuple is provided, it should contain two + integers representing padding for height and width respectively. + + Returns: + None + + Example: + ```python + # Using integer padding + padding_layer = TFPad(1) + + # Using tuple padding + padding_layer = TFPad((1, 2)) + ``` + + Note: + The padding is added to the input tensor in TensorFlow format, i.e., [[0, 0], [pad_height, pad_height], + [pad_width, pad_width], [0, 0]]. """ super().__init__() if isinstance(pad, int): @@ -82,7 +141,36 @@ class TFPad(keras.layers.Layer): self.pad = tf.constant([[0, 0], [pad[0], pad[0]], [pad[1], pad[1]], [0, 0]]) def call(self, inputs): - """Pads input tensor with zeros using specified padding, suitable for int and tuple pad dimensions.""" + """ + Pad an input tensor with zeros in specified spatial dimensions. + + Args: + inputs (tf.Tensor): Input tensor to be padded, with shape (N, H, W, C). + + Returns: + (tf.Tensor): Padded tensor with shape (N, H + 2 * pad_height, W + 2 * pad_width, C). + + Example: + ```python + import tensorflow as tf + from your_module import TFPad + + # Create a sample input tensor with shape (1, 5, 5, 1) + input_tensor = tf.random.normal((1, 5, 5, 1)) + + # Using integer padding + padding_layer = TFPad(1) + output_tensor = padding_layer.call(input_tensor) + + # Using tuple padding + padding_layer = TFPad((1, 2)) + output_tensor = padding_layer.call(input_tensor) + ``` + + Note: + The padding is added to the input tensor in TensorFlow format, i.e., [[0, 0], [pad_height, pad_height], + [pad_width, pad_width], [0, 0]]. + """ return tf.pad(inputs, self.pad, mode="constant", constant_values=0) @@ -90,10 +178,27 @@ class TFConv(keras.layers.Layer): # Standard convolution def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None): """ - Initializes a standard convolution layer with optional batch normalization and activation; supports only - group=1. - - Inputs are ch_in, ch_out, weights, kernel, stride, padding, groups. + Performs a standard 2D convolution with optional batch normalization and activation in TensorFlow. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int, optional): Kernel size of the convolution. Default is 1. + s (int, optional): Stride of the convolution. Default is 1. + p (int | None, optional): Padding size. If None, padding is automatically determined. Default is None. + g (int, optional): Number of groups for grouped convolution. Default is 1. Note: must be 1 for TF. + act (bool, optional): Boolean to include activation. Default is True. + w (torch.nn.Module | None, optional): Pretrained weights from a PyTorch model to initialize the layer. Default is None. + + Returns: + None: This function initializes an instance of the TFConv class. + + Example: + ```python + tf_conv = TFConv(c1=32, c2=64, k=3, s=1, w=pretrained_weights) + ``` + Note: + TF v2.2 Conv2D does not support the 'groups' argument (must be 1). """ super().__init__() assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument" @@ -113,7 +218,28 @@ class TFConv(keras.layers.Layer): self.act = activations(w.act) if act else tf.identity def call(self, inputs): - """Applies convolution, batch normalization, and activation function to input tensors.""" + """ + Apply convolution, batch normalization, and activation to input tensors. 
+ + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C) where N is the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Output tensor after applying convolution, batch normalization, and activation, + maintaining shape (N, H, W, C). + + Example: + ```python + input_tensor = tf.random.normal((1, 224, 224, 3)) + conv_layer = TFConv(c1=3, c2=16, k=3, s=1) + output_tensor = conv_layer(input_tensor) + ``` + + Note: + This method calls the `call` method of the internal sequential layers consisting of padding (if stride + isn't 1), convolution, batch normalization (if enabled), and activation function (if enabled). + """ return self.act(self.bn(self.conv(inputs))) @@ -121,10 +247,28 @@ class TFDWConv(keras.layers.Layer): # Depthwise convolution def __init__(self, c1, c2, k=1, s=1, p=None, act=True, w=None): """ - Initializes a depthwise convolution layer with optional batch normalization and activation for TensorFlow - models. - - Input are ch_in, ch_out, weights, kernel, stride, padding, groups. + Initialize a depthwise convolution layer with optional batch normalization and activation for TensorFlow models. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. Must be a multiple of `c1`. + k (int, optional): Size of the convolution kernel. Default is 1. + s (int, optional): Stride of the convolution. Default is 1. + p (int | tuple[int, int] | None, optional): Padding size; supports both integer and tuple inputs. Default is None. + act (bool, optional): Whether to apply an activation function. Default is True. + w (object | None, optional): Pretrained weights. Default is None. + + Returns: + (None): This constructor does not return any values. + + Example: + ```python + import keras + from models.tf import TFDWConv + + # Initialize the layer + conv_layer = TFDWConv(c1=32, c2=64, k=3, s=1, p=1, act=True, w=pretrained_weights) + ``` """ super().__init__() assert c2 % c1 == 0, f"TFDWConv() output={c2} must be a multiple of input={c1} channels" @@ -142,7 +286,35 @@ class TFDWConv(keras.layers.Layer): self.act = activations(w.act) if act else tf.identity def call(self, inputs): - """Applies convolution, batch normalization, and activation function to input tensors.""" + """ + Applies depthwise convolution, batch normalization, and an activation function to the input tensors. + + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C), representing a batch of images. + + Returns: + (tf.Tensor): Resulting tensor after the depthwise convolution, batch normalization, and activation are applied, + with shape (N, H', W', C') depending on the convolution parameters. + + Example: + ```python + import tensorflow as tf + from models.tf import TFDWConv + + # Dummy input tensor with shape (batch_size, height, width, channels) + inputs = tf.random.normal([8, 32, 32, 32]) + + # Initialize depthwise convolution layer + conv_layer = TFDWConv(c1=32, c2=64, k=3, s=1, p=1, act=True) + + # Apply depthwise convolution + outputs = conv_layer(inputs) + ``` + + Note: + Padding is added to the input tensor in TensorFlow format, i.e., [[0, 0], [pad_height, pad_height], [pad_width, + pad_width], [0, 0]]. + """ return self.act(self.bn(self.conv(inputs))) @@ -150,9 +322,37 @@ class TFDWConvTranspose2d(keras.layers.Layer): # Depthwise ConvTranspose2d def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0, w=None): """ - Initializes depthwise ConvTranspose2D layer with specific channel, kernel, stride, and padding settings. 
- - Inputs are ch_in, ch_out, weights, kernel, stride, padding, groups. + Initialize a Depthwise ConvTranspose2D layer with specific channel, kernel, stride, and padding configurations. + + Args: + c1 (int): Number of input channels; must equal `c2`. + c2 (int): Number of output channels; must equal `c1`. + k (int): Kernel size; currently supports only `k=4`. + s (int): Stride size for the transposed convolution. + p1 (int): Padding applied to the original input; currently supports only `p1=1`. + p2 (int): Additional padding applied to the transposed output. + w (torch.nn.Module): Pre-trained weights, including both kernel and bias, for initialization. + + Returns: + (None): This constructor does not return any values. + + Example: + ```python + import tensorflow as tf + from models.tf import TFDWConvTranspose2d + + # Define input tensor + input_tensor = tf.random.normal([1, 64, 64, 32]) + + # Initialize the TFDWConvTranspose2d layer + depthwise_conv_transpose2d = TFDWConvTranspose2d(c1=32, c2=32, k=4, s=2, p1=1, p2=0, w=pretrained_weights) + + # Apply the layer + output_tensor = depthwise_conv_transpose2d(input_tensor) + ``` + + Note: + This layer is designed for depthwise convolution with specific constraints on kernel size and initial padding. """ super().__init__() assert c1 == c2, f"TFDWConv() output={c2} must be equal to input={c1} channels" @@ -174,7 +374,33 @@ class TFDWConvTranspose2d(keras.layers.Layer): ] def call(self, inputs): - """Processes input through parallel convolutions and concatenates results, trimming border pixels.""" + """ + Perform upsampling using depthwise transposed convolution, followed by concatenation across channels. + + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C1), where N is batch size, H is height, W is width, + and C1 is the number of input channels. + + Returns: + (tf.Tensor): Output tensor after applying depthwise ConvTranspose2D and concatenation, with shape + (N, (H-1)*stride + kernel_size, (W-1)*stride + kernel_size, C1). After upsampling, 1 pixel is cropped + from the border of the output to match expected dimensions. + + Example: + ```python + # Define input tensor + input_tensor = tf.random.normal([1, 64, 64, 32]) + + # Initialize the TFDWConvTranspose2d layer + depthwise_conv_transpose2d_layer = TFDWConvTranspose2d(c1=32, c2=32, k=4, s=2, p1=1, p2=0, w=w) + + # Apply the layer + output_tensor = depthwise_conv_transpose2d_layer(input_tensor) + ``` + + Note: + This function handles specific kernel size (k=4) and padding constraints (p1=1) for depthwise ConvTranspose2D. + """ return tf.concat([m(x) for m, x in zip(self.conv, tf.split(inputs, self.c1, 3))], 3)[:, 1:-1, 1:-1] @@ -182,19 +408,53 @@ class TFFocus(keras.layers.Layer): # Focus wh information into c-space def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None): """ - Initializes TFFocus layer to focus width and height information into channel space with custom convolution - parameters. - - Inputs are ch_in, ch_out, kernel, stride, padding, groups. + Initializes TFFocus layer to focus width and height information into channel space with custom convolution parameters. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int, optional): Size of the convolutional kernel. Default is 1. + s (int, optional): Stride value for the convolutional layer. Default is 1. + p (int | None, optional): Padding value. If None, will be automatically determined based on `k`. Default is None. 
+ g (int, optional): Number of groups for the convolution. Default is 1. + act (bool, optional): Whether to use an activation layer. Default is True. + w (torch.nn.Module | None, optional): Pre-trained weight object containing convolution, batch norm, and activation + layers. Default is None. + + Returns: + None + + Example: + ```python + focus_layer = TFFocus(c1=3, c2=64, k=3, s=1, p=1, act=True) + output = focus_layer(inputs) + ``` + + Note: + Ensure that the input tensor dimensions match the expected values for width and height focusing to operate correctly. """ super().__init__() self.conv = TFConv(c1 * 4, c2, k, s, p, g, act, w.conv) def call(self, inputs): """ - Performs pixel shuffling and convolution on input tensor, downsampling by 2 and expanding channels by 4. - - Example x(b,w,h,c) -> y(b,w/2,h/2,4c). + Perform pixel shuffling and convolution on the input tensor, converting spatial dimensions into channel space. + + Args: + inputs (tf.Tensor): Input tensor with shape (B, H, W, C), where B is the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Output tensor after pixel shuffling and convolution, with shape (B, H/2, W/2, 4C). + + Example: + ```python + focus_layer = TFFocus(c1=32, c2=64, k=1, s=1) + output = focus_layer(inputs) # inputs should be a tensor with shape (B, H, W, C) + ``` + + Note: + Ensure input tensor dimensions match expected values for correct width and height focusing. """ inputs = [inputs[:, ::2, ::2, :], inputs[:, 1::2, ::2, :], inputs[:, ::2, 1::2, :], inputs[:, 1::2, 1::2, :]] return self.conv(tf.concat(inputs, 3)) @@ -204,10 +464,38 @@ class TFBottleneck(keras.layers.Layer): # Standard bottleneck def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, w=None): """ - Initializes a standard bottleneck layer for TensorFlow models, expanding and contracting channels with optional - shortcut. - - Arguments are ch_in, ch_out, shortcut, groups, expansion. + Initialize a standard bottleneck layer for TensorFlow models, typically used for residual connections in a network. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + shortcut (bool, optional): Whether to include a shortcut connection. Default is True. + g (int, optional): Number of groups for group convolution. Default is 1. + e (float, optional): Expansion factor for hidden channels. Default is 0.5. + w (object, optional): Pretrained weights from a PyTorch model to initialize the layer. Default is None. + + Returns: + None + + Example: + ```python + import tensorflow as tf + + # Initialize the TFBottleneck layer + c1, c2 = 64, 128 + bottleneck_layer = TFBottleneck(c1, c2) + + # Define input tensor + inputs = tf.random.normal([1, 32, 32, c1]) + + # Apply the bottleneck layer + outputs = bottleneck_layer(inputs) + print(outputs.shape) # Expected output shape: (1, 32, 32, c2) + ``` + + Note: + The bottleneck layer can be customized using pretrained weights for improved performance. Ensure the input tensor + dimensions match the expected values when applying the bottleneck transformation. """ super().__init__() c_ = int(c2 * e) # hidden channels @@ -216,8 +504,27 @@ class TFBottleneck(keras.layers.Layer): self.add = shortcut and c1 == c2 def call(self, inputs): - """Performs forward pass; if shortcut is True & input/output channels match, adds input to the convolution - result. + """ + Perform forward pass of the TFBottleneck module. 
+ + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C), where N is the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Output tensor with shape (N, H, W, C2) where C2 is the number of output channels after the + bottleneck operation. + + Example: + ```python + bottleneck = TFBottleneck(64, 128, shortcut=True) + x = tf.random.uniform((1, 128, 128, 64)) + y = bottleneck(x) + ``` + + Note: + If `self.add` is True, the function will add the input tensor to the convolution output. This typically occurs + when the input and output channels are the same, and the `shortcut` parameter is set to True. """ return inputs + self.cv2(self.cv1(inputs)) if self.add else self.cv2(self.cv1(inputs)) @@ -225,7 +532,33 @@ class TFBottleneck(keras.layers.Layer): class TFCrossConv(keras.layers.Layer): # Cross Convolution def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False, w=None): - """Initializes cross convolution layer with optional expansion, grouping, and shortcut addition capabilities.""" + """ + Perform an enhanced cross convolution operation with optional expansion, grouping, and shortcut addition capabilities. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size for the convolution operations. Default is 3. + s (int): Stride size for the convolution operations. Default is 1. + g (int): Number of groups for the grouped convolution. Default is 1. + e (float): Expansion coefficient for intermediate channels. Default is 1.0. + shortcut (bool): Whether to apply a shortcut connection (residual connection). Default is False. + w (object | None): Pretrained weights object containing convolution parameters. Default is None. + + Returns: + None: This constructor initializes an instance of the TFCrossConv class. + + Example: + ```python + cross_conv_layer = TFCrossConv(c1=32, c2=64, k=5, s=2, w=pretrained_weights) + output = cross_conv_layer(inputs) # Input tensor should have a shape compatible with these parameters + ``` + + Note: + The cross convolution operation applies a two-step convolutions with different kernel shapes `(1, k)` and `(k, 1)`, + preceded by an optional expansion through an intermediate layer. When `shortcut` is True, the input is directly + added to the output of the two-step convolutions. + """ super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = TFConv(c1, c_, (1, k), (1, s), w=w.cv1) @@ -233,15 +566,75 @@ class TFCrossConv(keras.layers.Layer): self.add = shortcut and c1 == c2 def call(self, inputs): - """Passes input through two convolutions optionally adding the input if channel dimensions match.""" + """ + Perform cross convolution operations on input tensors. + + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C), where N is the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Tensor after applying cross convolution operations, with shape (N, H, W, C2) where C2 is the + number of output channels. 
+
+        Example:
+            ```python
+            import tensorflow as tf
+
+            # Define input tensor
+            input_tensor = tf.random.normal([1, 64, 64, 32])
+
+            # Initialize the TFCrossConv layer
+            cross_conv_layer = TFCrossConv(c1=32, c2=64, k=3, s=1, g=1, e=1.0, shortcut=True)
+
+            # Apply the layer
+            output_tensor = cross_conv_layer(input_tensor)
+            print(output_tensor.shape)  # Expected output shape: (1, 64, 64, 64)
+            ```
+
+        Note:
+            If `shortcut` is True and the number of input channels (`c1`) equals the number of output channels (`c2`),
+            a shortcut connection is added between the input and the output.
+        """
         return inputs + self.cv2(self.cv1(inputs)) if self.add else self.cv2(self.cv1(inputs))


 class TFConv2d(keras.layers.Layer):
     # Substitution for PyTorch nn.Conv2D
     def __init__(self, c1, c2, k, s=1, g=1, bias=True, w=None):
-        """Initializes a TensorFlow 2D convolution layer, mimicking PyTorch's nn.Conv2D functionality for given filter
-        sizes and stride.
+        """
+        Initialize a TensorFlow Conv2D layer, mimicking the behavior of PyTorch's Conv2D, with optional pretrained weights.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            k (int | tuple[int, int]): Size of the convolutional kernel.
+            s (int, optional): Stride size. Defaults to 1.
+            g (int, optional): Number of groups. Only supported value is 1. Defaults to 1.
+            bias (bool, optional): Whether to include a bias term. Defaults to True.
+            w (torch.nn.Module | None, optional): Pretrained weights taken from a PyTorch model. Defaults to None.
+
+        Returns:
+            None: This constructor initializes the Conv2D layer within the class.
+
+        Example:
+            ```python
+            import torch
+            from models.tf import TFConv2d
+
+            # Define parameters
+            c1, c2, k, s = 3, 64, 3, 1
+
+            # Pretrained weights from a PyTorch model
+            pretrained_weights = torch.nn.Conv2d(c1, c2, k)
+
+            # Initialize TFConv2d layer
+            conv_layer = TFConv2d(c1=c1, c2=c2, k=k, s=s, bias=True, w=pretrained_weights)
+            ```
+
+        Note:
+            `keras.layers.Conv2D` in TF v2.2 does not support the 'groups' argument, which limits `g` to 1.
+
         """
         super().__init__()
         assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument"
@@ -256,7 +649,38 @@ class TFConv2d(keras.layers.Layer):
         )

     def call(self, inputs):
-        """Applies a convolution operation to the inputs and returns the result."""
+        """
+        Apply a convolution operation to the input tensor.
+
+        Args:
+            inputs (tf.Tensor): Input tensor with shape (B, H, W, C), where B is the batch size, H is the height,
+                W is the width, and C is the number of channels.
+
+        Returns:
+            (tf.Tensor): Tensor resulting from the convolution operation, with shape (B, H_out, W_out, C_out) where H_out and
+                W_out are the output height and width, and C_out is the number of output channels.
+
+        Example:
+            ```python
+            import tensorflow as tf
+            from models.tf import TFConv2d
+
+            # Example input tensor
+            input_tensor = tf.random.normal([1, 32, 32, 3])
+
+            # Initialize Conv2D layer
+            conv2d_layer = TFConv2d(c1=3, c2=16, k=3, s=1, bias=True, w=pretrained_weights)
+
+            # Apply Conv2D layer
+            output_tensor = conv2d_layer(input_tensor)
+            ```
+
+        Note:
+            This layer uses TensorFlow's Conv2D operation to simulate PyTorch's nn.Conv2D behavior. It supports only
+            single-group convolutions (`g=1`) and padding is added manually when necessary.
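+
+            As a further illustration (an assumed usage pattern, not part of this module), an input coming from a
+            PyTorch-style NCHW pipeline can be transposed to the NHWC layout that these TensorFlow layers expect:
+            ```python
+            import numpy as np
+            import tensorflow as tf
+
+            nchw = np.random.rand(1, 3, 32, 32).astype(np.float32)  # PyTorch-style (N, C, H, W) array
+            nhwc = tf.convert_to_tensor(nchw.transpose(0, 2, 3, 1))  # TensorFlow-style (N, H, W, C) tensor
+            ```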
+ """ return self.conv(inputs) @@ -264,10 +688,29 @@ class TFBottleneckCSP(keras.layers.Layer): # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None): """ - Initializes CSP bottleneck layer with specified channel sizes, count, shortcut option, groups, and expansion - ratio. - - Inputs are ch_in, ch_out, number, shortcut, groups, expansion. + Initializes CSP bottleneck layer with specified input/output channels, layer count, and network topology options. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + n (int): Number of bottleneck layers. Default is 1. + shortcut (bool): Whether to use shortcut connections or not. + g (int): Number of groups for group convolution. Default is 1. + e (float): Expansion ratio for hidden layers. Default is 0.5. + w (object): Weights container to initialize the layers. + + Returns: + (keras.layers.Layer): Constructed TensorFlow layer with CSP bottleneck configuration. + + Example: + ```python + csp_bottleneck = TFBottleneckCSP(c1=64, c2=128, n=2, shortcut=True, g=1, e=0.5, w=weights) + output = csp_bottleneck(inputs) + ``` + + Note: + Uses `TFConv` and `TFConv2d` for convolution operations, `TFBN` for batch normalization, and Keras Swish + activation by default. This is based on the Cross Stage Partial Networks (CSPNet) architecture. """ super().__init__() c_ = int(c2 * e) # hidden channels @@ -280,8 +723,36 @@ class TFBottleneckCSP(keras.layers.Layer): self.m = keras.Sequential([TFBottleneck(c_, c_, shortcut, g, e=1.0, w=w.m[j]) for j in range(n)]) def call(self, inputs): - """Processes input through the model layers, concatenates, normalizes, activates, and reduces the output - dimensions. + """ + Applies a CSP (Cross Stage Partial Networks) Bottleneck convolutional block to the input tensor. + + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C), where N represents the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Output tensor after applying the CSP bottleneck block, which maintains the same batch size and + spatial dimensions but with modified channel dimensions depending on the convolutions' configurations. + + Example: + ```python + import tensorflow as tf + from yolov5_models import TFBottleneckCSP + + # Define input tensor with shape (batch_size, height, width, channels) + inputs = tf.random.normal([1, 128, 128, 64]) + + # Initialize TFBottleneckCSP layer + bottleneck_csp_layer = TFBottleneckCSP(c1=64, c2=128, n=1, shortcut=True, g=1, e=0.5) + + # Apply the layer to input tensor + outputs = bottleneck_csp_layer(inputs) + print(outputs.shape) # Expected output shape: (1, 128, 128, 128) + ``` + + Note: + The CSP architecture helps in strengthening gradient flow across the network, improving training dynamics, and + ensuring efficient parameter utilization. """ y1 = self.cv3(self.m(self.cv1(inputs))) y2 = self.cv2(inputs) @@ -292,9 +763,31 @@ class TFC3(keras.layers.Layer): # CSP Bottleneck with 3 convolutions def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None): """ - Initializes CSP Bottleneck with 3 convolutions, supporting optional shortcuts and group convolutions. - - Inputs are ch_in, ch_out, number, shortcut, groups, expansion. + Perform CSP bottleneck operations with 3 convolutions, supporting optional shortcuts and group convolutions. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. 
+            n (int): Number of bottleneck layers to apply.
+            shortcut (bool): Determines whether to use shortcuts. Default is True.
+            g (int): Number of groups for convolutions. Default is 1.
+            e (float): Expansion ratio for bottleneck channels. Default is 0.5.
+            w (torch.nn.Module | None): Pretrained weights from a PyTorch model to initialize the TensorFlow layer. Default is None.
+
+        Returns:
+            None: This method initializes the TFC3 layer.
+
+        Example:
+            ```python
+            # Example usage of TFC3
+            tfc3_layer = TFC3(c1=64, c2=128, n=3, shortcut=True, g=1, e=0.5)
+            input_tensor = tf.random.normal([1, 64, 64, 64])
+            output_tensor = tfc3_layer(input_tensor)
+            ```
+
+        Note:
+            This layer implements the CSPNet architecture with 3 convolutions, integrated in TensorFlow. It suits models
+            that require efficient channel-wise transformations while maintaining the original network dimensions.
         """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
@@ -305,9 +798,23 @@
     def call(self, inputs):
         """
-        Processes input through a sequence of transformations for object detection (YOLOv5).
-
-        See https://github.com/ultralytics/yolov5.
+        Perform a forward pass through the CSP Bottleneck layer with 3 convolutions.
+
+        Args:
+            inputs (tf.Tensor): Input tensor to the layer, with shape (batch_size, height, width, channels).
+
+        Returns:
+            (tf.Tensor): Output tensor produced after applying the CSP Bottleneck transformations.
+
+        Example:
+            ```python
+            csp_bottleneck = TFC3(c1=64, c2=128, n=2, shortcut=True, g=1, e=0.5)
+            output_tensor = csp_bottleneck(input_tensor)
+            ```
+
+        Note:
+            This layer is part of the Ultralytics YOLOv5 model configuration for TensorFlow.
+            See https://github.com/ultralytics/yolov5 for more details.
         """
         return self.cv3(tf.concat((self.m(self.cv1(inputs)), self.cv2(inputs)), axis=3))

@@ -316,9 +823,23 @@ class TFC3x(keras.layers.Layer):
     # 3 module with cross-convolutions
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None):
         """
-        Initializes layer with cross-convolutions for enhanced feature extraction in object detection models.
-
-        Inputs are ch_in, ch_out, number, shortcut, groups, expansion.
+        Initialize a C3 module variant that uses cross-convolutions for enhanced feature extraction in object detection models.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            n (int, optional): Number of CrossConv layers. Defaults to 1.
+            shortcut (bool, optional): Whether to use shortcut connection. Defaults to True.
+            g (int, optional): Number of groups for grouped convolution. Defaults to 1.
+            e (float, optional): Expansion ratio. Defaults to 0.5.
+            w (object, optional): Pretrained weights from a PyTorch model.
+
+        Returns:
+            None
+
+        Note:
+            This class is part of the TensorFlow, Keras, and TFLite versions of YOLOv5 as authored in
+            https://github.com/ultralytics/yolov5/pull/1127. For usage, see https://github.com/ultralytics/yolov5.
         """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
@@ -330,14 +851,56 @@ class TFC3x(keras.layers.Layer):
         )

     def call(self, inputs):
-        """Processes input through cascaded convolutions and merges features, returning the final tensor output."""
+        """
+        Process input through cross-convolutions and merge features for enhanced detection.
+
+        Args:
+            inputs (tf.Tensor): Input tensor with shape (batch_size, height, width, channels).
+
+        Returns:
+            (tf.Tensor): Output tensor after processing through cross-convolutions and feature merging, with shape
+                (batch_size, new_height, new_width, new_channels).
+ + Example: + ```python + tfc3x_layer = TFC3x(c1=64, c2=128, n=3, shortcut=True, g=1, e=0.5) + input_tensor = tf.random.normal([1, 64, 64, 64]) + output_tensor = tfc3x_layer(input_tensor) + ``` + + Note: + This class is part of the TensorFlow, Keras, and TFLite versions of YOLOv5. See https://github.com/ultralytics/yolov5 + for more information. + """ return self.cv3(tf.concat((self.m(self.cv1(inputs)), self.cv2(inputs)), axis=3)) class TFSPP(keras.layers.Layer): # Spatial pyramid pooling layer used in YOLOv3-SPP def __init__(self, c1, c2, k=(5, 9, 13), w=None): - """Initializes a YOLOv3-SPP layer with specific input/output channels and kernel sizes for pooling.""" + """ + Initialize a spatial pyramid pooling (SPP) layer for YOLO models. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (tuple[int, int, int]): Kernel sizes for the spatial pooling layers. Default is (5, 9, 13). + w (object | None): Weights from a pretrained model. Default is None. + + Returns: + None + + Example: + ```python + yolo_spp = TFSPP(c1=256, c2=512, k=(5, 9, 13), w=pretrained_weights) + ``` + + Notes: + The SPP layer is designed to increase the receptive field by applying a series of max pooling operations with + large kernel sizes, improving the detection of objects at various scales in YOLO models. + """ super().__init__() c_ = c1 // 2 # hidden channels self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1) @@ -345,7 +908,28 @@ class TFSPP(keras.layers.Layer): self.m = [keras.layers.MaxPool2D(pool_size=x, strides=1, padding="SAME") for x in k] def call(self, inputs): - """Processes input through two TFConv layers and concatenates with max-pooled outputs at intermediate stage.""" + """ + Perform spatial pyramid pooling (SPP) on the input tensor to extract multi-scale features. + + Args: + inputs (tf.Tensor): Input tensor from the previous layer with shape (B, H, W, C), where B is the batch size, + H is the height, W is the width, and C is the number of input channels. + + Returns: + (tf.Tensor): Output tensor with multi-scale features, after applying SPP and concatenation. The shape of + the output tensor will be (B, H, W, c2), where c2 is the number of output channels. + + Example: + ```python + spp_layer = TFSPP(c1=256, c2=512, k=(5, 9, 13)) + output = spp_layer(inputs) + ``` + + Note: + The layer performs convolution and max pooling with different pool sizes before concatenating the results + for enhanced feature extraction. This is typically used in object detection models like YOLO for capturing + multi-scale context. + """ x = self.cv1(inputs) return self.cv2(tf.concat([x] + [m(x) for m in self.m], 3)) @@ -353,8 +937,26 @@ class TFSPP(keras.layers.Layer): class TFSPPF(keras.layers.Layer): # Spatial pyramid pooling-Fast layer def __init__(self, c1, c2, k=5, w=None): - """Initializes a fast spatial pyramid pooling layer with customizable in/out channels, kernel size, and - weights. + """ + Initialize a TFSPPF (Spatial Pyramid Pooling-Fast) layer with specified parameters. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size for max pooling. Default is 5. + w (None | dict): Weights to initialize the layer. A dictionary containing the necessary weights for the layers. + + Returns: + None: This method does not return anything as it initializes the layer in place. 
+ + Example: + ```python + tf_sppf = TFSPPF(c1=256, c2=512, k=5, w=weights) + ``` + + Note: + This TFSPPF layer is specifically designed for YOLOv5 architecture, offering a faster variant of spatial pyramid + pooling by using fewer layers for efficiency while maintaining performance. """ super().__init__() c_ = c1 // 2 # hidden channels @@ -363,8 +965,24 @@ class TFSPPF(keras.layers.Layer): self.m = keras.layers.MaxPool2D(pool_size=k, strides=1, padding="SAME") def call(self, inputs): - """Executes the model's forward pass, concatenating input features with three max-pooled versions before final - convolution. + """ + Perform spatial pyramid pooling-Fast (SPPF) on input tensors, concatenating pooled features with the original tensor. + + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C) for batch size N, height H, width W, and channels C. + + Returns: + (tf.Tensor): Output tensor with shape (N, H, W, C_out), where C_out is the number of output channels. + + Example: + ```python + layer = TFSPPF(c1=256, c2=512, k=5) + output = layer(inputs) # inputs should be a tensor of shape (N, H, W, 256) + ``` + + Note: + This TFSPPF layer is specifically designed for YOLOv5 architecture, offering a faster variant of spatial pyramid + pooling by using fewer layers for efficiency while maintaining performance. """ x = self.cv1(inputs) y1 = self.m(x) @@ -375,8 +993,27 @@ class TFSPPF(keras.layers.Layer): class TFDetect(keras.layers.Layer): # TF YOLOv5 Detect layer def __init__(self, nc=80, anchors=(), ch=(), imgsz=(640, 640), w=None): - """Initializes YOLOv5 detection layer for TensorFlow with configurable classes, anchors, channels, and image - size. + """ + Initializes YOLOv5 detection layer for TensorFlow. + + Args: + nc (int, optional): Number of classes. Defaults to 80. + anchors (tuple, optional): Tuple of anchor box dimensions. Defaults to (). + ch (tuple, optional): Number of input channels for each detection layer. Defaults to (). + imgsz (tuple[int, int], optional): Input image size as (height, width). Defaults to (640, 640). + w (object, optional): Weights object containing pretrained weight tensors and other parameters. + + Returns: + None + + Note: + This detection layer forms part of the YOLOv5 architecture for object detection tasks in TensorFlow, handling the + prediction of bounding boxes and class probabilities for detected objects. + + Example: + ```python + detection_layer = TFDetect(nc=80, anchors=((10, 13, 16, 30, 33, 23),), ch=(256, 512, 1024), imgsz=(640, 640)) + ``` """ super().__init__() self.stride = tf.convert_to_tensor(w.stride.numpy(), dtype=tf.float32) @@ -395,7 +1032,28 @@ class TFDetect(keras.layers.Layer): self.grid[i] = self._make_grid(nx, ny) def call(self, inputs): - """Performs forward pass through the model layers to predict object bounding boxes and classifications.""" + """ + Perform object detection computations using inputs from multiple feature layers, applying activation, convolution, and + grid-based adjustments to generate final output. + + Args: + inputs (list[tf.Tensor]): List of input tensors from multiple feature layers, each with shape (B, H, W, C), + where B is batch size, H is height, W is width, and C is the number of channels. + + Returns: + (tf.Tensor): Final processed tensor with object detection information, shape (B, N, 85), where N is the number + of predictions and 85 represents the output features (4 box coords + 1 objectness score + 80 class scores). 
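+
+            The width of this last dimension is in general no = nc + 5; the value 85 above corresponds to the default
+            nc = 80 classes, e.g.:
+            ```python
+            nc = 80      # number of classes (default assumed above)
+            no = nc + 5  # 4 box coordinates + 1 objectness score + nc class scores -> 85
+            ```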
+ + Example: + ```python + detection_layer = TFDetect(nc=80, anchors=((10, 13, 16, 30, 33, 23),), ch=(256, 512, 1024), imgsz=(640, 640)) + output = detection_layer([feature_map1, feature_map2, feature_map3]) + ``` + + Note: + Ensure input tensors have consistent shapes aligned with the detection layer configuration. The function + reshapes, normalizes, and concatenates features from each input tensor to produce final detection outputs. + """ z = [] # inference output x = [] for i in range(self.nl): @@ -420,7 +1078,21 @@ class TFDetect(keras.layers.Layer): @staticmethod def _make_grid(nx=20, ny=20): - """Generates a 2D grid of coordinates in (x, y) format with shape [1, 1, ny*nx, 2].""" + """ + Generate a 2D coordinate grid for anchors with shape (1, 1, ny*nx, 2). + + Args: + nx (int): Number of grid anchors along the x-axis. Default is 20. + ny (int): Number of grid anchors along the y-axis. Default is 20. + + Returns: + (tf.Tensor): A tensor containing the 2D coordinate grid with shape (1, 1, ny*nx, 2). + + Example: + ```python + grid = TFDetect._make_grid(20, 20) + ``` + """ # return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float() xv, yv = tf.meshgrid(tf.range(nx), tf.range(ny)) return tf.cast(tf.reshape(tf.stack([xv, yv], 2), [1, 1, ny * nx, 2]), dtype=tf.float32) @@ -429,8 +1101,32 @@ class TFDetect(keras.layers.Layer): class TFSegment(TFDetect): # YOLOv5 Segment head for segmentation models def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), imgsz=(640, 640), w=None): - """Initializes YOLOv5 Segment head with specified channel depths, anchors, and input size for segmentation - models. + """ + Initialize the YOLOv5 segmentation head for TensorFlow models. + + Args: + nc (int): Number of classes for segmentation. + anchors (list[float]): List of anchor boxes used in YOLOv5, this should be an iterable containing anchor sizes. + nm (int): Number of segmentation masks. + npr (int): Number of prototypes. + ch (list[int]): List of input channels for each detection layer. + imgsz (tuple[int, int]): Image size in the format (height, width). + w (object): Pretrained weights for initializing the model. + + Returns: + None + + Example: + ```python + import tensorflow as tf + from your_module import TFSegment + + segmentor = TFSegment(nc=80, anchors=[[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], + nm=32, npr=256, ch=[256, 512, 1024], imgsz=(640, 640), w=weights) + ``` + + Note: + The 'w' parameter is critical for performance as it utilizes pretrained weights to enhance segmentation accuracy. """ super().__init__(nc, anchors, ch, imgsz, w) self.nm = nm # number of masks @@ -441,7 +1137,30 @@ class TFSegment(TFDetect): self.detect = TFDetect.call def call(self, x): - """Applies detection and proto layers on input, returning detections and optionally protos if training.""" + """ + Perform segmentation using the YOLOv5 segmentation head for TensorFlow models. + + Args: + x (list[tf.Tensor]): Input feature maps from backbone network. + + Returns: + (tuple[tf.Tensor]): A tuple containing: + - detections (tf.Tensor): Detection predictions with shape (N, num_detections, 5 + num_classes + num_masks), + where N is the batch size. + - proto (tf.Tensor): Prototype masks with shape (N, num_prototypes, height, width). 
+ + Example: + ```python + # Assuming 'backbone_features' is a list of TensorFlow tensors from the backbone network + segmentor = TFSegment(nc=80, anchors=[[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], + nm=32, npr=256, ch=[256, 512, 1024], imgsz=(640, 640), w=weights) + detections, proto = segmentor(backbone_features) + ``` + + Note: + The method processes the input feature maps to produce object detection predictions and prototype masks used in + segmentation tasks. + """ p = self.proto(x[0]) # p = TFUpsample(None, scale_factor=4, mode='nearest')(self.proto(x[0])) # (optional) full-size protos p = tf.transpose(p, [0, 3, 1, 2]) # from shape(1,160,160,32) to shape(1,32,160,160) @@ -451,8 +1170,26 @@ class TFSegment(TFDetect): class TFProto(keras.layers.Layer): def __init__(self, c1, c_=256, c2=32, w=None): - """Initializes TFProto layer with convolutional and upsampling layers for feature extraction and - transformation. + """ + Initializes TFProto layer with convolutional and upsampling layers for feature extraction and transformation. + + Args: + c1 (int): Number of input channels. + c_ (int): Number of hidden channels, default is 256. + c2 (int): Number of output channels, default is 32. + w (object | None): Pretrained weights for initializing the convolutional layers. If None, layers are initialized + with default settings. + + Returns: + (TFProto): Instance of the TFProto layer, ready for use in a TensorFlow model. + + Example: + ```python + tf_proto_layer = TFProto(c1=128) + ``` + + Note: + This layer is designed to be a part of the YOLOv5 model pipeline, specifically for segmenting image features. """ super().__init__() self.cv1 = TFConv(c1, c_, k=3, w=w.cv1) @@ -461,7 +1198,24 @@ class TFProto(keras.layers.Layer): self.cv3 = TFConv(c_, c2, w=w.cv3) def call(self, inputs): - """Performs forward pass through the model, applying convolutions and upscaling on input tensor.""" + """ + Handles forwarding through convolutional and upsampling layers to generate mask prototypes in TF models. + + Args: + inputs (tf.Tensor): A tensor with shape (N, H, W, C), where N is the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): A tensor with the transformed features, having shape (N, H_new, W_new, C2) where H_new and W_new + are the new height and width after processing, and C2 is the number of output channels. + + Example: + ```python + tf_proto_layer = TFProto(c1=128) + input_tensor = tf.random.normal([1, 64, 64, 128]) + output_tensor = tf_proto_layer(input_tensor) + ``` + """ return self.cv3(self.cv2(self.upsample(self.cv1(inputs)))) @@ -469,10 +1223,25 @@ class TFUpsample(keras.layers.Layer): # TF version of torch.nn.Upsample() def __init__(self, size, scale_factor, mode, w=None): """ - Initializes a TensorFlow upsampling layer with specified size, scale_factor, and mode, ensuring scale_factor is - even. - - Warning: all arguments needed including 'w' + Initialize a TensorFlow upsampling layer. + + Args: + size (tuple[int] | None): Desired output size. Default is `None`. + scale_factor (int | None): Multiplier for the height and width of the input. Must be even. Default is `None`. + mode (str): Upsampling algorithm to use. Options are ('nearest', 'bilinear', etc.). + w (torch.nn.Module | None): Placeholder for compatibility. Default is `None`. 
+ + Returns: + None + + Example: + ```python + upsample_layer = TFUpsample(size=None, scale_factor=2, mode="nearest") + result = upsample_layer(input_tensor) + ``` + + Note: + Ensure that 'scale_factor' is a multiple of 2. """ super().__init__() assert scale_factor % 2 == 0, "scale_factor must be multiple of 2" @@ -483,25 +1252,130 @@ class TFUpsample(keras.layers.Layer): # size=(x.shape[1] * 2, x.shape[2] * 2)) def call(self, inputs): - """Applies upsample operation to inputs using nearest neighbor interpolation.""" + """ + Perform nearest neighbor upsampling on input tensors using the specified scale factor and mode in TensorFlow. + + Args: + inputs (tf.Tensor): Input tensor to be upsampled, typically with shape (B, H, W, C) where B is the batch size, + H is the height, W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Upsampled tensor with dimensions equal to original dimensions multiplied by the scale factor. + The output tensor will have a shape of (B, H * scale_factor, W * scale_factor, C). + + Example: + ```python + import tensorflow as tf + from your_module import TFUpsample + + upsample_layer = TFUpsample(size=None, scale_factor=2, mode="nearest") + input_tensor = tf.random.normal([1, 64, 64, 32]) + output_tensor = upsample_layer(input_tensor) + print(output_tensor.shape) # Expected output shape: (1, 128, 128, 32) + ``` + + Note: + Ensure that 'scale_factor' is a multiple of 2. This layer resizes the spatial dimensions (height and width) + of the input tensor by the specified scale factor. + """ return self.upsample(inputs) class TFConcat(keras.layers.Layer): # TF version of torch.concat() def __init__(self, dimension=1, w=None): - """Initializes a TensorFlow layer for NCHW to NHWC concatenation, requiring dimension=1.""" + """ + Initializes a TensorFlow layer for concatenating tensors along the specified dimension. + + Args: + dimension (int, optional): The dimension along which to concatenate tensors. Default is 1, converting + from NCHW to NHWC format. + w (torch.nn.Module | None, optional): Pretrained weights from a PyTorch model to match the dimension + order. Default is None. + + Returns: + None + + Example: + ```python + import tensorflow as tf + from models.tf import TFConcat + + # Initialize the concatenate layer + concat_layer = TFConcat(dimension=1) + + # Concatenate two sample tensors along the specified dimension + tensor1 = tf.random.normal([1, 256, 256, 64]) + tensor2 = tf.random.normal([1, 256, 256, 64]) + output_tensor = concat_layer([tensor1, tensor2]) + ``` + + Note: + This class ensures compatibility between PyTorch and TensorFlow tensor formats by handling only the + concatenation of NCHW (PyTorch) to NHWC (TensorFlow). + """ super().__init__() assert dimension == 1, "convert only NCHW to NHWC concat" self.d = 3 def call(self, inputs): - """Concatenates a list of tensors along the last dimension, used for NCHW to NHWC conversion.""" + """ + Concatenates input tensors along the last dimension, converting from NCHW to NHWC format. + + Args: + inputs (list[tf.Tensor]): List of input tensors in NCHW format to be concatenated. + + Returns: + (tf.Tensor): Concatenated tensor in NHWC format. 
+ + Example: + ```python + concat_layer = TFConcat() + input1 = tf.random.normal([1, 64, 32, 32]) + input2 = tf.random.normal([1, 64, 32, 32]) + output = concat_layer([input1, input2]) + ``` + """ return tf.concat(inputs, self.d) def parse_model(d, ch, model, imgsz): - """Parses a model definition dict `d` to create YOLOv5 model layers, including dynamic channel adjustments.""" + """ + Parses the model configuration dictionary to create YOLOv5 model layers with dynamic channel adjustments. + + This function processes the model configuration, initializing layers and setting up the neural network architecture + for YOLOv5 model training and inference in TensorFlow. + + Args: + d (dict): Model configuration dictionary containing anchor boxes, class counts, depth multiple, width multiple, + and backbone/head layer details. + ch (list[int]): List of input channels for each layer. + model (object): Existing model object containing pretrained weights and other parameters. + imgsz (tuple[int, int]): Input image size as (height, width). + + Returns: + (list[keras.Sequential]): Parsed list of keras.Sequential model layers set up for YOLOv5, with adjusted channels. + + Example: + ```python + model_config = { + 'anchors': [[116, 90, 156, 198, 373, 326], [30, 61, 62, 45, 59, 119], [10, 13, 16, 30, 33, 23]], + 'nc': 80, 'depth_multiple': 0.33, 'width_multiple': 0.50, + 'backbone': [ + [-1, 1, 'Conv', [32, 3, 1]], + [-1, 1, 'C3', [64, 3, 0.5]], + ], + 'head': [ + [-1, 1, 'SPPF', [256, 5]], + ], + } + input_channels = [3] + parsed_model_layers = parse_model(model_config, input_channels, pretrained_model, (640, 640)) + ``` + + Note: + This function converts PyTorch model layers to TensorFlow Keras layers, maintaining parameter consistency. + """ LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}") anchors, nc, gd, gw, ch_mul = ( d["anchors"], @@ -583,8 +1457,28 @@ def parse_model(d, ch, model, imgsz): class TFModel: # TF YOLOv5 model def __init__(self, cfg="yolov5s.yaml", ch=3, nc=None, model=None, imgsz=(640, 640)): - """Initializes TF YOLOv5 model with specified configuration, channels, classes, model instance, and input - size. + """ + Initialize a TensorFlow YOLOv5 model with specified configuration, input channels, and classes. + + Args: + cfg (str | dict): Model configuration, either a file path to a yaml file or a dictionary containing network + structure and parameters. + ch (int): Number of input channels. + nc (int | None): Number of classes for detection tasks. + model (torch.nn.Module | None): PyTorch model instance to map to TensorFlow model structure. + imgsz (tuple[int, int]): Input image size as a tuple of (height, width). + + Returns: + None + + Example: + ```python + tf_model = TFModel(cfg='yolov5s.yaml', ch=3, nc=80, imgsz=(640, 640)) + ``` + + Note: + This model supports YOLOv5 architectures and is compatible with TensorFlow and Keras frameworks. + Ensure to provide properly formatted configuration files or dictionaries for successful model initialization. """ super().__init__() if isinstance(cfg, dict): @@ -612,7 +1506,38 @@ class TFModel: iou_thres=0.45, conf_thres=0.25, ): - """Runs inference on input data, with an option for TensorFlow NMS.""" + """ + Perform prediction on input data using the TensorFlow YOLOv5 model, optionally applying non-max suppression. + + Args: + inputs (tf.Tensor): Input tensor containing the image data, shape (B, H, W, C). + tf_nms (bool): Apply TensorFlow non-max suppression after prediction. Default is False. 
+            agnostic_nms (bool): Class-agnostic non-max suppression. Default is False.
+            topk_per_class (int): Top-K maximum detections per class. Default is 100.
+            topk_all (int): Top-K maximum total detections. Default is 100.
+            iou_thres (float): Intersection-over-union (IoU) threshold for NMS. Default is 0.45.
+            conf_thres (float): Confidence score threshold for filtering predictions. Default is 0.25.
+
+        Returns:
+            (tuple[tf.Tensor]): If `tf_nms` is True, returns the post-NMS results as a tuple of
+                (boxes, scores, classes, valid_detections), with boxes in (x1, y1, x2, y2) format. If `tf_nms` is False,
+                returns the raw output tensors from the model without non-max suppression.
+
+        Example:
+            ```python
+            import tensorflow as tf
+            from models.tf import TFModel
+
+            # Initialize the model
+            model = TFModel(cfg='yolov5s.yaml', ch=3, nc=80)
+
+            # Prepare input tensor
+            img = tf.random.normal([1, 640, 640, 3])
+
+            # Perform prediction
+            predictions = model.predict(img, tf_nms=True)
+            ```
+        """
         y = []  # outputs
         x = inputs
         for m in self.model.layers:
@@ -645,8 +1570,25 @@ class TFModel:
 
     @staticmethod
     def _xywh2xyxy(xywh):
-        """Converts bounding box format from [x, y, w, h] to [x1, y1, x2, y2], where xy1=top-left and xy2=bottom-
-        right.
+        """
+        Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2].
+
+        Args:
+            xywh (tf.Tensor): Bounding boxes in the format (x, y, w, h) with shape (N, 4) where N is the number of boxes.
+
+        Returns:
+            (tf.Tensor): Bounding boxes in the format (x1, y1, x2, y2) with shape (N, 4), where x1, y1 are the top-left
+            coordinates and x2, y2 are the bottom-right coordinates.
+
+        Notes:
+            This method is useful for converting bounding box formats for various operations like plotting, non-maximum
+            suppression (NMS), or further model predictions.
+
+        Examples:
+            ```python
+            boxes_xywh = tf.constant([[50.0, 50.0, 100.0, 100.0], [30.0, 40.0, 120.0, 80.0]])
+            boxes_xyxy = TFModel._xywh2xyxy(boxes_xywh)
+            ```
         """
         x, y, w, h = tf.split(xywh, num_or_size_splits=4, axis=-1)
         return tf.concat([x - w / 2, y - h / 2, x + w / 2, y + h / 2], axis=-1)
@@ -655,7 +1597,40 @@ class TFModel:
 
 class AgnosticNMS(keras.layers.Layer):
     # TF Agnostic NMS
     def call(self, input, topk_all, iou_thres, conf_thres):
-        """Performs agnostic NMS on input tensors using given thresholds and top-K selection."""
+        """
+        Perform class-agnostic non-maximum suppression (NMS) on input bounding boxes.
+
+        Args:
+            input (tuple[tf.Tensor, tf.Tensor, tf.Tensor]): Tuple containing:
+                boxes (tf.Tensor): Bounding boxes with shape (B, N, 4), where N is the number of boxes per image.
+                classes (tf.Tensor): Per-class probabilities with shape (B, N, C), where C is the number of classes.
+                scores (tf.Tensor): Confidence scores with shape (B, N, C).
+            topk_all (int): Maximum number of final boxes to keep after non-max suppression.
+            iou_thres (float): Intersection over union (IoU) threshold for NMS.
+            conf_thres (float): Confidence threshold to filter boxes before NMS.
+
+        Returns:
+            (tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]): Four tensors containing:
+                boxes (tf.Tensor): Padded bounding boxes after NMS, with shape (B, topk_all, 4).
+                scores (tf.Tensor): Scores of the kept boxes, with shape (B, topk_all).
+                classes (tf.Tensor): Class indices of the kept boxes, with shape (B, topk_all).
+                valid_detections (tf.Tensor): Number of valid detections per image, with shape (B,).
+
+        Example:
+            ```python
+            boxes = tf.random.uniform((1, 100, 4), minval=0, maxval=640)
+            classes = tf.random.uniform((1, 100, 80), minval=0, maxval=1)
+            scores = tf.random.uniform((1, 100, 80), minval=0, maxval=1)
+            nms_layer = AgnosticNMS()
+            final_boxes, final_scores, final_classes, valid_detections = nms_layer(
+                (boxes, classes, scores), topk_all=20, iou_thres=0.5, conf_thres=0.25)
+            ```
+
+        Note:
+            The layer operates on single-anchor format inputs, performing class-agnostic NMS to reduce redundancy
+            in detected bounding boxes.
+        """
         return tf.map_fn(
             lambda x: self._nms(x, topk_all, iou_thres, conf_thres),
             input,
@@ -665,8 +1640,39 @@ class AgnosticNMS(keras.layers.Layer):
 
     @staticmethod
     def _nms(x, topk_all=100, iou_thres=0.45, conf_thres=0.25):
-        """Performs agnostic non-maximum suppression (NMS) on detected objects, filtering based on IoU and confidence
-        thresholds.
+        """
+        Perform class-agnostic non-maximum suppression on the predictions for a single image.
+
+        Args:
+            x (tuple[tf.Tensor, tf.Tensor, tf.Tensor]): A tuple containing bounding boxes (N, 4), class probabilities (N, C)
+                where C is the number of classes, and scores (N, C) where N is the number of predictions.
+            topk_all (int): The maximum number of detections to keep.
+            iou_thres (float): Intersection over Union (IoU) threshold for NMS.
+            conf_thres (float): Confidence threshold for filtering low-confidence predictions.
+
+        Returns:
+            tuple: A tuple containing:
+                - tf.Tensor: Padded bounding boxes with shape (topk_all, 4).
+                - tf.Tensor: Padded scores with shape (topk_all,).
+                - tf.Tensor: Padded class indices with shape (topk_all,).
+                - tf.Tensor: Number of valid detections as a scalar int32 tensor.
+
+        Example:
+            ```python
+            boxes = tf.random.uniform((100, 4), minval=0, maxval=640)
+            classes = tf.random.uniform((100, 5), minval=0, maxval=1)
+            scores = tf.random.uniform((100, 5), minval=0, maxval=1)
+            topk_all = 50
+            iou_thres = 0.5
+            conf_thres = 0.3
+
+            padded_boxes, padded_scores, padded_classes, valid_detections = AgnosticNMS._nms(
+                (boxes, classes, scores), topk_all, iou_thres, conf_thres)
+            ```
+
+        Note:
+            This function treats detections as class-agnostic, suppressing overlapping boxes without regard to class
+            labels during NMS.
         """
         boxes, classes, scores = x
         class_inds = tf.cast(tf.argmax(classes, axis=-1), tf.float32)
@@ -700,7 +1706,25 @@ class AgnosticNMS(keras.layers.Layer):
 
 
 def activations(act=nn.SiLU):
-    """Converts PyTorch activations to TensorFlow equivalents, supporting LeakyReLU, Hardswish, and SiLU/Swish."""
+    """
+    Convert PyTorch activation functions to their TensorFlow equivalents.
+
+    Args:
+        act (torch.nn.Module, optional): Activation function instance from PyTorch. Default is nn.SiLU.
+
+    Returns:
+        (callable): A TensorFlow-compatible activation function.
+
+    Example:
+        ```python
+        tf_activation = activations(nn.LeakyReLU(0.1))
+        output = tf_activation(input_tensor)
+        ```
+
+    Note:
+        Supports the conversion of LeakyReLU, Hardswish, and SiLU (Swish) activation functions. For unsupported types,
+        raises an error.
+    """
     if isinstance(act, nn.LeakyReLU):
         return lambda x: keras.activations.relu(x, alpha=0.1)
     elif isinstance(act, nn.Hardswish):
@@ -712,8 +1736,29 @@ def activations(act=nn.SiLU):
 
 
 def representative_dataset_gen(dataset, ncalib=100):
-    """Generates a representative dataset for calibration by yielding transformed numpy arrays from the input
-    dataset.
+    """
+    Generates a representative dataset for calibration by yielding transformed numpy arrays from the input dataset.
+
+    Args:
+        dataset (iterable): Dataset to yield images for calibration.
Each item in the dataset should be a tuple containing + (path, img, im0s, vid_cap, string), where 'img' is the image represented as a numpy array with shape (C, H, W). + ncalib (int): Number of samples to yield for calibration (default is 100). + + Returns: + (generator): A generator yielding a list of numpy arrays, each representing an image with shape (1, H, W, C) scaled and + preprocessed for model calibration. + + Example: + ```python + dataset = DataLoader(...) # define your dataset + data_gen = representative_dataset_gen(dataset, ncalib=50) + for calibration_data in data_gen: + # perform calibration + ``` + + Notes: + - The function stops yielding data once ncalib samples have been produced from the dataset. + - Images are converted from shape (C, H, W) to (1, H, W, C) and scaled to a range of [0, 1]. """ for n, (path, img, im0s, vid_cap, string) in enumerate(dataset): im = np.transpose(img, [1, 2, 0]) @@ -731,7 +1776,27 @@ def run( dynamic=False, # dynamic batch size ): # PyTorch model - """Exports YOLOv5 model from PyTorch to TensorFlow and Keras formats, performing inference for validation.""" + """ + Exports YOLOv5 model from PyTorch to TensorFlow and Keras formats, performing inference for validation. + + Args: + weights (str | pathlib.Path): Path to the weights file. Default is ROOT / "yolov5s.pt". + imgsz (tuple[int, int]): Tuple of integers representing the height and width of the image for inference. Default is (640, 640). + batch_size (int): Size of the batch for inference. Default is 1. + dynamic (bool): Flag to indicate if dynamic batch size should be used in Keras model. Default is False. + + Returns: + None: The function exports the model and performs inference without returning any value. + + Example: + ```python + run(weights='best.pt', imgsz=(640, 640), batch_size=1, dynamic=False) + ``` + + Note: + - Ensure you have the necessary dependencies installed (`torch`, `tensorflow`, `keras`). + - Adjust the `weights` path, `imgsz`, `batch_size`, and `dynamic` flag as needed for your setup. + """ im = torch.zeros((batch_size, 3, *imgsz)) # BCHW image model = attempt_load(weights, device=torch.device("cpu"), inplace=True, fuse=False) _ = model(im) # inference @@ -751,8 +1816,34 @@ def run( def parse_opt(): - """Parses and returns command-line options for model inference, including weights path, image size, batch size, and - dynamic batching. + """ + Parse command-line arguments for model inference configuration. + + This utility function parses command-line arguments to configure the inference properties such as paths to weight files, + image sizes, batch sizes, and dynamic batch size options. + + Args: + None + + Returns: + (argparse.Namespace): Namespace object containing parsed command-line options: + - weights (str): Path to the model weights. + - imgsz (list[int]): Inference image size (height, width). + - batch_size (int): Batch size for inference. + - dynamic (bool): Whether to use dynamic batch size. + + Example: + ```python + opt = parse_opt() + print(opt.weights) + print(opt.imgsz) + print(opt.batch_size) + print(opt.dynamic) + ``` + + Note: + The --imgsz argument accepts either a single integer or a tuple of two integers. If only one value is provided, + it will be duplicated to form a square shape (height, width). 
""" parser = argparse.ArgumentParser() parser.add_argument("--weights", type=str, default=ROOT / "yolov5s.pt", help="weights path") @@ -766,7 +1857,28 @@ def parse_opt(): def main(opt): - """Executes the YOLOv5 model run function with parsed command line options.""" + """ + Execute the main function to run model export and validation processes for YOLOv5, including conversion to TensorFlow + and Keras formats. + + Args: + opt (argparse.Namespace): Parsed command-line arguments which include: + - weights (str): Path to the model weights. + - imgsz (list[int]): Inference image size (height, width). + - batch_size (int): Batch size for inference. + - dynamic (bool): Whether to use dynamic batch size. + + Example: + ```python + if __name__ == "__main__": + opt = parse_opt() + main(opt) + ``` + + Note: + This function integrates and validates the conversion of YOLOv5 from PyTorch to TensorFlow and Keras frameworks. + For additional export options, refer to the export.py script. + """ run(**vars(opt)) diff --git a/models/yolo.py b/models/yolo.py index d89c5da01..1d8da8273 100644 --- a/models/yolo.py +++ b/models/yolo.py @@ -76,7 +76,27 @@ class Detect(nn.Module): export = False # export mode def __init__(self, nc=80, anchors=(), ch=(), inplace=True): - """Initializes YOLOv5 detection layer with specified classes, anchors, channels, and inplace operations.""" + """ + Initializes the YOLOv5 Detect layer with class count, anchors, channels, and inplace operations. + + Args: + nc (int, optional): Number of classes. Default is 80. + anchors (tuple, optional): Anchor box dimensions, typically specified for each detection layer. Default is (). + ch (tuple, optional): Number of input channels for each detection layer. Default is (). + inplace (bool, optional): If True, operations are done inplace. Default is True. + + Returns: + None + + Example: + ```python + detect_layer = Detect(nc=80, anchors=(), ch=(256, 512, 1024), inplace=True) + ``` + + Note: + This function initializes detection heads in the YOLOv5 model, setting up convolution layers, grids, and + anchor grids required for object detection inference. + """ super().__init__() self.nc = nc # number of classes self.no = nc + 5 # number of outputs per anchor @@ -89,7 +109,23 @@ class Detect(nn.Module): self.inplace = inplace # use inplace ops (e.g. slice assignment) def forward(self, x): - """Processes input through YOLOv5 layers, altering shape for detection: `x(bs, 3, ny, nx, 85)`.""" + """ + Processes input through detection layers, reshaping and applying convolution for YOLOv5 inference. + + Args: + x (list[torch.Tensor]): List of feature maps from backbone with shape (B, C, H, W) where B is the batch + size, C is the number of channels, and H and W are height and width. + + Returns: + (list[torch.Tensor]): List of processed detections, each a torch Tensor with shape (B, N, D) where B + is the batch size, N is the number of detections, and D is the dimensions of each detection + (e.g., bounding box coordinates, objectness score, class probabilities). + + Note: + This method applies a series of convolutions to transform the input feature maps into detection + outputs. It also handles reshaping and permutation to align with YOLOv5's output format. During + inference, additional steps are performed to compute final object locations and dimensions. 
+ """ z = [] # inference output for i in range(self.nl): x[i] = self.m[i](x[i]) # conv @@ -115,7 +151,29 @@ class Detect(nn.Module): return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x) def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, "1.10.0")): - """Generates a mesh grid for anchor boxes with optional compatibility for torch versions < 1.10.""" + """ + Generate a mesh grid for anchor boxes with torch version compatibility for detection models. + + Args: + nx (int): Number of grid cells along the x-axis. + ny (int): Number of grid cells along the y-axis. + i (int): Index of the detection layer for which the grid is being generated. + torch_1_10 (bool): Indicator whether the torch version is at least 1.10.0 for meshgrid compatibility. + + Returns: + (tuple[torch.Tensor, torch.Tensor]): A tuple containing two tensors: + - grid (torch.Tensor): The generated grid with shape (1, num_anchors, ny, nx, 2), containing xy coordinates. + - anchor_grid (torch.Tensor): The anchor grid scaled by the stride, with shape (1, num_anchors, ny, nx, 2). + + Example: + ```python + detector = Detect() + grid, anchor_grid = detector._make_grid(20, 20, 0) + ``` + + Note: + The function ensures compatibility with different torch versions by using appropriate meshgrid indexing options. + """ d = self.anchors[i].device t = self.anchors[i].dtype shape = 1, self.na, ny, nx, 2 # grid shape @@ -129,7 +187,25 @@ class Detect(nn.Module): class Segment(Detect): # YOLOv5 Segment head for segmentation models def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), inplace=True): - """Initializes YOLOv5 Segment head with options for mask count, protos, and channel adjustments.""" + """ + Initializes YOLOv5 Segment head with parameters for masks, prototypes, class count, anchors, and channels. + + Args: + nc (int): Number of classes for the segmentation model (default is 80). + anchors (tuple): Tuple of anchor box dimensions for the segmentation model. + nm (int): Number of masks for the segmentation (default is 32). + npr (int): Number of prototypes for the masks (default is 256). + ch (tuple): Tuple of input channels for each detection layer. + inplace (bool): If True, use in-place operations for layer computations (default is True). + + Returns: + None + + Example: + ```python + segment_head = Segment(nc=80, anchors=anchors, nm=32, npr=256, ch=[512, 256, 128], inplace=True) + ``` + """ super().__init__(nc, anchors, ch, inplace) self.nm = nm # number of masks self.npr = npr # number of protos @@ -139,8 +215,38 @@ class Segment(Detect): self.detect = Detect.forward def forward(self, x): - """Processes input through the network, returning detections and prototypes; adjusts output based on - training/export mode. + """ + Processes input through the network, returning detections and prototypes. + + Args: + x (list[torch.Tensor]): List of input tensors corresponding to different detection layers, each with shape + (B, C, H, W), where B is batch size, C is number of channels, H and W are height and width. + + Returns: + (tuple[torch.Tensor, torch.Tensor]): A tuple containing: + - `detection` (torch.Tensor): The detection output tensor with shape (B, N, 85), where B is batch size, N is + the number of detections. + - `prototypes` (torch.Tensor): The prototype masks tensor produced by the network with shape (B, P, H', W'), + where B is batch size, P is the number of prototypes, and H' and W' correspond to height and width. 
+
+        Example:
+            ```python
+            import torch
+            from models.yolo import SegmentationModel
+
+            # Initialize a segmentation model; its detection head is a Segment module
+            model = SegmentationModel(cfg="yolov5s-seg.yaml")
+
+            # Generate dummy input
+            x = torch.randn(1, 3, 640, 640)
+
+            # Forward pass invokes Segment.forward on the head's feature maps
+            detections, prototypes = model(x)
+            ```
+
+        Note:
+            During inference (evaluation mode), detection outputs are post-processed to generate final bounding boxes and classes.
+            In training mode, the outputs are not processed.
         """
         p = self.proto(x[0])
         x = self.detect(self, x)
@@ -151,13 +257,64 @@ class BaseModel(nn.Module):
     """YOLOv5 base model."""
 
     def forward(self, x, profile=False, visualize=False):
-        """Executes a single-scale inference or training pass on the YOLOv5 base model, with options for profiling and
-        visualization.
+        """
+        Perform a forward pass through the YOLOv5 model, optionally profiling and visualizing features.
+
+        Args:
+            x (torch.Tensor): Input data tensor with shape (N, C, H, W).
+            profile (bool): Whether to profile execution time of each layer. Defaults to False.
+            visualize (bool): Whether to store and visualize feature maps. Defaults to False.
+
+        Returns:
+            (torch.Tensor | tuple): In training mode, returns a list of per-layer predictions, each with shape (N, 3, H, W, no).
+                In inference mode, returns a tuple whose first element is a tensor of shape (N, M, no), where M is the
+                total number of candidate predictions made before non-maximum suppression (NMS).
+
+        Example:
+            ```python
+            model = DetectionModel(cfg="yolov5s.yaml")  # any BaseModel subclass
+            input_tensor = torch.randn(1, 3, 640, 640)
+            output = model.forward(input_tensor, profile=True)
+            ```
+
+        Note:
+            - In training mode, the method returns unprocessed predictions for each scale, suitable for loss calculation.
+            - In inference mode, non-maximum suppression (NMS) is applied separately, after this forward pass, to refine predictions.
         """
         return self._forward_once(x, profile, visualize)  # single-scale inference, train
 
     def _forward_once(self, x, profile=False, visualize=False):
-        """Performs a forward pass on the YOLOv5 model, enabling profiling and feature visualization options."""
+        """
+        Execute a forward pass through the YOLOv5 model layers with optional profiling and visualization.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is the batch size, C is the number
+                of channels, and H and W are the height and width of the input image, respectively.
+            profile (bool): If True, profiles the execution time for each layer. Defaults to False.
+            visualize (bool): If True, stores and visualizes feature maps. Defaults to False.
+
+        Returns:
+            (torch.Tensor): Model output whose shape depends on whether the model is in training or
+                inference mode.
+                - In training mode: Returns a list of tensors for each detection layer, each tensor has shape
+                  (N, 3, H, W, no), where `no` is the number of outputs per anchor.
+                - In inference mode: If not exporting, returns a tuple of the concatenated predictions with shape
+                  (N, M, no) and the raw per-layer outputs, where M is the number of candidate predictions.
+                - If exporting: Returns a tensor of shape (N, M, no).
+
+        Example:
+            ```python
+            model = DetectionModel(cfg="yolov5s.yaml")  # any BaseModel subclass
+            input_tensor = torch.randn(1, 3, 640, 640)  # Generate a random input tensor
+            output = model._forward_once(input_tensor, profile=True)
+            ```
+
+        Note:
+            This method conducts a single-scale inference or training pass through the model. Depending on the mode
+            (training or inference), the method behaves differently. In training mode, it returns unprocessed
+            predictions for each detection layer. In inference mode, non-maximum suppression (NMS) is typically
+            applied after this method to refine predictions.
+ """ y, dt = [], [] # outputs for m in self.model: if m.f != -1: # if not from previous layer @@ -171,7 +328,32 @@ class BaseModel(nn.Module): return x def _profile_one_layer(self, m, x, dt): - """Profiles a single layer's performance by computing GFLOPs, execution time, and parameters.""" + """ + Profiles a single model layer's GFLOPs, parameters, and execution time within the YOLOv5 model. + + Args: + m (nn.Module): The model layer to be profiled. + x (torch.Tensor): Input tensor passed to the model layer, with shape (N, C, H, W). + dt (list[float]): List to record execution times of the profiled layer. + + Returns: + None: The function updates the `dt` list with the execution time of the layer in milliseconds. + + Example: + ```python + model = BaseModel() + layer = nn.Conv2d(3, 16, 3, 1) # Example layer + input_tensor = torch.randn(1, 3, 640, 640) # Example input + execution_times = [] + + model._profile_one_layer(layer, input_tensor, execution_times) + ``` + + Note: + - Profiling is done for the purpose of understanding the computational load (GFLOPs) and time taken per layer within + the YOLOv5 model. + - If the `thop` library is not available, FLOPs computation will not be performed. + """ c = m == self.model[-1] # is final layer, copy input as inplace fix o = thop.profile(m, inputs=(x.copy() if c else x,), verbose=False)[0] / 1e9 * 2 if thop else 0 # FLOPs t = time_sync() @@ -185,7 +367,25 @@ class BaseModel(nn.Module): LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s} Total") def fuse(self): - """Fuses Conv2d() and BatchNorm2d() layers in the model to improve inference speed.""" + """ + Fuses Conv2d and BatchNorm2d layers in the model to optimize inference speed. + + This method modifies the model in place by merging Conv2d and BatchNorm2d layers into single Conv2d + layers where applicable. This can significantly improve inference speed and reduce memory usage. + + Returns: + None + + Example: + ```python + model = BaseModel() + model.fuse() + ``` + + Note: + After fusing layers, the forward method of fused layers is updated to `forward_fuse`, optimizing + the execution path. + """ LOGGER.info("Fusing layers... ") for m in self.model.modules(): if isinstance(m, (Conv, DWConv)) and hasattr(m, "bn"): @@ -196,12 +396,44 @@ class BaseModel(nn.Module): return self def info(self, verbose=False, img_size=640): - """Prints model information given verbosity and image size, e.g., `info(verbose=True, img_size=640)`.""" + """ + Display model summary, including layer details and computational complexity for a specified image size. + + Args: + verbose (bool): If True, prints a detailed summary including information about each layer. Defaults to False. + img_size (int | tuple[int]): Size of the input image as an integer (for square images) or tuple (H, W). + Defaults to 640. + + Returns: + (None): This function does not return any value. It directly prints the model summary to the console. + + Example: + ```python + model = BaseModel() + model.info(verbose=True, img_size=640) + ``` + + Note: + Ensure that the `verbose` parameter is set to True for a comprehensive layer-by-layer summary. The image size should + be supplied based on the expected input size for the model. + """ model_info(self, verbose, img_size) def _apply(self, fn): - """Applies transformations like to(), cpu(), cuda(), half() to model tensors excluding parameters or registered - buffers. + """ + Apply a function to the model and its layer parameters, including specific modifications for Detect and Segment layers. 
+ + Args: + fn (function): A function to apply to the model's tensors. + + Returns: + (torch.nn.Module): The module with applied transformations. + + Note: + The function is particularly useful for operations like converting tensors to a target device + (e.g., CUDA, CPU) or altering their precision (e.g., float16). The Detect layer's stride and grid + parameters, as well as the Segment layer's anchor grids, are specifically modified to ensure consistency + after such transformations. """ self = super()._apply(fn) m = self.model[-1] # Detect() @@ -216,7 +448,36 @@ class BaseModel(nn.Module): class DetectionModel(BaseModel): # YOLOv5 detection model def __init__(self, cfg="yolov5s.yaml", ch=3, nc=None, anchors=None): - """Initializes YOLOv5 model with configuration file, input channels, number of classes, and custom anchors.""" + """ + Initializes YOLOv5 model using the specified config, input channels, class count, and custom anchors. + + Args: + cfg (str | dict): Model configuration, either a path to a YAML config file or a configuration dictionary. + ch (int): Number of input channels. Defaults to 3. + nc (int | None): Number of classes. If provided, overrides the value in the YAML file/config dictionary. Defaults to None. + anchors (list[int] | None): Custom anchors. If provided, overrides the anchors defined in the YAML file/config + dictionary. Defaults to None. + + Returns: + None + + Example: + ```python + from ultralytics.models.yolo import DetectionModel + + # Initialize model with path to YAML config + model1 = DetectionModel(cfg="yolov5s.yaml") + + # Initialize model with configuration dictionary + cfg_dict = {"nc": 80, "depth_multiple": 0.33, "width_multiple": 0.50} + model2 = DetectionModel(cfg=cfg_dict, ch=3, nc=80) + ``` + + Note: + If `cfg` is a dictionary, it should include the necessary parameters such as `nc`, `depth_multiple`, and `width_multiple`. + During initialization, the model configuration from the YAML file or dictionary is parsed, and the internal model + structure is built accordingly. This includes defining the detection layers and adjusting anchors and strides. + """ super().__init__() if isinstance(cfg, dict): self.yaml = cfg # model dict @@ -261,13 +522,64 @@ class DetectionModel(BaseModel): LOGGER.info("") def forward(self, x, augment=False, profile=False, visualize=False): - """Performs single-scale or augmented inference and may include profiling or visualization.""" + """ + Perform forward pass through the YOLOv5 detection model for training or inference, with options for augmentation, + profiling, and visualization. + + Args: + x (torch.Tensor): Input tensor with a shape of (N, C, H, W), where N is the batch size, C is the number of channels, + H is the height, and W is the width. + augment (bool): If True, performs augmented inference. Defaults to False. + profile (bool): If True, profiles the execution time of each layer. Defaults to False. + visualize (bool): If True, stores and visualizes feature maps. Defaults to False. + + Returns: + (torch.Tensor | tuple): Depending on the mode, returns either: + - In training mode: tuple containing predictions for each scale with shapes (N, 3, H, W, no). + - In inference mode: tensor with shape (N, M, no), where M is the number of predicted objects after + non-maximum suppression. + - When exporting: tuple containing concatenated inference output tensor and intermediate feature maps. 
+ + Example: + ```python + model = DetectionModel(cfg="yolov5s.yaml", ch=3, nc=80) + input_tensor = torch.randn(1, 3, 640, 640) + output = model.forward(input_tensor, augment=False, profile=True, visualize=False) + ``` + + Note: + This method adapts to training and inference modes, with different return types based on the operational mode. + During training mode, it returns raw predictions across various scales for loss calculation, whereas in inference + mode, non-maximum suppression (NMS) is applied to refine predictions. + """ if augment: return self._forward_augment(x) # augmented inference, None return self._forward_once(x, profile, visualize) # single-scale inference, train def _forward_augment(self, x): - """Performs augmented inference across different scales and flips, returning combined detections.""" + """ + Performs augmented inference by processing input across different scales and flips, merging the outputs. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is batch size, C is number of channels, + H and W are height and width. + + Returns: + (torch.Tensor): Merged output tensor after multi-scale and flip augmentations, with shape (N, M, no), + where N is batch size, M is the number of predictions, and no is the number of output features. + + Example: + ```python + model = DetectionModel(cfg='yolov5s.yaml') + input_tensor = torch.randn(1, 3, 640, 640) + output = model._forward_augment(input_tensor) + ``` + + Note: + The function processes the input using different scales (1, 0.83, 0.67) and flips (None, horizontal), + descaling predictions before merging. This helps to improve model robustness and accuracy + during inference. + """ img_size = x.shape[-2:] # height, width s = [1, 0.83, 0.67] # scales f = [None, 3, None] # flips (2-ud, 3-lr) @@ -282,7 +594,23 @@ class DetectionModel(BaseModel): return torch.cat(y, 1), None # augmented inference, train def _descale_pred(self, p, flips, scale, img_size): - """De-scales predictions from augmented inference, adjusting for flips and image size.""" + """ + Adjusts predictions for augmented inference by de-scaling and correcting for flips or image size changes. + + Args: + p (torch.Tensor): Predictions tensor with shape (..., N) where N indicates prediction attributes like + bounding box coordinates, confidence score, etc. + flips (int | None): Specifies flip mode. `2` for vertical flip, `3` for horizontal flip, and `None` for no flip. + scale (float): Scale factor used during augmentation. + img_size (tuple[int, int]): Original image dimensions as (height, width). + + Returns: + (torch.Tensor): Adjusted predictions tensor with the same shape as input, de-scaled and de-flipped appropriately. + + Note: + If inplace operations are enabled, the adjustments are applied directly on the tensor. Otherwise, new tensors are + created for the adjusted values to avoid modifying the original input. + """ if self.inplace: p[..., :4] /= scale # de-scale if flips == 2: @@ -299,8 +627,18 @@ class DetectionModel(BaseModel): return p def _clip_augmented(self, y): - """Clips augmented inference tails for YOLOv5 models, affecting first and last tensors based on grid points and - layer counts. + """ + Clip augmented inference tails for YOLOv5 models, adjusting predictions from the first and last layers. + + Args: + y (list[torch.Tensor]): List of tensors, where each tensor represents detections from augmented inference across different layers. 
+ + Returns: + (list[torch.Tensor]): Modified list of tensors with clipped augmented inference tails. + + Notes: + This function helps to discard the augmented tails by adjusting predictions from the first and last layers, + which might otherwise introduce artifacts due to the augmentation process. """ nl = self.model[-1].nl # number of detection layers (P3-P5) g = sum(4**x for x in range(nl)) # grid points @@ -313,9 +651,36 @@ class DetectionModel(BaseModel): def _initialize_biases(self, cf=None): """ - Initializes biases for YOLOv5's Detect() module, optionally using class frequencies (cf). - - For details see https://arxiv.org/abs/1708.02002 section 3.3. + Initialize biases for the YOLOv5 Detect module using specified or default bias adjustments. + + Args: + cf (torch.Tensor | None): Optional tensor representing class frequencies for bias initialization. The shape should be + (N,), where N is the number of classes. If not provided, default adjustments are applied based on the number of + classes and image dimensions. + + Returns: + (torch.Tensor): Updated biases for the model with shape (N, M), where N is the number of anchors and M is the number of + outputs per anchor. + + Note: + The function calculates the biases based on principles from https://arxiv.org/abs/1708.02002, section 3.3. If class + frequencies (`cf`) are not provided, default bias adjustments are made. Adjustments primarily ensure that objectness and + class biases are reasonably initialized for effective training. + + Example: + ```python + from ultralytics.yolov5 import DetectionModel + import torch + + # Initialize model + model = DetectionModel(cfg="yolov5s.yaml") + + # Optional class frequencies tensor + class_frequencies = torch.tensor([100, 150, 200]) + + # Initialize biases + model._initialize_biases(cf=class_frequencies) + ``` """ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1. m = self.model[-1] # Detect() module @@ -334,22 +699,91 @@ Model = DetectionModel # retain YOLOv5 'Model' class for backwards compatibilit class SegmentationModel(DetectionModel): # YOLOv5 segmentation model def __init__(self, cfg="yolov5s-seg.yaml", ch=3, nc=None, anchors=None): - """Initializes a YOLOv5 segmentation model with configurable params: cfg (str) for configuration, ch (int) for channels, nc (int) for num classes, anchors (list).""" + """ + Initializes a YOLOv5 segmentation model with configurable parameters. + + Args: + cfg (str): Path to the configuration file containing model architecture and parameters. Defaults to "yolov5s-seg.yaml". + ch (int): Number of input channels. Defaults to 3. + nc (int | None): Number of classes. If provided, overrides the number of classes specified in the cfg file. + anchors (list | None): List of anchor points. If provided, overrides the anchor configuration in the cfg file. + + Returns: + (None): Initializes various components of the SegmentationModel class. + + Example: + ```python + from ultralytics import SegmentationModel + model = SegmentationModel() + ``` + + Note: + The initialization includes setting up model layers, anchors, and other configurations based on the provided + or default configuration file. + """ super().__init__(cfg, ch, nc, anchors) class ClassificationModel(BaseModel): # YOLOv5 classification model def __init__(self, cfg=None, model=None, nc=1000, cutoff=10): - """Initializes YOLOv5 model with config file `cfg`, input channels `ch`, number of classes `nc`, and `cuttoff` - index. 
+ """ + Initializes a YOLOv5 classification model with either a configuration file or a pre-built model, specifying + the number of classes and a cutoff layer index. + + Args: + cfg (str | None): Path to the model configuration file, or None if using `model`. + model (torch.nn.Module | None): Pre-built torch model, or None if using `cfg`. + nc (int): Number of output classes, default is 1000. + cutoff (int): Index of the cutoff layer, default is 10. + + Returns: + None + + Example: + ```python + # Initializing from a configuration file + model = ClassificationModel(cfg='yolov5-class-config.yaml', nc=1000, cutoff=10) + + # Initializing from an existing model + model = ClassificationModel(model=prebuilt_model, nc=1000, cutoff=10) + ``` + + Note: + This model can be extended or customized by modifying the configuration file or the pre-built model. """ super().__init__() self._from_detection_model(model, nc, cutoff) if model is not None else self._from_yaml(cfg) def _from_detection_model(self, model, nc=1000, cutoff=10): - """Creates a classification model from a YOLOv5 detection model, slicing at `cutoff` and adding a classification - layer. + """ + Perform a transformation from a YOLOv5 detection model to a classification model. + + Args: + model (DetectionModel): A pre-trained YOLOv5 detection model. + nc (int): Number of classes for the classification model. Default is 1000. + cutoff (int): Index to slice the model's layers up to the classification layer. Default is 10. + + Returns: + None. The function modifies the model in place. + + Notes: + This function takes a detection model and transforms it into a classification model by slicing the model layers + at the specified cutoff point and adding a classification layer with the specified number of classes. + - If the input model is wrapped by `DetectMultiBackend`, it unwraps the model to get the underlying YOLOv5 model. + - Constructs a `Classify` layer, replacing the final detection layer with this new classification layer. + + Example: + ```python + from ultralytics import YOLOv5 + + # Load a pre-trained detection model + detection_model = YOLOv5.load('yolov5s.pt') + + # Create a classification model from detection model + classification_model = YOLOv5.ClassificationModel() + classification_model._from_detection_model(detection_model, nc=1000, cutoff=10) + ``` """ if isinstance(model, DetectMultiBackend): model = model.model # unwrap DetectMultiBackend @@ -365,12 +799,49 @@ class ClassificationModel(BaseModel): self.nc = nc def _from_yaml(self, cfg): - """Creates a YOLOv5 classification model from a specified *.yaml configuration file.""" + """ + Perform initialization and parsing from a YOLOv5 configuration file. + + Args: + cfg (str): Path to the YOLOv5 YAML configuration file. + + Returns: + None. The function modifies the model in place utilizing the defined configuration parameters. + + Notes: + This function reads a YOLOv5 YAML configuration file and constructs the classification model accordingly. It sets the + appropriate channels, layers, and output classes based on the parsed configuration data. + """ self.model = None def parse_model(d, ch): - """Parses a YOLOv5 model from a dict `d`, configuring layers based on input channels `ch` and model architecture.""" + """ + Parses YOLOv5 model architecture from a configuration dictionary and initializes its layers. + + Args: + d (dict): Dictionary containing model configuration. 
Must include keys: "anchors", "nc", "depth_multiple", + "width_multiple", and optionally "activation" and "channel_multiple". + ch (list[int]): List of input channels for each layer. + + Returns: + (tuple[nn.Sequential, list[int]]): A tuple containing: + - `model` (nn.Sequential): The constructed YOLOv5 model based on the configuration. + - `save` (list[int]): List of layers whose outputs should be preserved during the forward pass. + + Example: + ```python + from pathlib import Path + import yaml + + # Load model configuration YAML + with open(Path('yolov5s.yaml'), 'r') as file: + model_config = yaml.safe_load(file) + + # Parse model and initialize + model, save = parse_model(model_config, ch=[3]) + ``` + """ LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}") anchors, nc, gd, gw, act, ch_mul = ( d["anchors"],